Data sampling 11k 22k 55k

Logistic Regression model training with sample data11k 22k 55k

Data summarizing 11k 22k 55k

Logistic Regression model training with sample + summarized data 11k 22k 55k

In [1]:
from sklearn.model_selection import train_test_split
from module import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def train(dataset):
    model = LogisticRegression()

    vectorizer = TfidfVectorizer(max_features=5000)
    X  = vectorizer.fit_transform(dataset['name'])

    encoder = LabelEncoder()
    y = encoder.fit_transform(dataset['main_category'])

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(x_train, y_train)

    # Evaluation
    y_pred_labels = model.predict(x_test)

    print(classification_report(y_test, y_pred_labels, target_names=encoder.classes_))

    return model

In [3]:
df = datasets.amazon_full()

In [4]:
_, df_sample_11k = train_test_split(df, 
                                test_size=0.01, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_11k
df_sample_11k.to_csv("df_11k.csv", index=False)

model = train(df_sample_11k)

with open('lr_11k.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.75      0.96      0.84       464
             appliances       0.84      0.90      0.87       143
         bags & luggage       0.75      0.17      0.27        36
        beauty & health       0.80      0.45      0.58        44
        car & motorbike       0.89      0.25      0.39        32
grocery & gourmet foods       1.00      0.06      0.12        16
         home & kitchen       0.71      0.43      0.53        63
    industrial supplies       1.00      0.06      0.11        17
          kids' fashion       0.71      0.30      0.42        40
         men's clothing       0.89      0.98      0.93       296
            men's shoes       0.78      0.94      0.85       224
                  music       0.00      0.00      0.00         3
           pet supplies       0.00      0.00      0.00        10
       sports & fitness       0.83      0.44      0.58        43
                 stores 

In [5]:
_, df_sample_22k = train_test_split(df, 
                                test_size=0.02, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_22k
df_sample_22k.to_csv("df_22k.csv", index=False)

model = train(df_sample_22k)

with open('lr_22k.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.80      0.95      0.87       945
             appliances       0.85      0.97      0.91       272
         bags & luggage       0.60      0.23      0.33        79
        beauty & health       0.79      0.64      0.71        92
        car & motorbike       0.87      0.58      0.69        57
grocery & gourmet foods       1.00      0.57      0.73        28
         home & kitchen       0.76      0.61      0.68       120
    industrial supplies       0.78      0.18      0.30        38
          kids' fashion       0.79      0.55      0.65       103
         men's clothing       0.95      0.98      0.96       645
            men's shoes       0.84      0.92      0.88       430
                  music       0.00      0.00      0.00        13
           pet supplies       1.00      0.44      0.62         9
       sports & fitness       0.77      0.46      0.57       103
                 stores 

In [6]:
_, df_sample_55k = train_test_split(df, 
                                test_size=0.05, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_55k
df_sample_55k.to_csv("df_55k.csv", index=False)

model = train(df_sample_55k)

with open('lr_55k.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.85      0.94      0.89      2396
             appliances       0.91      0.97      0.94       697
         bags & luggage       0.60      0.40      0.48       210
        beauty & health       0.72      0.79      0.75       193
        car & motorbike       0.87      0.64      0.74       151
grocery & gourmet foods       0.85      0.69      0.76        65
         home & kitchen       0.76      0.76      0.76       313
    industrial supplies       0.93      0.37      0.53        73
          kids' fashion       0.83      0.65      0.73       270
         men's clothing       0.93      0.98      0.95      1477
            men's shoes       0.86      0.93      0.90      1161
                  music       1.00      0.23      0.37        22
           pet supplies       0.90      0.36      0.51        25
       sports & fitness       0.74      0.52      0.61       235
                 stores 

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

def simplify_sentence(sentence):
    input_text = "summarize: " + sentence
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = model.generate(inputs, max_length=10, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

cuda


In [9]:
names = df_sample_11k["name"].tolist()
summarized = list(map(simplify_sentence, names))
# Len of tokens
print(len(" ".join(names)))
print(len(" ".join(summarized)))
df_sample_11k["name"] = summarized
df_sample_11k.to_csv("df_11k_summarized.csv", index=False)

883562
326175


In [10]:
names = df_sample_22k["name"].tolist()
summarized = list(map(simplify_sentence, names))
# Len of tokens
print(len(" ".join(names)))
print(len(" ".join(summarized)))
df_sample_22k["name"] = summarized
df_sample_22k.to_csv("df_22k_summarized.csv", index=False)

1774129
652379


In [11]:
names = df_sample_55k["name"].tolist()
summarized = list(map(simplify_sentence, names))
# Len of tokens
print(len(" ".join(names)))
print(len(" ".join(summarized)))
df_sample_55k["name"] = summarized
df_sample_55k.to_csv("df_55k_summarized.csv", index=False)

4424431
1628690


In [12]:
model = train(df_sample_11k)

with open('lr_11k_summarized.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.59      0.90      0.72       464
             appliances       0.80      0.66      0.72       143
         bags & luggage       0.75      0.08      0.15        36
        beauty & health       1.00      0.14      0.24        44
        car & motorbike       0.83      0.16      0.26        32
grocery & gourmet foods       0.00      0.00      0.00        16
         home & kitchen       0.71      0.08      0.14        63
    industrial supplies       0.00      0.00      0.00        17
          kids' fashion       0.60      0.30      0.40        40
         men's clothing       0.83      0.89      0.86       296
            men's shoes       0.75      0.82      0.78       224
                  music       0.00      0.00      0.00         3
           pet supplies       0.00      0.00      0.00        10
       sports & fitness       0.83      0.23      0.36        43
                 stores 

In [13]:
model = train(df_sample_22k)

with open('lr_22k_summarized.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.65      0.89      0.75       945
             appliances       0.74      0.76      0.75       272
         bags & luggage       0.90      0.11      0.20        79
        beauty & health       0.94      0.36      0.52        92
        car & motorbike       0.89      0.30      0.45        57
grocery & gourmet foods       0.83      0.18      0.29        28
         home & kitchen       0.52      0.23      0.31       120
    industrial supplies       1.00      0.03      0.05        38
          kids' fashion       0.77      0.43      0.55       103
         men's clothing       0.88      0.91      0.89       645
            men's shoes       0.78      0.84      0.81       430
                  music       0.00      0.00      0.00        13
           pet supplies       1.00      0.22      0.36         9
       sports & fitness       0.90      0.25      0.39       103
                 stores 

In [14]:
model = train(df_sample_55k)

with open('lr_55k_summarized.pkl', 'wb') as f:
    pickle.dump(model, f)

                         precision    recall  f1-score   support

            accessories       0.74      0.89      0.81      2396
             appliances       0.78      0.81      0.79       697
         bags & luggage       0.69      0.28      0.40       210
        beauty & health       0.66      0.36      0.47       193
        car & motorbike       0.79      0.36      0.50       151
grocery & gourmet foods       0.88      0.35      0.51        65
         home & kitchen       0.60      0.38      0.46       313
    industrial supplies       0.62      0.14      0.22        73
          kids' fashion       0.78      0.56      0.65       270
         men's clothing       0.86      0.92      0.89      1477
            men's shoes       0.81      0.88      0.85      1161
                  music       1.00      0.05      0.09        22
           pet supplies       1.00      0.16      0.28        25
       sports & fitness       0.69      0.31      0.43       235
                 stores 