In [None]:
from sklearn.model_selection import train_test_split
from module import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [None]:
def train(dataset):
    model = LogisticRegression()

    vectorizer = TfidfVectorizer(max_features=5000)
    X  = vectorizer.fit_transform(dataset['name'])

    encoder = LabelEncoder()
    y = encoder.fit_transform(dataset['main_category'])

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(x_train, y_train)

    # Evaluation
    y_pred_labels = model.predict(x_test)

    print(classification_report(y_test, y_pred_labels, target_names=encoder.classes_))

    return model

In [2]:
df = datasets.amazon_full()
df

Unnamed: 0,name,main_category
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,appliances
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,appliances
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,appliances
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,appliances
...,...,...
1103165,Adidas Regular Fit Men's Track Tops,sports & fitness
1103166,Redwolf Noice Toit Smort - Hoodie (Black),sports & fitness
1103167,Redwolf Schrute Farms B&B - Hoodie (Navy Blue),sports & fitness
1103168,Puma Men Shorts,sports & fitness


In [3]:
# Stratified Sampling: 1 percent proportional sampling
_, df_sample_11k = train_test_split(df, 
                                test_size=0.01, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_11k

Unnamed: 0,name,main_category
826541,ASJAR Men's Ultra-Soft Lycra Material | Ice-Si...,men's clothing
846121,EditLook Men's Slim Fit Wash Knee Distressed J...,men's clothing
849057,Pepe Jeans Men's Slim Fit Jeans (PIMC0003498_D...,men's clothing
1009162,Red Tape Men Blue Walking Shoes,men's shoes
808855,"FEDUS USB 3.0 Extension Cable 1.5M, Aluminum A...","tv, audio & cameras"
...,...,...
507626,Vast Aviator Unisex Sunglasses (Polo9739|Brown...,accessories
599650,Tommy Hilfiger Brown & Navy Leather Men's Wall...,accessories
547375,ZUPERIA Diamond Studded Combo Watches for Girl...,accessories
918926,INDIAN FLOWER Cotton Nighty for Women,women's clothing


In [4]:
# Stratified Sampling: 2 percent proportional sampling
_, df_sample_22k = train_test_split(df, 
                                test_size=0.02, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_22k

Unnamed: 0,name,main_category
779836,Designer Mobile Pouch for Women Multicolored E...,accessories
289494,WON Brand NL4FC Speakon Cable Amplifier Male C...,"tv, audio & cameras"
50962,STREETMODE - Cotton Sweatshirts Pullovers for ...,sports & fitness
850016,Cherokee by Unlimited Men's Skinny Jeans,men's clothing
931313,Your Gift Studio Customized Slim Genuine Leath...,stores
...,...,...
692443,LooksGud Dhruvi Trendz Women's Cotton Silk Sol...,women's clothing
605649,Personalized Embossed Advocate Key chain,accessories
696013,Generic Women's Linen Regular Short Kurti (Pri...,women's clothing
165953,BIBA womens Suit Set,women's clothing


In [20]:
train(df_sample_11k)

                         precision    recall  f1-score   support

            accessories       0.75      0.96      0.84       464
             appliances       0.84      0.90      0.87       143
         bags & luggage       0.75      0.17      0.27        36
        beauty & health       0.80      0.45      0.58        44
        car & motorbike       0.89      0.25      0.39        32
grocery & gourmet foods       1.00      0.06      0.12        16
         home & kitchen       0.71      0.43      0.53        63
    industrial supplies       1.00      0.06      0.11        17
          kids' fashion       0.71      0.30      0.42        40
         men's clothing       0.89      0.98      0.93       296
            men's shoes       0.78      0.94      0.85       224
                  music       0.00      0.00      0.00         3
           pet supplies       0.00      0.00      0.00        10
       sports & fitness       0.83      0.44      0.58        43
                 stores 

In [21]:
train(df_sample_22k)

                         precision    recall  f1-score   support

            accessories       0.80      0.95      0.87       945
             appliances       0.85      0.97      0.91       272
         bags & luggage       0.60      0.23      0.33        79
        beauty & health       0.79      0.64      0.71        92
        car & motorbike       0.87      0.58      0.69        57
grocery & gourmet foods       1.00      0.57      0.73        28
         home & kitchen       0.76      0.61      0.68       120
    industrial supplies       0.78      0.18      0.30        38
          kids' fashion       0.79      0.55      0.65       103
         men's clothing       0.95      0.98      0.96       645
            men's shoes       0.84      0.92      0.88       430
                  music       0.00      0.00      0.00        13
           pet supplies       1.00      0.44      0.62         9
       sports & fitness       0.77      0.46      0.57       103
                 stores 

In [22]:
df_sample_55k = datasets.amazon_sample()
train(df_sample_55k)

                         precision    recall  f1-score   support

            accessories       0.85      0.94      0.89      2396
             appliances       0.91      0.97      0.94       697
         bags & luggage       0.60      0.40      0.48       210
        beauty & health       0.72      0.79      0.75       193
        car & motorbike       0.87      0.64      0.74       151
grocery & gourmet foods       0.85      0.69      0.76        65
         home & kitchen       0.76      0.76      0.76       313
    industrial supplies       0.93      0.37      0.53        73
          kids' fashion       0.83      0.65      0.73       270
         men's clothing       0.93      0.98      0.95      1477
            men's shoes       0.86      0.93      0.90      1161
                  music       1.00      0.23      0.37        22
           pet supplies       0.90      0.36      0.51        25
       sports & fitness       0.74      0.52      0.61       235
                 stores 

In [23]:
_, df_sample_110k = train_test_split(df, 
                                test_size=0.1, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_110k

Unnamed: 0,name,main_category
862042,MORAKHIYA APPARELS MENS WEAR FOR JEANS BLACK F...,men's clothing
401731,LIBONI Colour Stay Kajal - Pack of Twelve,beauty & health
439385,FREDI HD PLUS WiFi 3 Pin Plug Spy Camera,"tv, audio & cameras"
972596,Arrow Men Formal Shirt,men's clothing
107506,GINNI Chamois Leather for Digital Camera Lens ...,"tv, audio & cameras"
...,...,...
111314,The Autostory Waterproof Car Body Cover Compat...,car & motorbike
512622,INDISSH Slipknot01 Band Printed T-Shirt for Men,men's clothing
905044,Jockey Women's Hipster (1406 Prints_XXL_Pack o...,women's clothing
760989,Malabar Gold and Diamonds 18KT Rose Gold and D...,accessories


In [24]:
train(df_sample_110k)

                         precision    recall  f1-score   support

            accessories       0.85      0.95      0.90      4665
             appliances       0.92      0.97      0.94      1312
         bags & luggage       0.69      0.41      0.51       414
        beauty & health       0.85      0.83      0.84       425
        car & motorbike       0.86      0.74      0.80       302
grocery & gourmet foods       0.84      0.80      0.82       115
         home & kitchen       0.74      0.77      0.75       552
    industrial supplies       0.82      0.49      0.62       164
          kids' fashion       0.74      0.67      0.70       497
         men's clothing       0.94      0.98      0.96      3150
            men's shoes       0.86      0.94      0.90      2226
                  music       0.91      0.24      0.38        41
           pet supplies       0.96      0.64      0.77        70
       sports & fitness       0.78      0.59      0.67       517
                 stores 

In [25]:
_, df_sample_220k = train_test_split(df, 
                                test_size=0.2, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_220k

Unnamed: 0,name,main_category
778320,AILTINO Multicolor Women Potli Bag combo's and...,accessories
787668,Wireless Bluetooth Headphones Earphones for On...,"tv, audio & cameras"
202411,BATA Remo 825-6722-43 Men's Black Formal Leath...,men's shoes
529976,Satya Paul Leather Black Wallet for Men,stores
672236,Mclothings Women's Ethnic wear Silk n Blue Col...,women's clothing
...,...,...
422619,DREAM STORE - TVS Pep+ New BS6 Water Resistant...,car & motorbike
506954,myaddiction Women Vintage Triangle Mirrored Su...,accessories
647991,Aircum Combo FITMAN Blue & Grey Sneakers for M...,men's shoes
521398,Lee Men T-Shirt,men's clothing


In [26]:
train(df_sample_220k)

                         precision    recall  f1-score   support

            accessories       0.87      0.94      0.91      9410
             appliances       0.93      0.97      0.95      2678
         bags & luggage       0.67      0.42      0.52       812
        beauty & health       0.82      0.84      0.83       778
        car & motorbike       0.88      0.77      0.82       578
grocery & gourmet foods       0.88      0.78      0.83       264
         home & kitchen       0.78      0.77      0.77      1196
    industrial supplies       0.75      0.50      0.60       296
          kids' fashion       0.79      0.67      0.72      1077
         men's clothing       0.94      0.98      0.96      6087
            men's shoes       0.87      0.94      0.90      4594
                  music       1.00      0.44      0.62        99
           pet supplies       0.93      0.82      0.87       109
       sports & fitness       0.76      0.63      0.69      1046
                 stores 

In [27]:
_, df_sample_550k = train_test_split(df, 
                                test_size=0.5, 
                                stratify=df['main_category'], 
                                random_state=42)

df_sample_550k

Unnamed: 0,name,main_category
722863,Vector X IR-97132 Jump Ball Bearing Skipping Rope,sports & fitness
308347,Jockey Men U Neck Sleeveless Vest - Pack of 3 ...,men's clothing
414433,Fastrack Analog Blue Dial Unisex's Watch-38037...,stores
1090848,NAINVISH Women/Girls Trending Printed Kaftan T...,women's clothing
813253,Astigo Compatible Remote for Tata Sky DTH Univ...,"tv, audio & cameras"
...,...,...
132919,Mochi Men's Sneakers (71-8621),men's shoes
843242,Amazon Brand - Inkast Denim Co. Men's Jogger J...,men's clothing
553792,NEUTRON Present Digital and Analog White and S...,accessories
781541,SHINDE EXPORTS Women's Sling Bag,accessories


In [28]:
train(df_sample_550k)

                         precision    recall  f1-score   support

            accessories       0.87      0.95      0.91     23100
             appliances       0.94      0.97      0.95      6622
         bags & luggage       0.67      0.43      0.52      2090
        beauty & health       0.83      0.85      0.84      2016
        car & motorbike       0.86      0.80      0.83      1407
grocery & gourmet foods       0.87      0.83      0.85       660
         home & kitchen       0.79      0.79      0.79      3001
    industrial supplies       0.78      0.60      0.68       818
          kids' fashion       0.79      0.69      0.74      2665
         men's clothing       0.95      0.98      0.97     15388
            men's shoes       0.87      0.95      0.91     11564
                  music       0.87      0.57      0.69       217
           pet supplies       0.96      0.85      0.90       324
       sports & fitness       0.74      0.62      0.68      2531
                 stores 

In [29]:
train(df)

                         precision    recall  f1-score   support

            accessories       0.88      0.95      0.91     46242
             appliances       0.94      0.97      0.95     13209
         bags & luggage       0.70      0.45      0.55      4248
        beauty & health       0.85      0.85      0.85      4023
        car & motorbike       0.86      0.81      0.84      2818
grocery & gourmet foods       0.83      0.88      0.85      1337
         home & kitchen       0.79      0.79      0.79      5774
    industrial supplies       0.77      0.61      0.68      1624
          kids' fashion       0.79      0.70      0.74      5367
         men's clothing       0.95      0.98      0.97     30696
            men's shoes       0.87      0.94      0.91     23077
                  music       0.87      0.56      0.68       400
           pet supplies       0.91      0.88      0.89       646
       sports & fitness       0.75      0.64      0.69      4969
                 stores 