In [13]:
# Basic libraries
import pandas as pd
import matplotlib.pyplot as plt
import time

# Classification models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Utilities and metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Preprocessing
import nltk
import re

# Download nltk resources
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rafael\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Exploração do Dataset

In [14]:
path = "../data/multiclass/"
df = pd.read_csv(path + "Amazon-Products.csv")
df = df[['name','sub_category']]
df

Unnamed: 0,name,sub_category
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,Air Conditioners
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,Air Conditioners
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,Air Conditioners
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,Air Conditioners
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,Air Conditioners
...,...,...
551580,Adidas Regular Fit Men's Track Tops,Yoga
551581,Redwolf Noice Toit Smort - Hoodie (Black),Yoga
551582,Redwolf Schrute Farms B&B - Hoodie (Navy Blue),Yoga
551583,Puma Men Shorts,Yoga


In [15]:
# df = df.groupby('sub_category').apply(lambda x: x.sample(frac=0.005)).reset_index(drop=True)
df = df.groupby('sub_category').head(50)

In [16]:

print(df.info())
print(df['sub_category'].value_counts())

<class 'pandas.core.frame.DataFrame'>
Index: 5514 entries, 0 to 550530
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          5514 non-null   object
 1   sub_category  5514 non-null   object
dtypes: object(2)
memory usage: 129.2+ KB
None
sub_category
Air Conditioners                50
All Appliances                  50
All Car & Motorbike Products    50
All Electronics                 50
All Exercise & Fitness          50
                                ..
STEM Toys Store                 48
Fashion Sales & Deals           44
International Toy Store         24
Refurbished & Open Box          24
Toys Gifting Store              24
Name: count, Length: 112, dtype: int64


## Preprocessamento do Dataset

In [17]:
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'\d+', '', text) # remove digits
    text = re.sub(r'[^\w\s]', '', text)
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

df['name'] = df['name'].apply(preprocess_text)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df['name'].apply(preprocess_text)


Unnamed: 0,name,sub_category
0,lloyd ton star inverter split ac convertible c...,Air Conditioners
1,lg ton star ai dual inverter split ac copper s...,Air Conditioners
2,lg ton star ai dual inverter split ac copper s...,Air Conditioners
3,lg ton star ai dual inverter split ac copper s...,Air Conditioners
4,carrier ton star inverter split ac copperester...,Air Conditioners
...,...,...
550526,devma narains packaging green yoga mat mm thic...,Yoga
550527,campus sutra men solid black drifit zip active...,Yoga
550528,boster premium mm extra thick large foot eva e...,Yoga
550529,amazonbasics yoga exercise mat carrying strap mm,Yoga


## Divisão do Dataset em treino e teste

In [18]:
features = df['name']
target = df['sub_category']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=421)

## Definição dos vetorizadores, modelos e parâmetros para o GridSearch

In [19]:
vectorizers = [
    TfidfVectorizer(),
    CountVectorizer()
]

param_grids = [
    {
        'vectorizer': vectorizers,
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__max_depth': [None, 10, 20]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [SVC()],
        'classifier__C': [1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    {
        'vectorizer': vectorizers,
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.01, 1.0]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [1, 10],
    },
    {
        'vectorizer': vectorizers,
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [None, 10, 20]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__max_depth': [3, 5, 7]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [AdaBoostClassifier()],
        'classifier__n_estimators': [50, 100, 150]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [SGDClassifier()],
        'classifier__alpha': [0.0001, 0.001]
    }
]

pipeline = Pipeline([
    ('vectorizer', 'passthrough'),
    ('classifier', 'passthrough')
])

## Execução do GridSearch

In [20]:
grid_search = GridSearchCV(pipeline, param_grids, cv=5, n_jobs=6, verbose=2, scoring='accuracy')

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 74 candidates, totalling 370 fits


## Resultados finais e matriz de confusão

In [21]:
cv_results = grid_search.cv_results_

results_df = pd.DataFrame(cv_results)

results_df.to_csv('results/sklearn_multiclass2_results.csv', sep=';', index=False)

In [22]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest set classification report with best model:")
print(classification_report(y_test, y_pred, digits=4))

Best parameters found: {'classifier': SVC(), 'classifier__C': 1, 'classifier__kernel': 'linear', 'vectorizer': TfidfVectorizer()}
Best cross-validation score: 0.5629193919923575

Test set classification report with best model:
                                          precision    recall  f1-score   support

                        Air Conditioners     0.9000    1.0000    0.9474         9
                          All Appliances     0.1176    0.3333    0.1739         6
            All Car & Motorbike Products     0.1429    0.1000    0.1176        10
                         All Electronics     0.3000    0.5000    0.3750         6
                  All Exercise & Fitness     0.3000    0.3333    0.3158         9
             All Grocery & Gourmet Foods     0.3750    0.2727    0.3158        11
                      All Home & Kitchen     0.1538    0.1818    0.1667        11
                        All Pet Supplies     0.3636    0.5714    0.4444         7
          All Sports, Fitness & Ou

In [23]:
import pickle
# Save the model with pickle
filename = 'models/sklearn_multiclass2_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)