In [15]:
# Basic libraries
import pandas as pd
import matplotlib.pyplot as plt

# Classification models
from sklearn.linear_model import LogisticRegression, RidgeClassifier, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Utilities and metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

## Exploração do Dataset

In [16]:
path = "../data/multiclass/"
df = pd.read_csv(path + "Amazon-Products.csv")

In [17]:
df.info()
columns = [
    'name',
    'sub_category'
]
df = df[columns]
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551585 entries, 0 to 551584
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      551585 non-null  int64 
 1   name            551585 non-null  object
 2   main_category   551585 non-null  object
 3   sub_category    551585 non-null  object
 4   image           551585 non-null  object
 5   link            551585 non-null  object
 6   ratings         375791 non-null  object
 7   no_of_ratings   375791 non-null  object
 8   discount_price  490422 non-null  object
 9   actual_price    533772 non-null  object
dtypes: int64(1), object(9)
memory usage: 42.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551585 entries, 0 to 551584
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          551585 non-null  object
 1   sub_category  551585 non-null  object
dtypes: object(2

## Preprocessamento do Dataset

In [18]:
df = df.groupby('sub_category').apply(lambda x: x.sample(frac=0.01)).reset_index(drop=True)
df.dropna(inplace=True)
df['sub_category'].value_counts()

  df = df.groupby('sub_category').apply(lambda x: x.sample(frac=0.01)).reset_index(drop=True)


sub_category
Formal Shoes                        192
Clothing                            192
Bags & Luggage                      192
Jeans                               192
Sports Shoes                        192
                                   ... 
All Pet Supplies                      6
Industrial & Scientific Supplies      6
Home Audio & Theater                  4
Cardio Equipment                      2
Value Bazaar                          1
Name: count, Length: 107, dtype: int64

## Divisão do Dataset em treino e teste

In [20]:
features = df['name']
target = df['sub_category']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=421)

## Definição dos vetorizadores, modelos e parâmetros para o GridSearch

In [21]:
vectorizers = [
    TfidfVectorizer(),
    CountVectorizer()
]

param_grids = [
    {
        'vectorizer': vectorizers,
        'classifier': [SVC()],
        'classifier__C': [1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    {
        'vectorizer': vectorizers,
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.01, 1.0]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 150, 200],
        'classifier__max_depth': [None, 10, 20]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [1, 10],
        'classifier__penalty': ['l2']
    },
    {
        'vectorizer': vectorizers,
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [None, 10, 20]
    }
]

pipeline = Pipeline([
    ('vectorizer', 'passthrough'),
    ('classifier', 'passthrough')
])

## Execução do GridSearch

In [22]:
grid_search = GridSearchCV(pipeline, param_grids, cv=5, n_jobs=4, verbose=3, scoring='accuracy')

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 46 candidates, totalling 230 fits


## Resultados finais e matriz de confusão

In [23]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest set classification report with best model:")
print(classification_report(y_test, y_pred, digits=4))

Best parameters found: {'classifier': SVC(), 'classifier__C': 1, 'classifier__kernel': 'linear', 'vectorizer': TfidfVectorizer()}
Best cross-validation score: 0.5980046378687375

Test set classification report with best model:
                                          precision    recall  f1-score   support

                        Air Conditioners     0.0000    0.0000    0.0000         2
                          All Appliances     0.2308    0.4286    0.3000        14
            All Car & Motorbike Products     0.0000    0.0000    0.0000         5
                         All Electronics     0.3103    0.6429    0.4186        14
                  All Exercise & Fitness     0.0000    0.0000    0.0000         1
             All Grocery & Gourmet Foods     0.0000    0.0000    0.0000         4
          All Sports, Fitness & Outdoors     0.0000    0.0000    0.0000         3
                          Amazon Fashion     0.0000    0.0000    0.0000         5
              Baby Bath, Skin & Gr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
# #plot confusion matrix with more than 100 labels
# plt.figure(figsize=(20, 20))
# disp.plot(xticks_rotation='vertical')
# plt.show()