# Classical ML Model Classification Pipeline

This Notebook combines Text vectorization (TF-IDF/CountVectorizer/ Bag Of Words) and classical ML-Models (Logistic Regression / SVM / XGBoost / Random Forest) to run a Classification on the cleaned and translated product texts (titles + descriptions)

## Import libs & read data

In [12]:
"""Initialising the DeepL result DataFrame
This script reads a CSV file containing the results of a DeepL translation analysis
and loads it into a pandas DataFrame for further processing."""

import pandas as pd
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier  
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# File path to the CSV
file_path = "language_analysis/deepL_result.csv"

# Read the CSV into a DataFrame
df_deepl = pd.read_csv(file_path)

# Display the first few rows of the DataFrame (optional)
print(df_deepl.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robertwilson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


    productid     imageid prdtypecode bool_description  \
0  3804725264  1263597046          10                0   
1  3804725264  1263597046          10                0   
2   436067568  1008141237        2280                0   
3   201115110   938777978          50                1   
4    50418756   457047496        1280                0   

                                         merged_text lang  \
0  Olivia: Personalisiertes Notizbuch / 150 Seite...   de   
1  Olivia: Personalisiertes Notizbuch / 150 Seite...   de   
2  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...   fr   
3  PILOT STYLE Touch Pen de marque Speedlink est ...   fr   
4  Peluche Donald - Europe - Disneyland 2000 (Mar...   fr   

                                   deepL_translation  
0  Olivia : Carnet de notes personnalisé / 150 pa...  
1  Olivia : Carnet de notes personnalisé / 150 pa...  
2                                                NaN  
3                                                NaN  
4        

## Process DataFrame

In [13]:
"""The DataFrame is restructured to include only the relevant columns for further processing.
The column "prdtypecode" is retained. A new "text" column is created, which contains the DeepL translation if available.
If the translation is empty, the original text from "merged_text" is used instead."""

# Keep only the columns "prdtypecode", "deepL_translation", and "merged_text"
df_deepl = df_deepl[["prdtypecode", "deepL_translation", "merged_text"]]

# Create the "text" column: use "deepL_translation" if not empty, otherwise use "merged_text"
df_deepl["text"] = df_deepl.apply(
    lambda row: row["deepL_translation"] if pd.notna(row["deepL_translation"]) and row["deepL_translation"].strip() != "" else row["merged_text"],
    axis=1
)

# Drop the now-unnecessary columns "deepL_translation" and "merged_text"
df_deepl = df_deepl[["prdtypecode", "text"]]

# Display the first few rows of the updated DataFrame (optional)
print(df_deepl.head())


  prdtypecode                                               text
0          10  Olivia : Carnet de notes personnalisé / 150 pa...
1          10  Olivia : Carnet de notes personnalisé / 150 pa...
2        2280  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
3          50  PILOT STYLE Touch Pen de marque Speedlink est ...
4        1280  Peluche Donald - Europe - Disneyland 2000 (Mar...


In [15]:
print(df_deepl.shape)

(65523, 2)


## Feature & Target Data Split

In [4]:
# Split the DataFrame into features (X) and target (y)
X = df_deepl[["text"]]  # Feature DataFrame containing only the "text" column
y = df_deepl["prdtypecode"]  # Target Series containing the "prdtypecode" column

# Optional: Check the first few rows of X and y
print(X.head())
print(y.head())

                                                text
0  Olivia : Carnet de notes personnalisé / 150 pa...
1  Olivia : Carnet de notes personnalisé / 150 pa...
2  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
3  PILOT STYLE Touch Pen de marque Speedlink est ...
4  Peluche Donald - Europe - Disneyland 2000 (Mar...
0      10
1      10
2    2280
3      50
4    1280
Name: prdtypecode, dtype: object


## Preprocessing & Test / Train Split

In [5]:
"""CountVectorizer automatically tokenizes the text, no custom tokenization has been applied."""


# Step 1: Preprocessing 
# Function to normalize accented characters
def normalize_text(text):
    # Convert accented characters to their base forms
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

# Apply normalization to the "text" column
X["text"] = X["text"].apply(normalize_text)

# Step 2: Split the data
X_train, X_test, y_train, y_test = train_test_split(X["text"], y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"] = X["text"].apply(normalize_text)


## First Pipeline: Only CountVectorizer, LogReg & GridSearch

In [None]:
# 46m 8.0s 18.04.2025 
# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Vectorization
    ('model', LogisticRegression())    # Step 2: Logistic Regression
])

# Define the parameter grid
param_grid = {
    # Parameters for CountVectorizer
    'vectorizer__ngram_range': [(1, 1), (1, 2)],  # Unigrams or unigrams + bigrams
    'vectorizer__max_df': [0.8, 0.9, 1.0],       # Filter very frequent words
    'vectorizer__min_df': [1, 5, 10],            # Filter very rare words
    'vectorizer__max_features': [5000, 10000],   # Limit vocabulary size

    # Parameters for LogisticRegression
    'model__C': [0.1, 1, 10],                    # Regularization strength
    'model__penalty': ['l2'],                    # Regularization type
    'model__solver': ['lbfgs']                   # Solver for Logistic Regression
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on the test set
y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs', 'vectorizer__max_df': 0.8, 'vectorizer__max_features': 10000, 'vectorizer__min_df': 10, 'vectorizer__ngram_range': (1, 1)}
Best Cross-Validation Score: 0.8685567370699525
              precision    recall  f1-score   support

          10       0.71      0.82      0.76       794
        1140       0.86      0.86      0.86       587
        1160       0.96      0.96      0.96      1099
        1180       0.79      0.77      0.78       193
        1280       0.82      0.80      0.81       681
        1281       0.84      0.78      0.81       390
        1300       0.92      0.93      0.92       473
        1301       0.97      0.91      0.94       111
        1302       0.86      0.80      0.83       254
        1320       0.85      0.87      0.86       509
        1560       0.92      0.90      0.91       464
        1920       0.89      0.90      0.89       215
        1940       0.93      0.86      0.8

## 2nd Pipeline: Vectorizers (Count & TF-IDF), Models (Log Reg, SVM, Xgboost, RF), GridSearch

In [None]:
# Get the list of French stop words because the TfidfVectorizer in scikit-learn 
# does not have built-in support for French stop words. 
french_stop_words = stopwords.words('french')

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Placeholder for vectorizer (will be replaced in grid search)
    ('model', LogisticRegression())    # Placeholder for model (will be replaced in grid search)
])

# Define the parameter grid
param_grid = [
    # Logistic Regression with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],  # Use CountVectorizer
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],  # Unigrams or unigrams + bigrams
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],  # Regularization strength
        'model__solver': ['lbfgs']
    },
    # Logistic Regression with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],  # Use TfidfVectorizer
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs']
    },
    # SVM with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # SVM with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # XGBoost with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # XGBoost with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # Random Forest with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    },
    # Random Forest with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    }
]

# Perform grid search with weighted F1-score as the scoring metric
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Weighted F1):", grid_search.best_score_)

# Evaluate on the test set using weighted F1-score
y_pred = grid_search.best_estimator_.predict(X_test)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Set Weighted F1-Score:", weighted_f1)

# Optional: Detailed classification report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 528 candidates, totalling 2640 fits
