# Classical ML Model Classification Pipeline

This Notebook combines Text vectorization (TF-IDF/CountVectorizer/ Bag Of Words) and classical ML-Models (Logistic Regression / SVM / XGBoost / Random Forest) to run a Classification on the cleaned and translated product texts (titles + descriptions)

## Import libs & read data

In [11]:
"""Initialising the DeepL result DataFrame
This script reads a CSV file containing the results of a DeepL translation analysis
and loads it into a pandas DataFrame for further processing."""

import os
import shutil
import json
import joblib
import pandas as pd
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, make_scorer
from xgboost import XGBClassifier  
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# File path to the CSV
file_path = "language_analysis/df_localization.csv"

# Read the CSV into a DataFrame
df_deepl = pd.read_csv(file_path)

# Display the first few rows of the DataFrame (optional)
print(df_deepl.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robertwilson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


    productid     imageid  prdtypecode  bool_description lang  \
0  1000751210  1052839068         2583                 1   fr   
1  1001351342  1053177920         2280                 0   fr   
2   100227200   875832983         2280                 0   fr   
3   100254619   861686675         1280                 0   fr   
4  1002589006  1052857576         2403                 0   fr   

  deepL_translation                                        merged_text  
0               NaN  Cascade Mamba LED Ubbink La cascade Mamba LED ...  
1               NaN  L'animateur Des Temps Nouveaux N° 276 Du 19/6/...  
2               NaN  Char e Mensuel N°76 - Journal Plein D'humour E...  
3               NaN                      Doudou Plat L in Parme Tex -   
4               NaN  Lot De 3 Livres De Stephenie Meyer Et Alex F n...  


## Process DataFrame

In [12]:
"""The DataFrame is restructured to include only the relevant columns for further processing.
The column "prdtypecode" is retained. A new "text" column is created, which contains the DeepL translation if available.
If the translation is empty, the original text from "merged_text" is used instead."""

# Keep only the columns "prdtypecode", "deepL_translation", and "merged_text"
df_deepl = df_deepl[["prdtypecode", "deepL_translation", "merged_text"]]

# Create the "text" column: use "deepL_translation" if not empty, otherwise use "merged_text"
df_deepl["text"] = df_deepl.apply(
    lambda row: row["deepL_translation"] if pd.notna(row["deepL_translation"]) and row["deepL_translation"].strip() != "" else row["merged_text"],
    axis=1
)

# Drop the now-unnecessary columns "deepL_translation" and "merged_text"
df_deepl = df_deepl[["prdtypecode", "text"]]

# Display the first few rows of the updated DataFrame (optional)
print(df_deepl.head())


   prdtypecode                                               text
0         2583  Cascade Mamba LED Ubbink La cascade Mamba LED ...
1         2280  L'animateur Des Temps Nouveaux N° 276 Du 19/6/...
2         2280  Char e Mensuel N°76 - Journal Plein D'humour E...
3         1280                      Doudou Plat L in Parme Tex - 
4         2403  Lot De 3 Livres De Stephenie Meyer Et Alex F n...


In [13]:
print(df_deepl.shape)

(85540, 2)


## Feature & Target Data Split

In [14]:
# Split the DataFrame into features (X) and target (y)
X = df_deepl[["text"]]  # Feature DataFrame containing only the "text" column
y = df_deepl["prdtypecode"]  # Target Series containing the "prdtypecode" column

# Optional: Check the first few rows of X and y
print(X.head())
print(y.head())

                                                text
0  Cascade Mamba LED Ubbink La cascade Mamba LED ...
1  L'animateur Des Temps Nouveaux N° 276 Du 19/6/...
2  Char e Mensuel N°76 - Journal Plein D'humour E...
3                      Doudou Plat L in Parme Tex - 
4  Lot De 3 Livres De Stephenie Meyer Et Alex F n...
0    2583
1    2280
2    2280
3    1280
4    2403
Name: prdtypecode, dtype: int64


## Preprocessing & Test / Train Split

In [15]:
"""CountVectorizer automatically tokenizes the text, no custom tokenization has been applied."""


# Step 1: Preprocessing 
# Function to normalize accented characters
def normalize_text(text):
    # Convert accented characters to their base forms
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

# Apply normalization to the "text" column
X["text"] = X["text"].apply(normalize_text)

# Step 2: Split the data
X_train, X_test, y_train, y_test = train_test_split(X["text"], y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"] = X["text"].apply(normalize_text)


## First Pipeline: Only CountVectorizer, LogReg & GridSearch

In [None]:
# 46m 8.0s 18.04.2025 

"""THIS WAS JUST AN INTERMEDIATE VERSION OF THE PIPLINE"""

"""
# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Vectorization
    ('model', LogisticRegression())    # Step 2: Logistic Regression
])

# Define the parameter grid
param_grid = {
    # Parameters for CountVectorizer
    'vectorizer__ngram_range': [(1, 1), (1, 2)],  # Unigrams or unigrams + bigrams
    'vectorizer__max_df': [0.8, 0.9, 1.0],       # Filter very frequent words
    'vectorizer__min_df': [1, 5, 10],            # Filter very rare words
    'vectorizer__max_features': [5000, 10000],   # Limit vocabulary size

    # Parameters for LogisticRegression
    'model__C': [0.1, 1, 10],                    # Regularization strength
    'model__penalty': ['l2'],                    # Regularization type
    'model__solver': ['lbfgs']                   # Solver for Logistic Regression
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on the test set
y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))


"""

Fitting 5 folds for each of 108 candidates, totalling 540 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs', 'vectorizer__max_df': 0.8, 'vectorizer__max_features': 10000, 'vectorizer__min_df': 10, 'vectorizer__ngram_range': (1, 1)}
Best Cross-Validation Score: 0.8685567370699525
              precision    recall  f1-score   support

          10       0.71      0.82      0.76       794
        1140       0.86      0.86      0.86       587
        1160       0.96      0.96      0.96      1099
        1180       0.79      0.77      0.78       193
        1280       0.82      0.80      0.81       681
        1281       0.84      0.78      0.81       390
        1300       0.92      0.93      0.92       473
        1301       0.97      0.91      0.94       111
        1302       0.86      0.80      0.83       254
        1320       0.85      0.87      0.86       509
        1560       0.92      0.90      0.91       464
        1920       0.89      0.90      0.89       215
        1940       0.93      0.86      0.8

## 2nd Pipeline: Vectorizers (Count & TF-IDF), Models (Log Reg, SVM, Xgboost, RF), GridSearch

In [7]:
# 2,585 Minutes 21.04.2025

"""THIS MODEL SAVES NO INTERMEDIATE RESULTS, PROCESS CANCELLED AFTER 2,585 MINUTES WITH NO RESULTS
(n_jobs=3)"""


"""


# Get the list of French stop words because the TfidfVectorizer in scikit-learn 
# does not have built-in support for French stop words. 
french_stop_words = stopwords.words('french')

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Placeholder for vectorizer (will be replaced in grid search)
    ('model', LogisticRegression())    # Placeholder for model (will be replaced in grid search)
])

# Define the parameter grid
param_grid = [
    # Logistic Regression with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],  # Use CountVectorizer
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],  # Unigrams or unigrams + bigrams
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],  # Regularization strength
        'model__solver': ['lbfgs']
    },
    # Logistic Regression with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],  # Use TfidfVectorizer
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs']
    },
    # SVM with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # SVM with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # XGBoost with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # XGBoost with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # Random Forest with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    },
    # Random Forest with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    }
]

# Perform grid search with weighted F1-score as the scoring metric
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=3)#n_jobs=3 to keep from crashing (3 = half of my cores (Robert))
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Weighted F1):", grid_search.best_score_)

# Evaluate on the test set using weighted F1-score
y_pred = grid_search.best_estimator_.predict(X_test)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Set Weighted F1-Score:", weighted_f1)

# Optional: Detailed classification report
print(classification_report(y_test, y_pred))


"""

Fitting 5 folds for each of 528 candidates, totalling 2640 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

## 3. PIPELINE WITH CHECKPOINTS: Vectorizers (Count & TF-IDF), Models (Log Reg, SVM, Xgboost, RF), GridSearch

In [17]:
"""THE FOLLOWING CODE TOOK +1,000 MINUTES TO RUN AND WAS CANCELLED. NEXT STEP EXTERIORIZE VECTORIZATION FROM PIPELINE"""

"""
import os
import json
import joblib
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Get the list of French stop words
french_stop_words = stopwords.words('french')

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Placeholder for vectorizer
    ('model', LogisticRegression())    # Placeholder for model
])

# Define the parameter grid
param_grid = [
    # (Your parameter grid remains unchanged)
    # Logistic Regression with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs']
    },

    # Logistic Regression with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],  # Use TfidfVectorizer
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs']
    },
    # SVM with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # SVM with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    # XGBoost with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # XGBoost with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [XGBClassifier()],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    # Random Forest with CountVectorizer
    {
        'vectorizer': [CountVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    },
    # Random Forest with TfidfVectorizer
    {
        'vectorizer': [TfidfVectorizer()],
        'vectorizer__stop_words': [french_stop_words],  # Use French stop words
        'vectorizer__ngram_range': [(1, 1), (1, 2)],
        'vectorizer__max_df': [0.8, 0.9],
        'vectorizer__min_df': [1, 5],
        'model': [RandomForestClassifier()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20]
    }
]

# Directory to save checkpoints and log file
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)
log_file = os.path.join(checkpoint_dir, "progress_log.json")

# Load progress log if it exists
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        completed_params = json.load(f)
else:
    completed_params = []

# Custom scorer for weighted F1 score
weighted_f1_scorer = make_scorer(f1_score, average='weighted')

# Iterate over parameter combinations
for i, params in enumerate(param_grid):
    checkpoint_file = os.path.join(checkpoint_dir, f"checkpoint_{i}.pkl")
    
    # Skip if this parameter combination has already been processed
    if i in completed_params:
        print(f"Skipping parameter combination {i}, already completed.")
        continue

    try:
        # Perform grid search for this parameter combination
        print(f"Processing parameter combination {i}...")
        grid_search = GridSearchCV(
            pipeline, [params], cv=5, scoring=weighted_f1_scorer, verbose=3, n_jobs=5
        )
        grid_search.fit(X_train, y_train)
        
        # Save the best model for this parameter combination
        joblib.dump(grid_search.best_estimator_, checkpoint_file)
        print(f"Saved checkpoint for parameter combination {i}.")
        
        # Print intermediate results (weighted F1 score)
        print(f"Weighted F1 Score for parameter combination {i}: {grid_search.best_score_}")
        
        # Update progress log
        completed_params.append(i)
        with open(log_file, "w") as f:
            json.dump(completed_params, f)
    except Exception as e:
        print(f"Error with parameter combination {i}: {e}")

# Evaluate the best model on the test set
best_model_file = os.path.join(checkpoint_dir, f"checkpoint_{completed_params[-1]}.pkl")
best_model = joblib.load(best_model_file)
y_pred = best_model.predict(X_test)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Set Weighted F1-Score:", weighted_f1)

# Optional: Detailed classification report
print(classification_report(y_test, y_pred))

"""

Processing parameter combination 0...
Fitting 5 folds for each of 24 candidates, totalling 120 fits


Python(44819) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44820) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44821) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44822) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(44823) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(parti

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=0.1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.9, vectorizer__min_df=5, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 1/5] END model=LogisticRegression(), model__C=1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient',

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 2/5] END model=LogisticRegression(), model__C=1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient',

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 3/5] END model=LogisticRegression(), model__C=1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient',

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 4/5] END model=LogisticRegression(), model__C=1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient',

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

[CV 5/5] END model=LogisticRegression(), model__C=1, model__solver=lbfgs, vectorizer=CountVectorizer(), vectorizer__max_df=0.8, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient',

KeyboardInterrupt: 

## 4. Externalization of Vectorization: Vectorizers (Count & TF-IDF), Models (Log Reg, SVM, Xgboost, RF), GridSearch And Checkpoints

### Vectorization

In [16]:
# Directory to save vectorized data and checkpoints
vectorized_data_dir = "vectorized_data"
checkpoint_dir = "checkpoints"
os.makedirs(vectorized_data_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Precompute and save vectorized data
print("Vectorizing data...")
french_stop_words = stopwords.words('french')

# CountVectorizer
count_vectorizer = CountVectorizer(
    stop_words=french_stop_words,
    ngram_range=(1, 2),  # Unigrams and bigrams
    max_df=0.8,
    min_df=5
)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)
joblib.dump(X_train_count, os.path.join(vectorized_data_dir, "X_train_count.pkl"))
joblib.dump(X_test_count, os.path.join(vectorized_data_dir, "X_test_count.pkl"))
joblib.dump(count_vectorizer, os.path.join(vectorized_data_dir, "count_vectorizer.pkl"))

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words=french_stop_words,
    ngram_range=(1, 2),  # Unigrams and bigrams
    max_df=0.8,
    min_df=5
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
joblib.dump(X_train_tfidf, os.path.join(vectorized_data_dir, "X_train_tfidf.pkl"))
joblib.dump(X_test_tfidf, os.path.join(vectorized_data_dir, "X_test_tfidf.pkl"))
joblib.dump(tfidf_vectorizer, os.path.join(vectorized_data_dir, "tfidf_vectorizer.pkl"))

print("Vectorization complete and data saved.")

Vectorizing data...
Vectorization complete and data saved.


### Paramter Search

In [19]:
#239 Minutes 22.04.2025


# Define the parameter grid
param_grid = [
    # Logistic Regression with CountVectorizer
    {
        'vectorizer_type': ['count'],  # Use precomputed CountVectorizer data
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs']
    },
    # Logistic Regression with TfidfVectorizer
    {
        'vectorizer_type': ['tfidf'],  # Use precomputed TfidfVectorizer data
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['lbfgs']
    },
    # SVM with CountVectorizer
    {
        'vectorizer_type': ['count'],
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    # SVM with TfidfVectorizer
    {
        'vectorizer_type': ['tfidf'],
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    # XGBoost with CountVectorizer
    {
        'vectorizer_type': ['count'],
        'classifier': [XGBClassifier()],
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    # XGBoost with TfidfVectorizer
    {
        'vectorizer_type': ['tfidf'],
        'classifier': [XGBClassifier()],
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    },
    # Random Forest with CountVectorizer
    {
        'vectorizer_type': ['count'],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20]
    },
    # Random Forest with TfidfVectorizer
    {
        'vectorizer_type': ['tfidf'],
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20]
    }
]

# Load progress log if it exists
log_file = os.path.join(checkpoint_dir, "progress_log.json")
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        completed_params = json.load(f)
else:
    completed_params = []

# Iterate over parameter combinations
for i, params in enumerate(param_grid):
    checkpoint_file = os.path.join(checkpoint_dir, f"checkpoint_{i}.pkl")
    
    # Skip if this parameter combination has already been processed
    if i in completed_params:
        print(f"Skipping parameter combination {i}, already completed.")
        continue

    try:
        print(f"Processing parameter combination {i}...")

        # Load the appropriate vectorized data based on vectorizer_type
        vectorizer_type = params.get('vectorizer_type', [None])[0]  # Extract the first element of the list
        if vectorizer_type == 'count':
            X_train_vec = joblib.load(os.path.join(vectorized_data_dir, "X_train_count.pkl"))
            X_test_vec = joblib.load(os.path.join(vectorized_data_dir, "X_test_count.pkl"))
        elif vectorizer_type == 'tfidf':
            X_train_vec = joblib.load(os.path.join(vectorized_data_dir, "X_train_tfidf.pkl"))
            X_test_vec = joblib.load(os.path.join(vectorized_data_dir, "X_test_tfidf.pkl"))
        else:
            raise ValueError(f"Unknown vectorizer type: {vectorizer_type}")

        # Define the pipeline
        pipeline = Pipeline([
            ('classifier', params['classifier'][0])  # Dynamically set the classifier
        ])

        # Define the weighted F1 scorer
        weighted_f1_scorer = make_scorer(f1_score, average='weighted')
        
        # Remove 'classifier' and 'vectorizer_type' keys from params for GridSearchCV
        grid_params = {k: v for k, v in params.items() if k not in ['classifier', 'vectorizer_type']}

        # Perform grid search
        grid_search = GridSearchCV(
            pipeline,
            [grid_params],  # Pass the adjusted parameter grid
            cv=5, scoring=weighted_f1_scorer, verbose=1, n_jobs=5
        )
        grid_search.fit(X_train_vec, y_train)
        
        # Save the best model for this parameter combination
        joblib.dump(grid_search.best_estimator_, checkpoint_file)
        print(f"Saved checkpoint for parameter combination {i}.")
        
        # Print intermediate results (weighted F1 score)
        print(f"Weighted F1 Score for parameter combination {i}: {grid_search.best_score_}")
        
        # Update progress log
        completed_params.append(i)
        with open(log_file, "w") as f:
            json.dump(completed_params, f)
    except Exception as e:
        print(f"Error with parameter combination {i}: {e}")

# Evaluate the best model on the test set
if not completed_params:
    print("No parameter combinations were successfully processed. Skipping evaluation.")
else:
    best_model_file = os.path.join(checkpoint_dir, f"checkpoint_{completed_params[-1]}.pkl")
    best_model = joblib.load(best_model_file)
    y_pred = best_model.predict(X_test_vec)
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print("Test Set Weighted F1-Score:", weighted_f1)

    # Optional: Detailed classification report
    print(classification_report(y_test, y_pred))

Processing parameter combination 0...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 0.
Weighted F1 Score for parameter combination 0: nan
Processing parameter combination 1...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 1.
Weighted F1 Score for parameter combination 1: nan
Processing parameter combination 2...
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 2.
Weighted F1 Score for parameter combination 2: nan
Processing parameter combination 3...
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 3.
Weighted F1 Score for parameter combination 3: nan
Processing parameter combination 4...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Error with parameter combination 4: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/

Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 6.
Weighted F1 Score for parameter combination 6: nan
Processing parameter combination 7...
Fitting 5 folds for each of 6 candidates, totalling 30 fits


Traceback (most recent call last):
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classification-of-rakuten-e-commerce-products/myenv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/robertwilson/Documents/GitHub/feb25_bds_classi

Saved checkpoint for parameter combination 7.
Weighted F1 Score for parameter combination 7: nan
Test Set Weighted F1-Score: 0.7776850467815705
              precision    recall  f1-score   support

          10       0.43      0.60      0.50       686
          40       0.71      0.68      0.69       537
          50       0.85      0.70      0.76       345
          60       0.98      0.86      0.92       166
        1140       0.75      0.81      0.78       590
        1160       0.90      0.92      0.91       769
        1180       0.80      0.53      0.64       162
        1280       0.66      0.57      0.61       981
        1281       0.63      0.55      0.59       399
        1300       0.82      0.91      0.86      1032
        1301       0.98      0.87      0.92       148
        1302       0.86      0.60      0.71       471
        1320       0.80      0.72      0.76       662
        1560       0.71      0.78      0.74      1029
        1920       0.90      0.92      0.91  

In [20]:
# Get all parameters of the pipeline
pipeline_params = best_model.get_params()
print("Pipeline Parameters:", pipeline_params)

Pipeline Parameters: {'memory': None, 'steps': [('classifier', RandomForestClassifier())], 'transform_input': None, 'verbose': False, 'classifier': RandomForestClassifier(), 'classifier__bootstrap': True, 'classifier__ccp_alpha': 0.0, 'classifier__class_weight': None, 'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__max_samples': None, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__min_weight_fraction_leaf': 0.0, 'classifier__monotonic_cst': None, 'classifier__n_estimators': 100, 'classifier__n_jobs': None, 'classifier__oob_score': False, 'classifier__random_state': None, 'classifier__verbose': 0, 'classifier__warm_start': False}


In [21]:
print(grid_search.best_params_)

{'classifier__max_depth': None, 'classifier__n_estimators': 100}


In [22]:
best_model = joblib.load("checkpoints/checkpoint_0.pkl")
print(best_model)

Pipeline(steps=[('classifier', LogisticRegression(C=0.1))])


Delete Vectorized Data if needed (to re-run with new input file)

In [8]:
# Delete existing vectorized data
for file_name in ["X_train_count.pkl", "X_test_count.pkl", "X_train_tfidf.pkl", "X_test_tfidf.pkl", 
                  "count_vectorizer.pkl", "tfidf_vectorizer.pkl"]:
    file_path = os.path.join(vectorized_data_dir, file_name)
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted old vectorized data: {file_name}")

NameError: name 'vectorized_data_dir' is not defined

Delete all checkpoints (to re-run from scratch)

In [58]:
# Directory where checkpoints and progress logs are saved
checkpoint_dir = "checkpoints"

# Delete all files in the checkpoints directory
if os.path.exists(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)  # Remove the entire directory
    print(f"Deleted all files in the '{checkpoint_dir}' directory.")

# Recreate the empty checkpoints directory
os.makedirs(checkpoint_dir, exist_ok=True)
print(f"Recreated the empty '{checkpoint_dir}' directory.")

Deleted all files in the 'checkpoints' directory.
Recreated the empty 'checkpoints' directory.
