2. SGDCClassifier (Approach 1)

In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score

In [2]:
input_path = r"C:\Users\Dell\Documents\final_processed_dataframe.pkl"

# Load the dataframe back
df = pd.read_pickle(input_path, compression="gzip")

print("DataFrame loaded successfully!")
print(df.shape)

DataFrame loaded successfully!
(2680364, 17)


In [3]:
df.columns

Index(['example_id', 'query', 'query_id', 'product_id', 'product_locale_x',
       'esci_label', 'small_version', 'large_version', 'split',
       'product_title', 'product_description', 'product_bullet_point',
       'product_brand', 'product_color', 'product_locale_y', 'product_text',
       'lemmatized_text'],
      dtype='object')

Stratified Sampling - 40% : 40% : 20% split

In [4]:
# First split off 20% for final testing
df_temp, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['esci_label'],
    random_state=42
)

In [5]:
# Now split remaining 80% into 40% + 40%
df_train1, df_train2 = train_test_split(
    df_temp,
    test_size=0.5,
    stratify=df_temp['esci_label'],
    random_state=43
)

In [6]:
print(f"Batch Sizes:\nTrain1: {df_train1.shape}\nTrain2: {df_train2.shape}\nTest: {df_test.shape}")

Batch Sizes:
Train1: (1072145, 17)
Train2: (1072146, 17)
Test: (536073, 17)


In [10]:
# Creating the pipeline - Tfidf + SGDClassifier
model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=100000,
        ngram_range=(1, 2),
        sublinear_tf=True
    )),
    ('clf', SGDClassifier(
        loss='log_loss',             # Logistic regression objective
        penalty='l2',
        alpha=1e-4,                  # ~ inverse of C
        class_weight='balanced',
        max_iter=100,
        tol=1e-3,
        n_jobs=-1,
        random_state=42
    ))
])

In [11]:
# ---- 1st batch (fit from scratch) ----
X1, y1 = df_train1['lemmatized_text'], df_train1['esci_label']
model.fit(X1, y1)   # This fits both TF-IDF and classifier

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,100
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [12]:
# ---- 2nd batch (incremental update) ----
X2, y2 = df_train2['lemmatized_text'], df_train2['esci_label']

# Vectorize with the existing TF-IDF vocabulary
X2_vec = model.named_steps['tfidf'].transform(X2)
clf = model.named_steps['clf']
clf.partial_fit(X2_vec, y2)

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,100
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [14]:
# Testing on the final 20%
X_test_vec = model.named_steps['tfidf'].transform(df_test['lemmatized_text'])
y_test = df_test['esci_label']

y_pred = clf.predict(X_test_vec)

In [15]:
y_pred = clf.predict(X_test_vec)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.638621978723047

Classification Report:
               precision    recall  f1-score   support

           C       0.13      0.14      0.14     15537
           E       0.65      0.97      0.78    348821
           I       0.55      0.01      0.02     53966
           S       0.33      0.00      0.00    117749

    accuracy                           0.64    536073
   macro avg       0.42      0.28      0.23    536073
weighted avg       0.56      0.64      0.52    536073



2. SGDCClassifier (Approach 2)

In [4]:
df["esci_label"].value_counts() 

esci_label
E    1744103
S     588743
I     269833
C      77685
Name: count, dtype: int64

In [3]:
# Dividing the Dataset on the basis of different esci_label
df1 = df[df['esci_label'] == 'E']
df2 = df[df['esci_label'] != 'E']
print(df1.shape)
print(df2.shape)

df1_train, df1_test = train_test_split(df1, test_size=0.2, random_state=42)
df2_train, df2_test = train_test_split(df2, test_size=0.2, stratify=df2['esci_label'], random_state=43)

# # Get class counts
# min_class_size = df2_train['esci_label'].value_counts().min()

# # Downsample E
# df1_train_down = resample(df1_train, replace=False, n_samples=min_class_size, random_state=42)
# df2_train_down = resample(df2_train, replace=False, n_samples=min_class_size, random_state=321)

# # Combine balanced data
# df_balanced = pd.concat([df1_train_down, df2_train_down])
# print(df_balanced['esci_label'].value_counts())


(1744103, 17)
(936261, 17)


In [24]:
a = df_balanced[df_balanced['esci_label'] =='C']
b = df_balanced[df_balanced['esci_label'] !='C']

max_class_size = b['esci_label'].value_counts().max()

a_up = resample(a, n_samples=max_class_size, random_state=42)

df_final = pd.concat([a_up, b])
print(df_final['esci_label'].value_counts())

esci_label
C    62148
E    62148
S    39198
I    17740
Name: count, dtype: int64


In [8]:
# Combine test sets
df_test = pd.concat([df1_test, df2_test])

In [28]:
model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=200000,
        ngram_range=(1, 3),
        sublinear_tf=True,
        analyzer='word'
    )),
    ('clf', SGDClassifier(
        loss='log_loss',
        alpha=1e-5,
        penalty='elasticnet',
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ))
])

In [29]:
model.fit(df_final['lemmatized_text'], df_final['esci_label'])

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [30]:
y_pred = model.predict(df_test['lemmatized_text'])

print("\nAccuracy:", accuracy_score(df_test['esci_label'], y_pred))
print("\nClassification Report:\n", classification_report(df_test['esci_label'], y_pred))


Accuracy: 0.2502042628443088

Classification Report:
               precision    recall  f1-score   support

           C       0.03      0.73      0.07     15537
           E       0.72      0.31      0.43    348821
           I       0.19      0.08      0.11     53967
           S       0.28      0.09      0.14    117749

    accuracy                           0.25    536074
   macro avg       0.30      0.30      0.19    536074
weighted avg       0.55      0.25      0.32    536074



2. SGDCClassifier (Approach 3)

In [None]:
# ---- Balance Data ----
min_size = df2_train['esci_label'].value_counts().min()
df1_train_down = resample(df1_train, n_samples=min_size, random_state=42, replace=False)
df_balanced = pd.concat([df1_train_down, df2_train])

In [None]:
# ---- TF-IDF ----
tfidf = TfidfVectorizer(
    max_features=200000,
    ngram_range=(1, 3),
    sublinear_tf=True,
    min_df=3,
    max_df=0.9
)
X_train = tfidf.fit_transform(df['lemmatized_text'])
y_train = df['esci_label']

In [6]:
# ---- SGDClassifier ----
clf = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=1e-5,
    l1_ratio=0.15,
    max_iter=2000,
    tol=1e-4,
    learning_rate='optimal',
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,2000
,tol,0.0001
,shuffle,True
,verbose,0
,epsilon,0.1


In [9]:
# ---- Evaluation ----
X_test = tfidf.transform(df_test['lemmatized_text'])
y_pred = clf.predict(X_test)
print("\nAccuracy:", accuracy_score(df_test['esci_label'], y_pred))
print("\nClassification Report:\n", classification_report(df_test['esci_label'], y_pred))


Accuracy: 0.2262821923838873

Classification Report:
               precision    recall  f1-score   support

           C       0.11      0.30      0.16     15537
           E       0.79      0.02      0.03    348821
           I       0.22      0.07      0.11     53967
           S       0.23      0.91      0.36    117749

    accuracy                           0.23    536074
   macro avg       0.34      0.32      0.17    536074
weighted avg       0.59      0.23      0.12    536074



2. SGDCClassifier (Approach 4)

In [None]:
# Dividing the Dataset on the basis of different esci_label
df_E = df[df['esci_label'] == 'E']
df_SI = df[df['esci_label'].isin(['S', 'I'])]
df_C = df[df['esci_label'] == 'C']

print(df_E.shape)
print(df_SI.shape)  
print(df_C.shape)
print(df_E['esci_label'].value_counts())
print(df_SI['esci_label'].value_counts())
print(df_C['esci_label'].value_counts())

In [None]:
# Train-Test Split for each subset
df_E_train, df_E_test = train_test_split(df_E, test_size=0.4, random_state=42)
df_SI_train, df_SI_test = train_test_split(df_SI, test_size=0.2, stratify=df_SI['esci_label'], random_state=43)
df_C_train, df_C_test = train_test_split(df_C, test_size=0.2, random_state=44)

In [None]:
# Combine test sets
df_test = pd.concat([df_E_test, df_SI_test, df_C_test])

In [None]:
# Computing the class weights
classes = np.unique(df['esci_label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df['esci_label'])
class_weights_dict = dict(zip(classes, weights))

In [None]:
model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=150000,
        ngram_range=(1, 3),
        sublinear_tf=True,
        analyzer='word'
    )),
    ('clf', SGDClassifier(
        loss='log_loss',
        alpha=1e-5,
        penalty='elasticnet',
        class_weight=class_weights_dict,
        max_iter=600,
        random_state=42
    ))
])

In [None]:
corpus = pd.concat([df_E_train, df_SI_train, df_C_train])

In [None]:
tfidf = model.named_steps['tfidf']

tfidf.fit(corpus['lemmatized_text'])

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [None]:
X1_vec = tfidf.transform(df_E_train['lemmatized_text'])
X2_vec = tfidf.transform(df_SI_train['lemmatized_text'])
X3_vec = tfidf.transform(df_C_train['lemmatized_text'])

In [None]:
clf = model.named_steps['clf']

# FIRST partial_fit — must include ALL classes
all_classes = np.unique(df['esci_label'])
clf.partial_fit(X3_vec, df_C_train['esci_label'], classes=all_classes)

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,600
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
# SECOND partial_fit — no need to specify classes again
clf.partial_fit(X2_vec, df_SI_train['esci_label'])

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,600
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
# THIRD partial_fit
clf.partial_fit(X1_vec, df_E_train['esci_label'])

0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,600
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
all_classes = np.unique(df['esci_label'])

for epoch in range(3):  # 3 passes for stability
    print(f"Epoch {epoch+1}/3")

    # 1. train small/minority class first
    clf.partial_fit(X3_vec, df_C_train['esci_label'], classes=all_classes)
    clf.partial_fit(X2_vec, df_SI_train['esci_label'])
    clf.partial_fit(X1_vec[:len(df_SI_train)], df_E_train['esci_label'].iloc[:len(df_SI_train)])

In [None]:
# Testing the model 
X_test = tfidf.transform(df_test['lemmatized_text'])
y_test = df_test['esci_label']

y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           C       0.00      0.00      0.00     15537
           E       0.79      1.00      0.88    697642
           I       0.00      0.00      0.00     53967
           S       0.00      0.00      0.00    117749

    accuracy                           0.79    884895
   macro avg       0.20      0.25      0.22    884895
weighted avg       0.62      0.79      0.70    884895



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


2. SGDCClassifier (Approach 5)

In [91]:
# Dividing the Dataset on the basis of different esci_label
df_E = df[df['esci_label'] == 'E']
df_SI = df[df['esci_label'].isin(['S', 'I'])]
df_C = df[df['esci_label'] == 'C']

In [92]:
# Train-Test Split for each subset
df_E_train, df_E_test = train_test_split(df_E, test_size=0.2, random_state=42)
df_SI_train, df_SI_test = train_test_split(df_SI, test_size=0.2, stratify=df_SI['esci_label'], random_state=43)
df_C_train, df_C_test = train_test_split(df_C, test_size=0.2, random_state=44)

In [93]:
df_E_test, _ = train_test_split(df_E, test_size=0.95, random_state=42)

In [94]:
# Downsample larger classes to balance roughly
min_size = min(len(df_E_train), len(df_C_train), len(df_SI_train))
df_E_train_sample = resample(df_E_train, n_samples=min_size, random_state=42)
df_SI_train_sample = resample(df_SI_train, n_samples=min_size, random_state=42)
df_C_train_sample = resample(df_C_train, n_samples=min_size, random_state=42)

In [95]:
df_E_train_new = resample(df_E_train, n_samples=min_size*2, random_state=44)
df_SI_train_new = resample(df_SI_train, n_samples=min_size*2, random_state=44)

In [96]:
# Forming the corpus
corpus = pd.concat([df_E_train_sample, df_SI_train_sample, df_C_train_sample])

In [97]:
# Combine test sets
df_test = pd.concat([df_E_test, df_SI_test, df_C_test])

In [98]:
# Computing the class weights
classes = np.unique(df['esci_label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df['esci_label'])
class_weights_dict = dict(zip(classes, weights))

In [99]:
# Establishing the model pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=100000,
        ngram_range=(1, 2),
        sublinear_tf=True,
        analyzer='word'
    )),
    ('clf', SGDClassifier(
        loss='log_loss',
        alpha=1e-6,
        penalty='elasticnet',
        class_weight=class_weights_dict,
        max_iter=2000,
        random_state=42
    ))
])

In [101]:
# Defining the classifier
clf = model.named_steps['clf']

In [104]:
# Incremental training with SGDClassifier
for epoch in range(3):
    print(f"Epoch {epoch+1}/3")

    # Shuffle and combine each time
    df_train_epoch = pd.concat([df_E_train, df_SI_train, df_C_train]).sample(frac=1, random_state=42+epoch)
    
    X_epoch = tfidf.transform(df_train_epoch['lemmatized_text'])
    y_epoch = df_train_epoch['esci_label']

    if epoch == 0:
        clf.partial_fit(X_epoch, y_epoch, classes=classes)
    else:
        clf.partial_fit(X_epoch, y_epoch)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [105]:
# Testing the model 
X_test = tfidf.transform(df_test['lemmatized_text'])
y_test = df_test['esci_label']

y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           C       0.30      0.24      0.27     15537
           E       0.33      0.93      0.49     87205
           I       0.44      0.08      0.14     53967
           S       0.63      0.05      0.09    117749

    accuracy                           0.35    274458
   macro avg       0.43      0.33      0.25    274458
weighted avg       0.48      0.35      0.24    274458

