In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
import sklearn
import re

# Tasks :
1) Load and preprocess the dataset (clean text, handle missing values).
2) Train a baseline model (e.g., Logistic Regression or LightGBM).
3) Fine-tune a small transformer model (e.g., distilbert-base-uncased) using Hugging Face.
4) Evaluate models using accuracy + F1 score.
5) Compare results and explain which model you’d use in production and why.


In [2]:
# load and preprocess the dataset(clean text, handle missing values).

data = pd.read_csv("reply_classification_dataset.csv")
print(data.head())

                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive


In [3]:
# Shape of the data
data.shape

(2129, 2)

In [4]:
# Data Description
data.describe()

Unnamed: 0,reply,label
count,2129,2129
unique,321,9
top,Not open to vendors,neutral
freq,56,704


In [5]:
# Data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB


In [6]:
#Checking the null values
data.isnull().sum()

reply    0
label    0
dtype: int64

In [7]:
# Checking the value counts for the label
print(data['label'].value_counts())

label
neutral     704
positive    446
NEGATIVE    267
POSITIVE    263
Negative    254
negative    189
Neutral       3
NEUTRAL       2
Positive      1
Name: count, dtype: int64


In [8]:
#Preprocess the dataset
def clean_reply(reply):
    reply = reply.lower()                           # lowercase
    reply = re.sub(r"http\S+|www\S+", "", reply)    # remove URLs
    reply = re.sub(r"@\w+|#\w+", "", reply)         # remove mentions/hashtags
    reply = re.sub(r"[^a-z\s]", "", reply)          # remove punctuation/numbers
    reply = re.sub(r"\s+", " ", reply).strip()      # remove extra spaces
    return reply

data["clean_reply"] = data["reply"].apply(clean_reply)

In [9]:
data['label'] = data['label'].str.lower()

In [10]:
print(data['label'].unique())

['neutral' 'positive' 'negative']


In [11]:
data.head()

Unnamed: 0,reply,label,clean_reply
0,Can we discuss pricing??,neutral,can we discuss pricing
1,"Im excited to explore this further, plz send c...",positive,im excited to explore this further plz send co...
2,We not looking for new solutions.,negative,we not looking for new solutions
3,Could u clarify features included?,neutral,could u clarify features included
4,"lets,, schedule a meeting to dive deeper",positive,lets schedule a meeting to dive deeper


In [12]:
data['label'].value_counts()

label
positive    710
negative    710
neutral     709
Name: count, dtype: int64

In [13]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rache\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(reply):
    tokens = nltk.word_tokenize(reply)  # tokenize
    tokens = [t for t in tokens if t not in stop_words]  # remove stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens]   # lemmatization
    return tokens

data["tokens"] = data["clean_reply"].apply(tokenize_and_lemmatize)

In [16]:
data.head(10)

Unnamed: 0,reply,label,clean_reply,tokens
0,Can we discuss pricing??,neutral,can we discuss pricing,"[discus, pricing]"
1,"Im excited to explore this further, plz send c...",positive,im excited to explore this further plz send co...,"[im, excited, explore, plz, send, contract]"
2,We not looking for new solutions.,negative,we not looking for new solutions,"[looking, new, solution]"
3,Could u clarify features included?,neutral,could u clarify features included,"[could, u, clarify, feature, included]"
4,"lets,, schedule a meeting to dive deeper",positive,lets schedule a meeting to dive deeper,"[let, schedule, meeting, dive, deeper]"
5,Please remove me from list,negative,please remove me from list,"[please, remove, list]"
6,"This looks promising, send specs!!",positive,this looks promising send specs,"[look, promising, send, spec]"
7,Ill need to check w/ my team,neutral,ill need to check w my team,"[ill, need, check, w, team]"
8,Were alredy using similar product,negative,were alredy using similar product,"[alredy, using, similar, product]"
9,Looking forward to demo!,positive,looking forward to demo,"[looking, forward, demo]"


In [17]:
#labelEncoder
from sklearn.preprocessing import LabelEncoder

# Encode to numbers
le = LabelEncoder()
data["label_encoded"] = le.fit_transform(data["label"])

print("\nClasses:", le.classes_)


Classes: ['negative' 'neutral' 'positive']


In [18]:
data.head()

Unnamed: 0,reply,label,clean_reply,tokens,label_encoded
0,Can we discuss pricing??,neutral,can we discuss pricing,"[discus, pricing]",1
1,"Im excited to explore this further, plz send c...",positive,im excited to explore this further plz send co...,"[im, excited, explore, plz, send, contract]",2
2,We not looking for new solutions.,negative,we not looking for new solutions,"[looking, new, solution]",0
3,Could u clarify features included?,neutral,could u clarify features included,"[could, u, clarify, feature, included]",1
4,"lets,, schedule a meeting to dive deeper",positive,lets schedule a meeting to dive deeper,"[let, schedule, meeting, dive, deeper]",2


In [19]:
data["text_for_vector"] = data["tokens"].apply(lambda x: " ".join(x))

In [20]:
data.head()

Unnamed: 0,reply,label,clean_reply,tokens,label_encoded,text_for_vector
0,Can we discuss pricing??,neutral,can we discuss pricing,"[discus, pricing]",1,discus pricing
1,"Im excited to explore this further, plz send c...",positive,im excited to explore this further plz send co...,"[im, excited, explore, plz, send, contract]",2,im excited explore plz send contract
2,We not looking for new solutions.,negative,we not looking for new solutions,"[looking, new, solution]",0,looking new solution
3,Could u clarify features included?,neutral,could u clarify features included,"[could, u, clarify, feature, included]",1,could u clarify feature included
4,"lets,, schedule a meeting to dive deeper",positive,lets schedule a meeting to dive deeper,"[let, schedule, meeting, dive, deeper]",2,let schedule meeting dive deeper


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["text_for_vector"])
y = data["label_encoded"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1596, 186)
(1596,)
(533, 186)
(533,)


In [23]:
#Logistic regression
from sklearn.linear_model import LogisticRegression #Classification
from sklearn.metrics import accuracy_score, classification_report

In [24]:
clf = LogisticRegression()
clf.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [25]:
# Predict the Label
data.tail(2)

Unnamed: 0,reply,label,clean_reply,tokens,label_encoded,text_for_vector
2127,What exactly does your product do?,neutral,what exactly does your product do,"[exactly, product]",1,exactly product
2128,I am not the right person to contact.,negative,i am not the right person to contact,"[right, person, contact]",0,right person contact


In [26]:
# Test the trained model
print("Label:", clf.predict(vectorizer.transform(["I am not the right person to contact."])))
print("Label:", clf.predict(vectorizer.transform(["What exactly does your product do?"])))

Label: [0]
Label: [1]


In [27]:
y_pred = clf.predict(X_test)
y_pred

array([2, 0, 2, 1, 2, 2, 0, 1, 1, 0, 0, 1, 2, 0, 2, 2, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 2, 2, 1, 2, 2, 2, 0, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 0,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 2, 0, 2, 1, 0, 0, 2, 2, 1, 1, 0, 2,
       0, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2, 2, 0, 0, 0, 1, 1, 2,
       0, 2, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 0,
       2, 0, 1, 2, 1, 2, 1, 1, 0, 2, 0, 0, 1, 2, 1, 2, 1, 0, 2, 2, 0, 1,
       0, 2, 2, 2, 2, 1, 0, 1, 0, 1, 2, 2, 1, 1, 2, 0, 2, 1, 2, 2, 0, 0,
       0, 1, 2, 1, 1, 2, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 1, 2, 0, 1, 0, 2,
       1, 0, 0, 0, 2, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2,
       2, 0, 1, 2, 2, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 1, 0, 0, 2, 1, 1, 0,
       1, 0, 1, 0, 2, 0, 2, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 1, 2, 2, 2, 0,
       2, 2, 0, 2, 2, 2, 0, 2, 0, 1, 2, 1, 1, 0, 1, 0, 2, 0, 1, 1, 2, 2,
       1, 1, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2,
       1, 0, 0, 1, 2, 1, 0, 2, 0, 0, 1, 1, 1, 1, 1,

In [28]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

def metrics_score(clf,X_train,X_test,y_train,y_test,train=True):
    if train:
        y_pred=clf.predict(X_train)
        print("==================Training Score=================")
        print("Accuracy Score == >  ", accuracy_score(y_train,y_pred)*100)
    elif train==False:
        pred=clf.predict(X_test)
        print("=================Test Score======================")
        print("Accuracy Score==>  ",accuracy_score(y_test,pred)*100)
        print("Classification Report",classification_report(y_test,pred))

In [29]:
#checking train and test score
metrics_score(clf,X_train,X_test,y_train,y_test,train=True)
metrics_score(clf,X_train,X_test,y_train,y_test,train=False)

Accuracy Score == >   100.0
Accuracy Score==>   99.62476547842401
Classification Report               precision    recall  f1-score   support

           0       0.99      0.99      0.99       188
           1       1.00      0.99      1.00       168
           2       0.99      1.00      1.00       177

    accuracy                           1.00       533
   macro avg       1.00      1.00      1.00       533
weighted avg       1.00      1.00      1.00       533



In [38]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score
from datasets import Dataset # Import Dataset

# 1. Load tokenizer and model for 3 classes
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Remove the original 'label' column as it's not needed for training
tokenized_datasets = tokenized_datasets.remove_columns(["label"])

# Rename the 'label_encoded' column to 'label' for the Trainer
tokenized_datasets = tokenized_datasets.rename_column("label_encoded", "label")


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none" # Disable wandb logging
)

# 6. Compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, preds)}

# 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"], # Use the tokenized train dataset
    eval_dataset=tokenized_datasets["test"],   # Use the tokenized test dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer # Pass the tokenizer
)

# 8. Train
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.017702,0.995305
2,No log,0.012978,0.995305
3,No log,0.01098,0.997653




TrainOutput(global_step=321, training_loss=0.10803158483772635, metrics={'train_runtime': 3654.0001, 'train_samples_per_second': 1.398, 'train_steps_per_second': 0.088, 'total_flos': 169197002267904.0, 'train_loss': 0.10803158483772635, 'epoch': 3.0})

In [40]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# -----------------------------
#  Logistic Regression Metrics
# -----------------------------
def evaluate_logreg(clf, X_train, X_test, y_train, y_test):
    # Predictions
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    # Metrics
    metrics = {
        "Train Accuracy": accuracy_score(y_train, y_train_pred),
        "Train F1": f1_score(y_train, y_train_pred, average="macro"),
        "Test Accuracy": accuracy_score(y_test, y_test_pred),
        "Test F1": f1_score(y_test, y_test_pred, average="macro")
    }
    return metrics

logreg_metrics = evaluate_logreg(clf, X_train, X_test, y_train, y_test)
print("=== Logistic Regression Metrics ===")
for k,v in logreg_metrics.items():
    print(f"{k}: {v:.4f}")

# -----------------------------
# DistilBERT Metrics
# -----------------------------
from datasets import Dataset
import torch

# Helper to get predictions from Hugging Face Trainer
def evaluate_transformer(trainer, tokenized_dataset):
    outputs = trainer.predict(tokenized_dataset)
    preds = np.argmax(outputs.predictions, axis=-1)
    labels = outputs.label_ids

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return acc, f1

# Evaluate on train and test splits
acc_train, f1_train = evaluate_transformer(trainer, tokenized_datasets["train"])
acc_test, f1_test = evaluate_transformer(trainer, tokenized_datasets["test"])

print("\n=== DistilBERT Metrics ===")
print(f"Train Accuracy: {acc_train:.4f}, Train F1: {f1_train:.4f}")
print(f"Test Accuracy: {acc_test:.4f}, Test F1: {f1_test:.4f}")

# -----------------------------
# Compare side by side
# -----------------------------
import pandas as pd

comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "DistilBERT"],
    "Train Accuracy": [logreg_metrics["Train Accuracy"], acc_train],
    "Train F1": [logreg_metrics["Train F1"], f1_train],
    "Test Accuracy": [logreg_metrics["Test Accuracy"], acc_test],
    "Test F1": [logreg_metrics["Test F1"], f1_test]
})

print("\n=== Model Comparison ===")
print(comparison)


=== Logistic Regression Metrics ===
Train Accuracy: 1.0000
Train F1: 1.0000
Test Accuracy: 0.9962
Test F1: 0.9963







=== DistilBERT Metrics ===
Train Accuracy: 1.0000, Train F1: 1.0000
Test Accuracy: 0.9977, Test F1: 0.9978

=== Model Comparison ===
                 Model  Train Accuracy  Train F1  Test Accuracy   Test F1
0  Logistic Regression             1.0       1.0       0.996248  0.996293
1           DistilBERT             1.0       1.0       0.997653  0.997755


In [42]:
import pickle
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

def save_models(logreg_model, transformer_model, transformer_tokenizer,
                logreg_path="logistic_regression_model.pkl",
                transformer_path="./distilbert_outsales_model"):


    # Save Logistic Regression
    with open(logreg_path, "wb") as f:
        pickle.dump(logreg_model, f)
    print(f"Logistic Regression saved to {logreg_path}")

    # Save DistilBERT model and tokenizer
    transformer_model.save_pretrained(transformer_path)
    transformer_tokenizer.save_pretrained(transformer_path)
    print(f"DistilBERT model and tokenizer saved to {transformer_path}")


save_models(
    logreg_model=clf,
    transformer_model=model,
    transformer_tokenizer=tokenizer,
    logreg_path="logreg_outsales.pkl",
    transformer_path="./distilbert_outsales"
)


Logistic Regression saved to logreg_outsales.pkl
DistilBERT model and tokenizer saved to ./distilbert_outsales


### 5) 
* Both models performed better based on Accuracy and f1 score.
* **Logistic Regression** is the simple Machine Algorithm, it performed well with best accuracy, it only requires cpu computation time, but for production, this takes manual work for preprocessing the dataset, if it is small dataset we can manage it, but when it comes to Larger dataset the text preprocess might complicated, because we might br lose some essential informations from the data and computation time might be higher and it also requires GPU for the model training. It is not suitable for the production.
  
* **Distilbert base uncased** model is a Hugging Face Transformer a pretrained model is used. This is suitable for the production, no matter how larger the dataset. It automatically preprocess the input since we load the pretrained model, everything process done in the backend so the manual work is not much needed, compare to logistic regression. Since it is a transformer we need GPU but execution time is fast. Also the training loss is very low, so it gives accurate prediction without getting overfitting problem.
    