<a href="https://colab.research.google.com/github/Pakhi424/AI-Cold-Calling-Assistant/blob/main/Part_A_ML_Pipeline_%E2%80%93_Reply_Classification_(Baseline_%2B_Transformer).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install necessary libraries
!pip install --quiet transformers datasets torch scikit-learn pandas

from google.colab import files
uploaded = files.upload()  # Upload reply_classification_dataset.csv here

import pandas as pd

# Assuming your dataset name is reply_classification_dataset.csv
df = pd.read_csv("reply_classification_dataset.csv")
print("Columns in dataset:", df.columns)
print(df.head())

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split dataset
X = df['reply']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Convert pandas df to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df)

# Tokenizer & Model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['label'].unique()))

def tokenize(batch):
    return tokenizer(batch['reply'], padding=True, truncation=True)

hf_dataset = hf_dataset.map(tokenize, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=10,
    eval_strategy="no"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset
)

# Train (optional, can skip if you just want predictions)
# trainer.train()

# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load dataset
from google.colab import files
uploaded = files.upload()  # upload 'reply_classification_dataset.csv'

df = pd.read_csv('reply_classification_dataset.csv')
print("Columns in dataset:", df.columns)
print(df.head())

# Step 3: Preprocess data
df['reply'] = df['reply'].astype(str)
df['label'] = df['label'].str.lower()  # normalize labels

X = df['reply']
y = df['label']

# Step 4: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 6: Train Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Step 7: Predict & evaluate
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Normalize labels to lowercase
df['label'] = df['label'].str.lower()

# Now split, vectorize, and train as before
X = df['reply']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Saving reply_classification_dataset.csv to reply_classification_dataset (3).csv
Columns in dataset: Index(['reply', 'label'], dtype='object')
                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive
Accuracy: 0.8427230046948356
              precision    recall  f1-score   support

    NEGATIVE       0.66      0.84      0.74        58
     NEUTRAL       0.00      0.00      0.00         1
    Negative       0.75      0.68      0.71        53
    POSITIVE       0.76      0.76      0.76        49
    negative       0.88      0.59      0.71        39
     neutral       0.99      1.00      0.99       135
    positive       0.86      0.87      0.86        91

    accuracy     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2129 [00:00<?, ? examples/s]

Saving reply_classification_dataset.csv to reply_classification_dataset (4).csv
Columns in dataset: Index(['reply', 'label'], dtype='object')
                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive
Accuracy: 0.9953051643192489

Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.99      0.99       150
     neutral       0.99      1.00      1.00       136
    positive       0.99      1.00      1.00       140

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426

Accuracy: 0.9953051643192489

Classification 