In [6]:
# Install dependencies (if not already installed)
!pip install scikit-learn lightgbm joblib

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# ========================
# STEP 1: Load Dataset
# ========================
# Upload the CSV manually through the Colab file picker.
from google.colab import files
uploaded = files.upload()

# `uploaded` maps each uploaded file name to its content. If nothing was
# uploaded (e.g. the dialog was dismissed) the mapping is empty, so fail with
# an actionable message instead of an opaque IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select the dataset CSV."
    )

DATA_PATH = next(iter(uploaded))  # name of the first uploaded file
df = pd.read_csv(DATA_PATH)

print("Dataset preview:")
print(df.head())

# ========================
# STEP 2: Clean & Split
# ========================
# Drop rows missing either the text or the label. A missing label would
# otherwise be stringified to "nan" below and be treated as a real class.
df = df.dropna(subset=["reply", "label"])

# Labels appear in mixed case in the raw data (e.g. "NEUTRAL" / "neutral").
# Normalize them once and use this same series for both counting and
# filtering so the two steps cannot disagree.
labels_normalized = df["label"].astype(str).str.lower()

# Print unique labels and their counts
print("\nLabel counts:")
label_counts = labels_normalized.value_counts()
print(label_counts)

# Remove labels with only one instance: the stratified split below needs at
# least two samples per class. The mask is built from the normalized labels;
# comparing the raw column against the lowercase names would silently keep
# rows whose raw label differs only in case (e.g. a single "SPAM").
labels_to_remove = label_counts[label_counts < 2].index
df = df[~labels_normalized.isin(labels_to_remove)]

# Features and targets taken from the filtered frame
X = df["reply"].astype(str)
y = df["label"].astype(str).str.lower()


print("Labels:", y.unique())

# 80/20 split, stratified on the label so class proportions are preserved;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ========================
# STEP 3: TF-IDF + Logistic Regression
# ========================
# Vectorize the text: the vocabulary is fitted on the training split only and
# then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Train the classifier (fit() returns the estimator itself) and predict on
# the held-out split.
logreg = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred = logreg.predict(X_test_vec)

# Report held-out performance
print("\n=== Logistic Regression Performance ===")
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", logreg_accuracy)
logreg_macro_f1 = f1_score(y_test, y_pred, average="macro")
print("Macro F1:", logreg_macro_f1)
print(classification_report(y_test, y_pred))

# Persist the fitted vectorizer and model; both are needed at inference time.
os.makedirs("models", exist_ok=True)
for artifact, artifact_path in (
    (tfidf, "models/tfidf_vectorizer.pkl"),
    (logreg, "models/logreg_model.pkl"),
):
    joblib.dump(artifact, artifact_path)

print("✅ Logistic Regression model & vectorizer saved to 'models/'")

# ========================
# STEP 4: LightGBM (Optional)
# ========================
# Gradient-boosted alternative trained on the same TF-IDF features with
# default hyperparameters (fit() returns the estimator itself).
lgbm = lgb.LGBMClassifier().fit(X_train_vec, y_train)
y_pred_lgbm = lgbm.predict(X_test_vec)

# Report held-out performance using the same metrics as the logistic model
print("\n=== LightGBM Performance ===")
lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
print("Accuracy:", lgbm_accuracy)
lgbm_macro_f1 = f1_score(y_test, y_pred_lgbm, average="macro")
print("Macro F1:", lgbm_macro_f1)
print(classification_report(y_test, y_pred_lgbm))

# Persist the fitted model
joblib.dump(lgbm, "models/lgbm_model.pkl")
print("✅ LightGBM model saved to 'models/'")



Saving reply_classification_dataset.csv to reply_classification_dataset (5).csv
Dataset preview:
                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive

Label counts:
label
positive    710
negative    710
neutral     709
Name: count, dtype: int64
Labels: ['neutral' 'positive' 'negative']

=== Logistic Regression Performance ===
Accuracy: 0.9859154929577465
Macro F1: 0.9859310977785474
              precision    recall  f1-score   support

    negative       0.99      0.98      0.99       142
     neutral       0.99      0.99      0.99       142
    positive       0.97      0.99      0.98       142

    accuracy                           0.99       426
   macro avg     



In [7]:
from google.colab import files

# Trigger a browser download for each saved artifact, in the same order:
# vectorizer first, then the two models.
for model_file in (
    "models/tfidf_vectorizer.pkl",
    "models/logreg_model.pkl",
    "models/lgbm_model.pkl",
):
    files.download(model_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
import joblib

# Load saved models from the relative "models/" directory, i.e. the same
# location the training cell writes them to. A hard-coded absolute path only
# matches when the working directory happens to be /content.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced
# yourself or otherwise trust.
tfidf = joblib.load("models/tfidf_vectorizer.pkl")
logreg = joblib.load("models/logreg_model.pkl")
lgbm = joblib.load("models/lgbm_model.pkl")

# Example new reply (a list, because the vectorizer expects an iterable of
# documents rather than a single string)
new_reply = ["Thanks, but we are not interested right now"]

# Transform input with the fitted vectorizer so features match training
X_new = tfidf.transform(new_reply)

# Predict; [0] picks the prediction for the single input document
print("LogReg Prediction:", logreg.predict(X_new)[0])
print("LightGBM Prediction:", lgbm.predict(X_new)[0])


LogReg Prediction: negative
LightGBM Prediction: negative


