In [None]:

# Install dependencies
!pip install sentence-transformers scikit-learn pandas

# Import libraries
import pandas as pd
import re
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

# Load data
# The path is relative to the kernel's working directory. A forward slash is
# used as the separator because it is accepted on Windows and POSIX alike,
# whereas a backslash is only a separator on Windows.
df = pd.read_csv('data/realistic_complaints_dataset_9990_cleaned.csv')

# Clean text function
def clean_text(text):
    """Normalize a raw complaint string before it is embedded.

    Steps, in order: lower-case the text; delete URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace); delete runs
    of digits; delete every character listed in ``string.punctuation``;
    collapse each whitespace run to one space and trim both ends.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    normalized = text.lower()
    # URLs are handled first, while their punctuation is still present to
    # hold the token together; digit runs follow. Both are deleted outright.
    for pattern in (r"http\S+|www\S+", r"\d+"):
        normalized = re.sub(pattern, "", normalized)
    punctuation_table = str.maketrans("", "", string.punctuation)
    normalized = normalized.translate(punctuation_table)
    return re.sub(r"\s+", " ", normalized).strip()

# Add a `cleaned` column holding clean_text() applied to every complaint_text value.
df['cleaned'] = df['complaint_text'].apply(clean_text)

# Encode using Sentence Transformer
print("Encoding sentences...")
model_name = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(model_name)
# The cleaned texts are passed as a plain Python list.
# convert_to_tensor=False: presumably so the embeddings come back in a form
# scikit-learn accepts directly — confirm against the sentence-transformers docs.
X = embedder.encode(df['cleaned'].tolist(), convert_to_tensor=False)
y = df['category']

# Train/test split
# 20% of the rows are held out; stratify=y keeps the class proportions the same
# in both parts, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train classifier
print("Training Logistic Regression...")
# class_weight='balanced' re-weights classes inversely to their frequency.
classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
classifier.fit(X_train, y_train)

# Evaluate
preds = classifier.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, preds))
# NOTE(review): the recorded output shows 1.00 precision/recall/f1 for every
# class on 2000 test rows. Scores this perfect often mean the test rows are not
# independent of the training rows (e.g. duplicated or templated complaint
# texts on both sides of the split) — verify before trusting the number.

# Save model and embedder
# Both objects are written to the current working directory. The classifier
# file name says "bert", but what is stored is the LogisticRegression fitted
# above on the sentence embeddings.
joblib.dump(classifier, "bert_complaint_classifier.pkl")
# As the last expression of the cell, this call's return value is what the
# notebook renders as the cell output.
joblib.dump(embedder, "sentence_embedder.pkl")



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Encoding sentences...
Training Logistic Regression...

Classification Report:
                  precision    recall  f1-score   support

   Account Issue       1.00      1.00      1.00       200
         App Bug       1.00      1.00      1.00       200
Customer Support       1.00      1.00      1.00       200
  Delivery Delay       1.00      1.00      1.00       200
   Login Trouble       1.00      1.00      1.00       200
   Payment Issue       1.00      1.00      1.00       200
  Product Damage       1.00      1.00      1.00       200
   Service Issue       1.00      1.00      1.00       200
 Technical Issue       1.00      1.00      1.00       200
      Wrong Item       1.00      1.00      1.00       200

        accuracy                           1.00      2000
       macro avg       1.00      1.00      1.00      2000
    weighted avg       1.00      1.00      1.00      2000



['sentence_embedder.pkl']

In [6]:
import pandas as pd
# Load data
# The path is relative to the kernel's working directory, replacing a
# hard-coded absolute path into one user's home folder that cannot resolve on
# any other machine.
# NOTE(review): assumes the kernel is started from the project folder that
# contains `data/` — confirm.
# NOTE(review): this rebinds `df` to a different CSV than the one the training
# cell reads — confirm that is intended.
df = pd.read_csv('data/ecommerce_complaints_10000_cleaned.csv')
# Only the last expression of a cell is rendered, so the column index needs an
# explicit display() to be shown at all.
display(df.columns)
df["category"].unique()

array(['Technical Issue', 'Payment Issue', 'Account Issue',
       'Delivery Delay', 'Service Issue', 'Product Damage',
       'Login Trouble', 'App Bug', 'Wrong Item', 'Customer Support'],
      dtype=object)

In [None]:
import joblib

# Load the trained classifier and sentence transformer
# Both files are read from the current working directory, under the same names
# the training cell passes to joblib.dump.
# joblib.load unpickles its input, which can execute arbitrary code — only load
# files you produced yourself or otherwise trust.
model = joblib.load("bert_complaint_classifier.pkl")
embedder = joblib.load("sentence_embedder.pkl")

# Clean text function (must match what you used during training)
import re
import string

def clean_text(text):
    """Normalize a complaint string exactly as is done before training.

    Steps, in order: lower-case the text; delete URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace); delete runs
    of digits; delete every character listed in ``string.punctuation``;
    collapse each whitespace run to one space and trim both ends.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    # The patterns are raw strings, so the regex classes take a SINGLE
    # backslash (\S, \d, \s). A doubled backslash would instead match a literal
    # "\" followed by the letter, leaving URLs, digits and extra whitespace in
    # place and making inference inputs differ from the training inputs.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Example: predict a new complaint
# NOTE(review): the sample below is an empty string, so the printed label is
# simply what the classifier returns for blank input — put a real complaint
# here to get a meaningful prediction.
new_text = [""]
# Preprocess, embed, then classify — one step per line, batch in / batch out.
cleaned = list(map(clean_text, new_text))
embedded = embedder.encode(cleaned)
prediction = model.predict(embedded)

# Only the label of the first (and here only) input is reported.
print("Predicted category:", prediction[0])


Predicted category: Payment Issue
