In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# === STEP 1: Load CSV ===
df = pd.read_csv('/content/support_cases.csv')  # Replace with your CSV path

# Print available columns
print("Available columns:", df.columns.tolist())

# Try to detect the correct column names
text_col = None
label_col = None

# Guess text and label columns (you can hardcode them too if you know)
for col in df.columns:
    if 'text' in col.lower() or 'message' in col.lower() or 'query' in col.lower():
        text_col = col
    if 'label' in col.lower() or 'category' in col.lower() or 'type' in col.lower():
        label_col = col

# Raise errors if not found
if not text_col or not label_col:
    raise ValueError("Could not detect 'text' or 'category' columns. Please check your CSV column names.")

# === STEP 2: Preprocess ===
df = df[[text_col, label_col]].dropna()
df[text_col] = df[text_col].astype(str).str.lower()

# === STEP 3: Split dataset ===
X_train, X_test, y_train, y_test = train_test_split(df[text_col], df[label_col], test_size=0.2, random_state=42)

# === STEP 4: Build Pipeline ===
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# === STEP 5: Train Model ===
pipeline.fit(X_train, y_train)

# === STEP 6: Evaluate Model ===
y_pred = pipeline.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# === STEP 7: Save Model ===
joblib.dump(pipeline, 'support_case_classifier.pkl')

# === STEP 8: Predict New Cases ===
new_cases = [
    "I need help with my subscription payment.",
    "The software crashes when I try to export a file.",
    "Can you tell me your working hours?"
]

preds = pipeline.predict(new_cases)
print("\nPredictions:")
for case, pred in zip(new_cases, preds):
    print(f" - \"{case}\" → {pred}")


Available columns: ['message_length', 'response_time', 'case_type']

Accuracy: 0.3

Classification Report:
               precision    recall  f1-score   support

     billing       0.50      0.09      0.15        11
     general       0.29      1.00      0.45         5
   technical       0.00      0.00      0.00         4

    accuracy                           0.30        20
   macro avg       0.26      0.36      0.20        20
weighted avg       0.35      0.30      0.20        20


Predictions:
 - "I need help with my subscription payment." → general
 - "The software crashes when I try to export a file." → general
 - "Can you tell me your working hours?" → general
