# In [None]:
import pandas as pd
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ==========================================
# STEP 1: Load & Prepare Data
# ==========================================
# Relative path: resolved against the directory the notebook/script runs from.
csv_path = "../dataset/filtered_public_utility_complaints.csv"

print(f"Loading dataset from: {csv_path}")
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: File not found at {csv_path}")
    # Re-raise so execution stops here instead of continuing without data.
    raise
else:
    print(f"Dataset loaded successfully. Shape: {df.shape}")

# Define Input and Output
required_columns = ['Consumer complaint narrative', 'category']
if any(column not in df.columns for column in required_columns):
    raise ValueError("Dataset missing required columns: 'Consumer complaint narrative' or 'category'")

# Drop rows that lack a narrative or a label BEFORE casting to str.
# astype(str) would otherwise turn a missing narrative into the literal text
# "nan", which would then be vectorized and trained on as if it were a real
# complaint; a row without a category has no target to learn from.
labelled = df.dropna(subset=required_columns)
dropped_rows = len(df) - len(labelled)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with a missing narrative or category.")
if labelled.empty:
    raise ValueError("No rows with both a narrative and a category remain after dropping missing values.")

X = labelled['Consumer complaint narrative'].astype(str)  # Ensure all are strings
y = labelled['category']

print(f"Input samples: {len(X)}")
print(f"Target distribution:\n{y.value_counts()}")

# ==========================================
# STEP 2: Feature Extraction
# ==========================================
# The raw text is split BEFORE the vectorizer is fitted. Fitting TF-IDF on the
# full corpus would let the held-out documents influence the vocabulary and
# IDF weights (data leakage), which makes the reported accuracy optimistic.
print("\nSplitting data into Train/Test (80/20)...")
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nPerforming TF-IDF Vectorization...")
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# Learn vocabulary/IDF from the training text only ...
X_train = vectorizer.fit_transform(X_train_text)
# ... and reuse that fitted vocabulary for the held-out text.
X_test = vectorizer.transform(X_test_text)
print(f"Vectorization complete. Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ==========================================
# STEP 3: Model Training
# ==========================================
print("Training Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Training complete.")

# ==========================================
# STEP 4: Model Evaluation
# ==========================================
print("\nEvaluating model...")

# Predict labels for the held-out split, then score them against the truth.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy:.4f}")

# ==========================================
# STEP 5: Save Model Files
# ==========================================
# Create the output directory if it is not there yet.
model_dir = "../backend/app/models"
os.makedirs(model_dir, exist_ok=True)

model_path = os.path.join(model_dir, "model.pkl")
vectorizer_path = os.path.join(model_dir, "vectorizer.pkl")

# Each entry: (progress message, destination path, object to pickle).
# The classifier is written first, then the fitted vectorizer.
artifacts = (
    (f"\nSaving model to: {model_path}", model_path, model),
    (f"Saving vectorizer to: {vectorizer_path}", vectorizer_path, vectorizer),
)
for message, target_path, artifact in artifacts:
    print(message)
    with open(target_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nAll tasks completed successfully.")