In [1]:
# General Libraries
import os
import pandas as pd
import joblib

# Machine Learning Libraries
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Read the three pre-computed splits from the local split directory
data_dir = "./data_splits"  # folder the train/validation/test CSVs are read from
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_dir, split_file))
    for split_file in ("train.csv", "validation.csv", "test.csv")
)

# Sanity check: peek at the first training rows and report the size of each split
print(train_df.head())
print(f"\n✅ Data Loaded: Train ({len(train_df)}), Val ({len(val_df)}), Test ({len(test_df)})")

   label                                            message
0      0                                          guy close
1      0  please come imin towndontmatter urgoin outlrju...
2      0                          ok ksry knw sivatats askd
3      0                                ill see prolly yeah
4      0        ill see swing bit got thing take care firsg

✅ Data Loaded: Train (4457), Val (557), Test (558)


In [2]:
# Replace missing messages with empty strings before vectorizing
for _split_df in (train_df, val_df, test_df):
    _split_df['message'] = _split_df['message'].fillna("")
del _split_df  # keep the loop variable out of the notebook namespace

# ====== 3. Initialize and Save Vectorizer ======
# Targets are the label column of each split
y_train, y_val, y_test = train_df['label'], val_df['label'], test_df['label']

# The vectorizer is fitted on the training split only; validation and test
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_df['message'])
X_val, X_test = (
    vectorizer.transform(frame['message']) for frame in (val_df, test_df)
)

# Persist the fitted vectorizer as its own artifact
os.makedirs("models", exist_ok=True)
joblib.dump(vectorizer, "models/tfidf_vectorizer.joblib")
print("✅ TF-IDF Vectorizer saved")

✅ TF-IDF Vectorizer saved


In [3]:
# ====== 4. Model Training ======
def train_models(X_train, y_train, X_val, y_val, models=None):
    """Fit candidate classifiers and return the one with the best validation accuracy.

    Args:
        X_train: Training feature matrix passed to each model's ``fit``.
        y_train: Training labels.
        X_val: Validation feature matrix passed to each model's ``predict``.
        y_val: Validation labels the predictions are scored against.
        models: Optional mapping of display name -> unfitted estimator. When
            omitted, Naive Bayes, Logistic Regression and a linear SVM are used.

    Returns:
        A ``(best_model, best_name)`` tuple: the fitted estimator with the
        highest validation accuracy and its display name. On a tie the model
        that was evaluated first is kept.

    Raises:
        ValueError: If ``models`` is an empty mapping.
    """
    if models is None:
        models = {
            "Naive Bayes": MultinomialNB(),
            "Logistic Regression": LogisticRegression(max_iter=1000),
            # random_state pins the internal shuffling used for the
            # probability estimates so the saved model is reproducible.
            "SVM": SVC(kernel='linear', probability=True, random_state=42),
        }
    if not models:
        raise ValueError("train_models requires at least one candidate model")

    best_score = -1
    best_model = None
    best_name = None  # initialised alongside the other trackers so it is always bound

    for name, model in models.items():
        model.fit(X_train, y_train)
        val_acc = accuracy_score(y_val, model.predict(X_val))
        print(f"{name}: Val Accuracy = {val_acc:.4f}")

        # Strict '>' keeps the earlier candidate when accuracies are equal
        if val_acc > best_score:
            best_score = val_acc
            best_model = model
            best_name = name

    print(f"\n🏆 Best Model: {best_name} (Accuracy: {best_score:.4f})")
    return best_model, best_name

# Fit every candidate and keep the fitted winner together with its display name
best_model, best_name = train_models(X_train, y_train, X_val, y_val)

Naive Bayes: Val Accuracy = 0.9659
Logistic Regression: Val Accuracy = 0.9605
SVM: Val Accuracy = 0.9838

🏆 Best Model: SVM (Accuracy: 0.9838)


In [4]:
# ====== 5. Hyperparameter Tuning ======
# One search grid per candidate name, so whichever model won the validation
# comparison gets tuned rather than only the SVM.
param_grids = {
    "SVM": {'C': [0.1, 1, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10]},
    "Naive Bayes": {'alpha': [0.1, 0.5, 1.0]},
}

param_grid = param_grids.get(best_name)
if param_grid is None:
    # Make the skip explicit instead of silently doing nothing
    print(f"No hyperparameter grid defined for {best_name}; keeping the untuned model")
else:
    # 5-fold cross-validated search on the training split only
    grid_search = GridSearchCV(best_model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"Best params: {grid_search.best_params_}")

Best params: {'C': 10}


In [5]:
# ====== 6. Evaluation ======
# Score the selected model on the test split
test_preds = best_model.predict(X_test)
report_text = classification_report(y_test, test_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print("\n📊 Final Test Performance:")
print(report_text)
print(f"Test Accuracy: {test_accuracy:.4f}")



📊 Final Test Performance:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       483
           1       0.89      0.89      0.89        75

    accuracy                           0.97       558
   macro avg       0.94      0.94      0.94       558
weighted avg       0.97      0.97      0.97       558

Test Accuracy: 0.9713


In [6]:
# ====== 7. Save Model ======
# Keep the artifact path in one place so the dump target and the confirmation
# message cannot drift apart.
model_path = "models/best_model.joblib"
# Create the output directory here so this cell does not depend on an
# earlier cell having created it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
print(f"✅ Model saved at: {model_path}")


✅ Model saved at: models/best_model.joblib


In [None]:
# ====== 8. Verification ======
print("\n🔍 Verifying saved files:")
# os.listdir returns entries in arbitrary order; sort for a stable listing
saved_files = sorted(os.listdir("models"))
print(saved_files)

# Fail loudly if either artifact written by this notebook is missing
expected_files = {"best_model.joblib", "tfidf_vectorizer.joblib"}
missing_files = expected_files.difference(saved_files)
assert not missing_files, f"Missing model artifacts: {sorted(missing_files)}"


🔍 Verifying saved files:
['best_model.joblib', 'tfidf_vectorizer.joblib']


: 