## import stuff

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle


## read data

In [2]:
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("validation.csv")
test_df = pd.read_csv("test.csv")

X_train, y_train = train_df['text'], train_df['label']
X_val, y_val = val_df['text'], val_df['label']
X_test, y_test = test_df['text'], test_df['label']

## Convert words to vector

In [3]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

## Functions required

In [4]:
def fit_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

def score_model(model, X, y):
    return model.score(X, y)

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    metrics = {
        "accuracy": accuracy_score(y, y_pred),
        "precision": precision_score(y, y_pred, pos_label=1),
        "recall": recall_score(y, y_pred, pos_label=1),
        "f1_score": f1_score(y, y_pred, pos_label=1)
    }
    return metrics

def validate_model(model, X_train, y_train, X_val, y_val):
    model = fit_model(model, X_train, y_train)
    
    print("Train Scores:")
    train_metrics = evaluate_model(model, X_train, y_train)
    print(train_metrics)

    print("Validation Scores:")
    val_metrics = evaluate_model(model, X_val, y_val)
    print(val_metrics)

    return model


## Running on all the models

In [5]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}


trained_models = {}
for name, model in models.items():
    print("\n---- Score For: "+ str(name)+ "----")
    trained_models[name] = validate_model(model, X_train_vec, y_train, X_val_vec, y_val)


---- Score For: Naive Bayes----
Train Scores:
{'accuracy': 0.9773950484391819, 'precision': 1.0, 'recall': 0.8405063291139241, 'f1_score': 0.9133425034387895}
Validation Scores:
{'accuracy': 0.9676956209619526, 'precision': 1.0, 'recall': 0.7413793103448276, 'f1_score': 0.8514851485148515}

---- Score For: Logistic Regression----
Train Scores:
{'accuracy': 0.9605310369573018, 'precision': 0.9863481228668942, 'recall': 0.7316455696202532, 'f1_score': 0.8401162790697675}
Validation Scores:
{'accuracy': 0.9619526202440776, 'precision': 0.935251798561151, 'recall': 0.7471264367816092, 'f1_score': 0.8306709265175719}

---- Score For: Random Forest----
Train Scores:
{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}
Validation Scores:
{'accuracy': 0.9791816223977028, 'precision': 0.9865771812080537, 'recall': 0.8448275862068966, 'f1_score': 0.9102167182662538}


## Picking the best model

In [6]:
test_scores = {}
for name, model in trained_models.items():
    curr_score = score_model(model, X_test_vec, y_test)
    print(str(name)+"'s Score:", curr_score)
    test_scores[name]=curr_score
    
best_model_name = max(test_scores, key=test_scores.get)
best_model = trained_models[best_model_name]


print("The Best model out of the 3: ", best_model)


Naive Bayes's Score: 0.9713055954088953
Logistic Regression's Score: 0.9655667144906743
Random Forest's Score: 0.9770444763271162
The Best model out of the 3:  RandomForestClassifier()


## Testing on the best model

In [7]:
y_test_pred = best_model.predict(X_test_vec)

names_of_label={0:"Not Spam", 1: "Spam"}

num_samples = 5 
sample_indices = np.random.choice(len(X_test), num_samples, replace=False)

for idx in sample_indices:
    print(f"Text: {X_test.iloc[idx]}")
    print(f"Actual Label: {names_of_label[y_test.iloc[idx]]}, Predicted Label: {names_of_label[y_test_pred[idx]]}")
    print("-" * 80)

Text: Wrong phone ! phone ! answer one assume people n't well
Actual Label: Not Spam, Predicted Label: Not Spam
--------------------------------------------------------------------------------
Text: Haha , first person gon na ask
Actual Label: Not Spam, Predicted Label: Not Spam
--------------------------------------------------------------------------------
Text: come people
Actual Label: Not Spam, Predicted Label: Not Spam
--------------------------------------------------------------------------------
Text: Yup ok ...
Actual Label: Not Spam, Predicted Label: Not Spam
--------------------------------------------------------------------------------
Text: ‘ leave around four , ok ?
Actual Label: Not Spam, Predicted Label: Not Spam
--------------------------------------------------------------------------------
