# THREE TYPES OF MODELS

Logistic Regression (Classification)

K-Means Clustering (Clustering)

Multinomial Naive Bayes (Classification)


The data contains two columns:
    'text' = the message
    'spam' = 1 if spam, 0 if not spam (ham)

## Libraries


In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#importing each model
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, AgglomerativeClustering

# evaluation for each model
from sklearn.metrics import (
    #classification
    accuracy_score, precision_score, recall_score, f1_score,
    #clustering
    silhouette_score,
    v_measure_score)

# plot data
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report


## Helper Functions


### Dataset Loader and Processor

In [54]:
def load_data(filename, encoding='latin-1'):
    """Load the spam dataset from a CSV file and print a short overview.

    The raw 'text' and 'spam' columns are renamed to 'Message' and 'Spam',
    the names the rest of the notebook works with. Each column is renamed on
    its own, so a file that already uses one of the final names still ends up
    with a consistent schema.

    Args:
        filename: Path of the CSV file, e.g. "MergedDataCleaned.csv".
        encoding: Text encoding passed to pandas.read_csv. Defaults to
            'latin-1', the value that used to be hard-coded. Pass 'utf-8'
            for UTF-8 files so non-ASCII characters are not garbled.

    Returns:
        The loaded DataFrame, with the renamed columns.

    Raises:
        KeyError: If the file has neither a 'spam' nor a 'Spam' column; the
            overview below cannot be computed without the label.
    """
    dataset = pd.read_csv(filename, encoding=encoding)

    # Rename the raw column names to clearer ones. More feature columns may
    # be added to the data later; anything not listed here passes through.
    # A rename is skipped when its target already exists, so no duplicate
    # column names can be created.
    wanted_names = {'spam': 'Spam', 'text': 'Message'}
    renames = {
        old: new
        for old, new in wanted_names.items()
        if old in dataset.columns and new not in dataset.columns
    }
    if renames:
        dataset = dataset.rename(columns=renames)

    # Fail early with a descriptive message instead of a bare KeyError raised
    # from the middle of the overview prints.
    if 'Spam' not in dataset.columns:
        raise KeyError(
            f"No 'spam'/'Spam' label column in {filename!r}; "
            f"found columns {list(dataset.columns)}"
        )

    print("Dataset Overview:")

    print(f"Dataset shape: {dataset.shape}")
    print(f"Available features: {list(dataset.columns)}")
    print(f"Spam distribution:\n{dataset['Spam'].value_counts().to_string()}")
    print(f"Spam percentage: {dataset['Spam'].mean():.2%}")

    print("First 5 rows of dataset:")
    print(dataset.head())
    print("\nLast 5 rows of dataset:")
    print(dataset.tail(), '\n')

    return dataset

### Train / Test Splitting
This step takes the dataset returned by `load_data`.

It splits that dataset into training and testing sets.

The split is done with scikit-learn's `train_test_split` function.

In [55]:
def get_train_test_split(data, test_size=0.25, random_state=42, stratify=False):
    """Split the messages and their spam labels into train and test subsets.

    Args:
        data: DataFrame with a 'Message' column (used as the features) and a
            'Spam' column (used as the target).
        test_size: Forwarded to sklearn's train_test_split. Defaults to 0.25,
            the value that used to be hard-coded.
        random_state: Seed forwarded to train_test_split. Defaults to 42, the
            value that used to be hard-coded.
        stratify: When True, the 'Spam' labels are passed as the stratify
            argument of train_test_split. Defaults to False, which reproduces
            the original unstratified split.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).
    """
    X = data['Message']
    y = data['Spam']

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test

### Model pipelines

In [56]:
# Classification model pipelines
def create_logistic_regression_pipeline():
    """Build an unfitted TF-IDF -> logistic regression pipeline.

    Returns:
        A Pipeline with a 'tfidf' step (TfidfVectorizer with English stop
        words) followed by an 'lr' step (LogisticRegression using the
        liblinear solver).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    classifier = LogisticRegression(solver='liblinear')
    return Pipeline(steps=[('tfidf', vectorizer), ('lr', classifier)])


def create_multinomial_naive_bayes_pipeline():
    """Build an unfitted TF-IDF -> multinomial naive Bayes pipeline.

    Returns:
        A Pipeline with a 'tfidf' step (TfidfVectorizer with English stop
        words) followed by an 'mnb' step (MultinomialNB with its defaults).
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    classifier = MultinomialNB()
    return Pipeline(steps=[('tfidf', vectorizer), ('mnb', classifier)])


# Cluster model Pipeline
def create_kmeans_pipeline():
    """Build an unfitted word-count -> K-Means pipeline.

    Returns:
        A Pipeline with a 'count' step (CountVectorizer with English stop
        words) followed by a 'km' step (KMeans with two clusters and
        random_state=42).
    """
    vectorizer = CountVectorizer(stop_words='english')
    clusterer = KMeans(n_clusters=2, random_state=42)
    return Pipeline(steps=[('count', vectorizer), ('km', clusterer)])
    


### Logistic Regression Classification

In [57]:
def train_evaluate_lr(X_train, X_test, y_train, y_test):
    """Fit the logistic regression pipeline and print its test-set metrics.

    Args:
        X_train: Training messages.
        X_test: Test messages.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.

    Returns:
        None. Accuracy, precision, recall and F1 score are printed.
    """
    model = create_logistic_regression_pipeline()

    # Pipeline.fit returns the fitted pipeline, so training and predicting
    # on the test messages can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    print('Logistic Regression Results')
    scorers = (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, predictions))
    # The last metric carries the trailing newline that separates reports.
    print('F1 score:', f1_score(y_test, predictions), '\n')

    # Nothing is returned; return (model, predictions) here if the fitted
    # pipeline is needed later.

### Multinomial Naive Bayes Classification

In [58]:
def train_evaluate_mnb(X_train, X_test, y_train, y_test):
    """Fit the multinomial naive Bayes pipeline and print its test-set metrics.

    Args:
        X_train: Training messages.
        X_test: Test messages.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.

    Returns:
        None. Accuracy, precision, recall and F1 score are printed.
    """
    model = create_multinomial_naive_bayes_pipeline()

    # Pipeline.fit returns the fitted pipeline, so training and predicting
    # on the test messages can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    print('MultinomialNB Results')
    scorers = (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, predictions))
    # The last metric carries the trailing newline that separates reports.
    print('F1 score:', f1_score(y_test, predictions), '\n')

    # Nothing is returned; return (model, predictions) here if the fitted
    # pipeline is needed later.

### K-Means Clustering

In [59]:
def train_evaluate_km(dataset):
    """Cluster all messages with K-Means and print clustering metrics.

    The pipeline is fitted on the whole 'Message' column without labels;
    the 'Spam' column is only used afterwards to score the clusters.

    Args:
        dataset: DataFrame with a 'Message' column (text to cluster) and a
            'Spam' column (ground-truth labels for the V-measure).

    Returns:
        None. The silhouette score and V-measure score are printed.
    """
    messages = dataset['Message']

    pipeline = create_kmeans_pipeline()
    pipeline.fit(messages)

    steps = pipeline.named_steps
    # Cluster assignment of every fitted message, stored on the KMeans step.
    cluster_labels = steps['km'].labels_
    # The silhouette score needs the vectorized messages, so run them
    # through the already-fitted count vectorizer.
    message_vectors = steps['count'].transform(messages)

    print('K-Means Clustering Results')
    print("Silhouette Score:", silhouette_score(message_vectors, cluster_labels))

    # Ground-truth labels the cluster assignments are compared against.
    ground_truth = dataset['Spam']
    print("V-Measure Score:", v_measure_score(ground_truth, cluster_labels), '\n')

    # Nothing is returned; return (pipeline, dataset) here if needed later.






## Final execution

In [60]:
# Load the merged dataset, then split it once so both classifiers are
# trained and scored on the same train/test partition.
data = load_data('MergedDataCleaned.csv')
X_train, X_test, y_train, y_test = get_train_test_split(data)

# Supervised models: logistic regression first, then multinomial naive Bayes.
for evaluate_classifier in (train_evaluate_lr, train_evaluate_mnb):
    evaluate_classifier(X_train, X_test, y_train, y_test)

# The clustering model is given the full dataset rather than the split.
train_evaluate_km(data)



Dataset Overview:
Dataset shape: (31671, 2)
Available features: ['Message', 'Spam']
Spam distribution:
Spam
0    17934
1    13737
Spam percentage: 43.37%
First 5 rows of dataset:
                                             Message  Spam
0  we know you would love to see all these wonder...     1
1  start date hourahead hour no ancillari schedul...     0
2  the deal are all in meter is on deal and meter...     0
3  per your phone messag the gri flag ha been cha...     0
4  o italiano marco carola e gaetano parisio enca...     1

Last 5 rows of dataset:
                                                 Message  Spam
31666  This is the 2nd time we have tried 2 contact u...     1
31667              Will Ã¼ b going to esplanade fr home?     0
31668  Pity, * was in mood for that. So...any other s...     0
31669  The guy did some bitching but I acted like i'd...     0
31670                         Rofl. Its true to its name     0 

Logistic Regression Results
Accuracy: 0.9710785551907047
Preci