# Importing the Dependencies

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

# Importing Dataset

In [36]:
raw_data = pd.read_csv("./data/mail_data.csv")
raw_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [37]:
# checking the number of rows and columns in the dataframe
raw_data.shape

(5572, 2)

In [38]:
# Check for any empty categories:
raw_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [39]:
raw_data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

The data is clearly IMBALANCED.

In [40]:
raw_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


So this contains a non numerical column "Category" which we need to convert to numerical values.

In [41]:
encoder = LabelEncoder()
raw_data['Category_encoded'] = encoder.fit_transform(raw_data['Category'])

raw_data.head()

Unnamed: 0,Category,Message,Category_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [42]:
# Remove the Original Category Column
raw_data.drop('Category', axis=1, inplace=True)

In [43]:
raw_data.head()

Unnamed: 0,Message,Category_encoded
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [44]:
# Seperate into X and y
X = raw_data['Message']
y = raw_data['Category_encoded']

Since we have Message as a string, we need to convert it into a numerical representation. We will use CountVectorizer for this purpose.

In [45]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_data['Message'])
print(X)

  (0, 3567)	1
  (0, 8080)	1
  (0, 4370)	1
  (0, 5954)	1
  (0, 2334)	1
  (0, 1313)	1
  (0, 5567)	1
  (0, 4110)	1
  (0, 1763)	1
  (0, 3651)	1
  (0, 8544)	1
  (0, 4497)	1
  (0, 1761)	1
  (0, 2057)	1
  (0, 7690)	1
  (0, 3611)	1
  (0, 1079)	1
  (0, 8320)	1
  (1, 5534)	1
  (1, 4533)	1
  (1, 4338)	1
  (1, 8446)	1
  (1, 5563)	1
  (2, 4110)	1
  (2, 3369)	1
  :	:
  (5570, 4241)	1
  (5570, 8367)	1
  (5570, 1094)	1
  (5570, 4638)	1
  (5570, 7085)	1
  (5570, 3319)	1
  (5570, 7670)	1
  (5570, 1447)	1
  (5570, 5363)	1
  (5570, 2602)	1
  (5570, 8116)	1
  (5570, 1790)	1
  (5570, 7095)	1
  (5570, 2901)	1
  (5570, 3485)	1
  (5570, 1798)	1
  (5570, 3705)	1
  (5570, 4184)	1
  (5570, 911)	1
  (5570, 1557)	1
  (5571, 7802)	1
  (5571, 5272)	1
  (5571, 4249)	2
  (5571, 7934)	1
  (5571, 6544)	1


# Pre-Processing Function:

In [107]:
def preprocess_input(input_text):
    # Transform the input text using the fitted vectorizer
    input_vectorized = vectorizer.transform(input_text)
    return input_vectorized

# Split into train and test sets

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=42)
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572, 8709)
(4457, 8709)
(1115, 8709)


# Train the model

In [54]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model

In [97]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score

def evaluate_classification(model, X, y, cv=5, scoring='accuracy'):
    """
    Function to evaluate classification model performance using cross-validation.
    
    Arguments:
    model -- scikit-learn classifier object.
    X -- numpy array or DataFrame, features.
    y -- numpy array or Series, labels.
    cv -- int, number of cross-validation folds (default: 5).
    scoring -- string, evaluation metric to be used (default: 'accuracy').
    
    Returns:
    A dictionary containing the evaluation metrics:
    - Accuracy (cross-validation and overall)
    - Precision (cross-validation and overall)
    - Recall (cross-validation and overall)
    - F1-Score (cross-validation and overall)
    - ROC AUC (cross-validation and overall)
    """
    
    metrics = {}
    
    # Perform cross-validation and calculate evaluation metrics for each fold
    metrics['Accuracy_CV'] = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    metrics['Precision_CV'] = cross_val_score(model, X, y, cv=cv, scoring='precision')
    metrics['Recall_CV'] = cross_val_score(model, X, y, cv=cv, scoring='recall')
    metrics['F1-Score_CV'] = cross_val_score(model, X, y, cv=cv, scoring='f1')
    metrics['ROC AUC_CV'] = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
    
    # Calculate the mean of evaluation metrics from cross-validation
    for metric, values in metrics.items():
        metrics[metric] = values.mean()
    
    # Calculate evaluation metrics without cross-validation
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]
    metrics['Accuracy'] = accuracy_score(y, y_pred)
    metrics['Precision'] = precision_score(y, y_pred)
    metrics['Recall'] = recall_score(y, y_pred)
    metrics['F1-Score'] = f1_score(y, y_pred)
    metrics['ROC AUC'] = roc_auc_score(y, y_prob)
    
    return metrics


In [98]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

In [99]:
evaluate_classification(model, X_test, y_test)

{'Accuracy_CV': 0.9686098654708521,
 'Precision_CV': 0.9675244755244755,
 'Recall_CV': 0.7914942528735632,
 'F1-Score_CV': 0.8699790997016305,
 'ROC AUC_CV': 0.9798367927835665,
 'Accuracy': 0.9865470852017937,
 'Precision': 1.0,
 'Recall': 0.8993288590604027,
 'F1-Score': 0.9469964664310955,
 'ROC AUC': 0.9895855044673253}

# Checking on Fresh Data

In [110]:
def predict_spam(input_text):
    # Preprocess the input text
    input_vectorized = preprocess_input(input_text)
    
    # Make predictions using the trained model
    predictions = model.predict(input_vectorized)
    
    # Decode the predictions back to the original categories
    predicted_categories = encoder.inverse_transform(predictions)
    
    return predicted_categories

In [111]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise I won't take your help for granted and will fulfill my promise. You have been wonderful and a blessing at all times"]

prediction = predict_spam(input_mail)
print(prediction)

['ham']
