In [126]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # used to convert a collection of text documents to a matrix of token counts.
from sklearn.model_selection import train_test_split, GridSearchCV, KFold # splitting data, hyperparameter tuning & cross-validation respectively.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE # handling class imbalance by oversampling the minority class.
from sklearn.ensemble import RandomForestClassifier

In [127]:
# import pip
# pip.main(['install', '--user', 'imbalanced-learn'])

In [128]:
data = pd.read_csv("spam.csv", encoding='ISO-8859-1')
data

Unnamed: 0,Label,EmailText,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [129]:
# let seperate data into dependent and independent data set
x = data["EmailText"] # features
y = data["Label"] # target

### *CountVectorizer*

In [130]:
from sklearn.feature_extraction.text import CountVectorizer

cvec_0 = CountVectorizer()

L = ["Mother", "Father", "Father is kind like Mother", "Farmer", "My father is a good friend of that farmer"]
cx_0 = cvec_0.fit_transform(L)
cx_0.toarray()

# Row ---> Sentence
# Column ---> Word
# Short each word by alphabetic order
# like 'a' 'the' article words will remove
# last row ---> farmer, father, freind, good, is, 0(kind), 0(like), 0(mother), my, of, that

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1]], dtype=int64)

### Creating a matrix with frequencies of email texts

In [131]:
cvec = CountVectorizer()
cx = cvec.fit_transform(x) # cx is a matrix

In [132]:
cx.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [133]:
cx.shape # sentences = 5572, words = 8672

(5572, 8672)

In [134]:
y.value_counts() # spams are very low, not good for model traning(classification, prediction bias for majority class use SMOTE), 
                 # class imbalance are there

ham     4825
spam     747
Name: Label, dtype: int64

### SMOTE for class imbalance

In [135]:
smt = SMOTE()
x_sm, y_sm = smt.fit_resample(cx, y) # (matrix, y data frame)
print(y_sm.value_counts()) # Now classes are balanced

ham     4825
spam    4825
Name: Label, dtype: int64


In [136]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm, test_size=0.2, random_state=0)

# *Let do with Support Vector Machine with cross validation*
### *Grid search & KFold for identifying best hyperparameters*

In [137]:
params_svc  = {"kernel" : ["rbf", "linear"]}
cv = KFold(n_splits=5)

# KFold is a cross-validator that provides train/test indices to split data into n_splits folds.
# n_splits=5 means that the data will be split into 5 folds.
# In each iteration of the cross-validation, 4 folds will be used for training, and 1 fold will be used for testing. 
# This process repeats 5 times, with each fold being used once as the test fold.

In [138]:
model = GridSearchCV(SVC(), params_svc, cv=cv) # cv=cv specifies that K-Fold cross-validation should be used to evaluate each combination 
                                               # of parameters.

In [139]:
model.fit(x_train, y_train)
print(model.best_params_)

{'kernel': 'linear'}


### *model with best hyperparameter for SVC*

In [140]:
bmodel = SVC(kernel="linear")
bmodel.fit(x_train, y_train)
y_pred_bmodel = bmodel.predict(x_test)
print(y_pred_bmodel)
print(accuracy_score(y_test, y_pred_bmodel))

['ham' 'spam' 'spam' ... 'spam' 'spam' 'spam']
0.9518134715025907


In [141]:
emails = ["Hey you won a car !!!. Congratzz",
          "Dear applicant, your CV has been recieved. Best regards",
          "You have received $1000000 to your account",
          "Join with WhatsApp group",
          "Kindly check the previous email. Kind regard"]

yp = bmodel.predict(cvec.transform(emails))
yp # more accurate than RFC
# s, h, s, s, h

array(['spam', 'ham', 'spam', 'spam', 'spam'], dtype=object)

### *Done with Random Forest Classifier model with cross validation*

In [118]:
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up K-Fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform grid search with cross-validation
rfc = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Print the best parameters found during the grid search
print("Best parameters found:", grid_search.best_params_)

# Train the Random Forest model with the best parameters
best_rfc = grid_search.best_estimator_
best_rfc.fit(x_train, y_train)

# Predict on the test set
y_pred = best_rfc.predict(x_test)

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy)

Best parameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Random Forest Accuracy: 0.9621761658031088


### Done with Random Forest Classifier model without cross validation

In [142]:
emails = ["Hey you won a car !!!. Congratzz",
          "Dear applicant, your CV has been recieved. Best regards",
          "You have received $1000000 to your account",
          "Join with WhatsApp group",
          "Kindly check the previous email. Kind regard"]

yp = best_rfc.predict(cvec.transform(emails))
yp # s, h, s, s, h

array(['spam', 'ham', 'spam', 'spam', 'spam'], dtype=object)

In [143]:
rfc_model = RandomForestClassifier(n_estimators = 500)

### Model training

In [144]:
rfc_model.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500)

### Prediction and accuracy of the model

In [145]:
y_pred = rfc_model.predict(x_test)

In [146]:
y_pred

array(['ham', 'spam', 'spam', ..., 'spam', 'spam', 'spam'], dtype=object)

In [147]:
y_test

1070     ham
4488     ham
8763    spam
7372    spam
7633    spam
        ... 
212      ham
4546     ham
6411    spam
7916    spam
6712    spam
Name: Label, Length: 1930, dtype: object

In [148]:
accuracy_score(y_test, y_pred)

0.9569948186528497

### Let check model with new samples

In [149]:
emails = ["Hey you won a car !!!. Congratzz",
          "Dear applicant, your CV has been recieved. Best regards",
          "You have received $1000000 to your account",
          "Join with WhatsApp group",
          "Kindly check the previous email. Kind regard"]

In [150]:
yp = rfc_model.predict(cvec.transform(emails))

In [151]:
yp # s, h, s, s, h

array(['spam', 'ham', 'spam', 'spam', 'spam'], dtype=object)