

Spam dataset:
https://www.kaggle.com/uciml/sms-spam-collection-dataset




###Loading the data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/My Drive/Data/spam.csv'
spam = pd.read_csv(file_path, encoding='ISO-8859-1')

In [None]:
spam

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


###Removing the null valued columns

In [None]:
nan_counts = spam.isnull().sum()
for column_name, nan_count in nan_counts.items():
    print(f"The number of NaN values in '{column_name}' is: {nan_count}")

The number of NaN values in 'v1' is: 0
The number of NaN values in 'v2' is: 0
The number of NaN values in 'Unnamed: 2' is: 5522
The number of NaN values in 'Unnamed: 3' is: 5560
The number of NaN values in 'Unnamed: 4' is: 5566


In [None]:
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
spam = spam.drop(columns=columns_to_drop)

In [None]:
new_column_names = {'v1': 'label', 'v2': 'message'}

spam = spam.rename(columns=new_column_names)

In [None]:
spam

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


###Labelling the Y Column

In [None]:
spam['label'] = spam['label'].map({'ham': 0, 'spam': 1})


In [None]:
spam

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### Cleaning the text column

In [None]:
def clean_text(text):

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

spam['message'] = spam['message'].apply(clean_text)

In [None]:
spam

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,1,this is the nd time we have tried contact u u...
5568,0,will b going to esplanade fr home
5569,0,pity was in mood for that soany other suggest...
5570,0,the guy did some bitching but i acted like id ...


In [None]:
spam['raw_word_count'] = spam['message'].str.split().apply(len)

In [None]:
label_counts = spam['label'].value_counts()
print(label_counts)


0    4825
1     747
Name: label, dtype: int64


###Spliting the test and train

In [None]:
X = spam.drop('label', axis=1)
y = spam['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=68)


In [None]:
X_train

Unnamed: 0,message,raw_word_count
4225,ok thats cool its just off either raglan rd o...,24
1622,u have a secret admirer who is looking make c...,22
2071,good night my dear sleepwellamptake care,6
4821,u r a winner u ave been specially selected re...,23
4,nah i dont think he goes to usf he lives aroun...,13
...,...,...
2284,velly good yes please,4
4298,hurt me tease me make me cry but in the end of...,35
2980,wonders in my world th you th ur style th ur ...,29
5543,u still havent got urself a jacket ah,8


###Tokenization and Stemming

In [None]:
stemmer = PorterStemmer()

def preprocess_text(text):

    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stopwords.words('english')]
    stemmed = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed)

X_train['processed_message'] = X_train['message'].apply(preprocess_text)
X_test['processed_message'] = X_test['message'].apply(preprocess_text)


In [None]:
X_train

Unnamed: 0,message,raw_word_count,processed_message
4225,ok thats cool its just off either raglan rd o...,24,ok that cool either raglan rd edward rd behind...
1622,u have a secret admirer who is looking make c...,22,u secret admir look make contact ufind rreveal...
2071,good night my dear sleepwellamptake care,6,good night dear sleepwellamptak care
4821,u r a winner u ave been specially selected re...,23,u r winner u ave special select receiv cash ho...
4,nah i dont think he goes to usf he lives aroun...,13,nah dont think goe usf live around though
...,...,...,...
2284,velly good yes please,4,velli good ye pleas
4298,hurt me tease me make me cry but in the end of...,35,hurt teas make cri end life die plz keep one r...
2980,wonders in my world th you th ur style th ur ...,29,wonder world th th ur style th ur smile th ur ...
5543,u still havent got urself a jacket ah,8,u still havent got urself jacket ah


In [None]:
X_train = X_train.drop('message', axis=1)
X_test = X_test.drop('message', axis=1)

In [None]:
X_train['new_word_count'] = X_train['processed_message'].str.split().apply(len)

In [None]:
X_test['new_word_count'] = X_test['processed_message'].str.split().apply(len)

In [None]:
X_train

Unnamed: 0,raw_word_count,processed_message,new_word_count
4225,24,ok that cool either raglan rd edward rd behind...,17
1622,22,u secret admir look make contact ufind rreveal...,11
2071,6,good night dear sleepwellamptak care,5
4821,23,u r winner u ave special select receiv cash ho...,17
4,13,nah dont think goe usf live around though,8
...,...,...,...
2284,4,velli good ye pleas,4
4298,35,hurt teas make cri end life die plz keep one r...,19
2980,29,wonder world th th ur style th ur smile th ur ...,25
5543,8,u still havent got urself jacket ah,7


###Vectorization

In [None]:
vectorizer = TfidfVectorizer()

Xtrain_tfidf = vectorizer.fit_transform(X_train['processed_message'])
Xtest_tfidf = vectorizer.transform(X_test['processed_message'])

Xtrain_tfidf_df = pd.DataFrame(Xtrain_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
Xtest_tfidf_df = pd.DataFrame(Xtest_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

Xtrain_tfidf_df['raw_word_count'] = X_train['raw_word_count'].values
Xtest_tfidf_df['raw_word_count'] = X_test['raw_word_count'].values

Xtrain_tfidf_df['new_word_count'] = X_train['new_word_count'].values
Xtest_tfidf_df['new_word_count'] = X_test['new_word_count'].values


In [None]:
Xtrain_tfidf_df

Unnamed: 0,aa,aah,aaniy,aaooooright,aathilov,aathiwher,ab,abdomen,abeg,aberdeen,...,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zs,zyada,raw_word_count,new_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,17
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22,11
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,17
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4
4453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35,19
4454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,25
4455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,7


In [None]:
Xtest_tfidf_df

Unnamed: 0,aa,aah,aaniy,aaooooright,aathilov,aathiwher,ab,abdomen,abeg,aberdeen,...,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zs,zyada,raw_word_count,new_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,14
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,7
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9,4
1111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,6
1112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32,23
1113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,19


In [None]:
Xtrain_tfidf_df['stop_words_removed'] = Xtrain_tfidf_df['raw_word_count'] - Xtrain_tfidf_df['new_word_count']

Xtest_tfidf_df['stop_words_removed'] = Xtest_tfidf_df['raw_word_count'] - Xtest_tfidf_df['new_word_count']

###Balance the data

In [None]:
smote = SMOTE(random_state=42)

X_train_balanced, y_train_balanced = smote.fit_resample(Xtrain_tfidf_df, y_train)

In [None]:
X_train_balanced

Unnamed: 0,aa,aah,aaniy,aaooooright,aathilov,aathiwher,ab,abdomen,abeg,aberdeen,...,zindgi,zoe,zogtoriu,zoom,zouk,zs,zyada,raw_word_count,new_word_count,stop_words_removed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,17,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22,11,11
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,5,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,17,6
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,16,3
7688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31,18,12
7689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23,15,8
7690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,13,8


In [None]:
y_train_balanced

0       0
1       1
2       0
3       1
4       0
       ..
7687    1
7688    1
7689    1
7690    1
7691    1
Name: label, Length: 7692, dtype: int64

###Scaling the data using Min Max Scaler

In [None]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train_balanced)

X_test_scaled = scaler.transform(Xtest_tfidf_df)

In [None]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train_balanced.index, columns=X_train_balanced.columns)

X_test_scaled_df = pd.DataFrame(X_test_scaled, index=Xtest_tfidf_df.index, columns=Xtest_tfidf_df.columns)

In [None]:
X_train_scaled_df

Unnamed: 0,aa,aah,aaniy,aaooooright,aathilov,aathiwher,ab,abdomen,abeg,aberdeen,...,zindgi,zoe,zogtoriu,zoom,zouk,zs,zyada,raw_word_count,new_word_count,stop_words_removed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.2125,0.086957
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128655,0.1375,0.130435
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035088,0.0625,0.021739
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.2125,0.076087
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076023,0.1000,0.065217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.2000,0.043478
7688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181287,0.2250,0.141304
7689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.1875,0.097826
7690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122807,0.1625,0.097826


In [None]:
X_test_scaled_df

Unnamed: 0,aa,aah,aaniy,aaooooright,aathilov,aathiwher,ab,abdomen,abeg,aberdeen,...,zindgi,zoe,zogtoriu,zoom,zouk,zs,zyada,raw_word_count,new_word_count,stop_words_removed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029240,0.0500,0.021739
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.1750,0.119565
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0875,0.032609
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076023,0.1000,0.065217
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.1875,0.108696
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0500,0.065217
1111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040936,0.0750,0.021739
1112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187135,0.2875,0.108696
1113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.2375,0.054348


### SVM before feature selection

In [None]:
svm_classifier = SVC(kernel='linear')

svm_classifier.fit(X_train_scaled_df, y_train_balanced)

y_pred = svm_classifier.predict(X_test_scaled_df)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

[[973   6]
 [ 15 121]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       979
           1       0.95      0.89      0.92       136

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.9811659192825112


### Feature selection using Variance Threshold

In [None]:
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train_scaled_df)

# Transform both the training and test datasets
X_train_selected = sel.transform(X_train_scaled_df)
X_test_selected = sel.transform(X_test_scaled_df)

# Convert the arrays back to dataframes
selected_columns = X_train_scaled_df.columns[sel.get_support()]

X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_columns)
X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_columns)


In [None]:
X_train_selected_df

Unnamed: 0,appli,award,code,contact,expir,free,guarante,identifi,landlin,mobil,...,ppm,repli,servic,show,tc,text,tsc,txt,unredeem,urgent
0,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1,0.0,0.00000,0.0,0.62697,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
2,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
3,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
4,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687,0.0,0.33798,0.0,0.00000,0.0,0.212001,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.377518,0.0,0.0
7688,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.363025,0.0,0.000000,0.0,0.0
7689,0.0,0.00000,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
7690,0.0,0.00000,0.0,0.00000,0.0,0.244652,0.0,0.0,0.0,0.0,...,0.0,0.415811,0.492971,0.0,0.0,0.010512,0.0,0.000000,0.0,0.0


In [None]:
X_test_selected_df

Unnamed: 0,appli,award,code,contact,expir,free,guarante,identifi,landlin,mobil,...,ppm,repli,servic,show,tc,text,tsc,txt,unredeem,urgent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.520351,0.0,0.0,0.000000,0.0,0.445592,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379562,...,0.0,0.000000,0.548204,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.745924,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
1113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272690,...,0.0,0.000000,0.000000,0.0,0.0,0.262323,0.0,0.000000,0.0,0.0


### Model after Variance threshold feature selection

In [None]:
svm_classifier = SVC(kernel='linear')

svm_classifier.fit(X_train_selected_df, y_train_balanced)

y_pred1 = svm_classifier.predict(X_test_selected_df)

print(confusion_matrix(y_test, y_pred1))
print(classification_report(y_test, y_pred1))
print("Accuracy:", accuracy_score(y_test, y_pred1))

[[920  59]
 [ 27 109]]
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       979
           1       0.65      0.80      0.72       136

    accuracy                           0.92      1115
   macro avg       0.81      0.87      0.84      1115
weighted avg       0.93      0.92      0.93      1115

Accuracy: 0.9228699551569507


### Feature selection using Chi-square

In [None]:
kbest = SelectKBest(chi2, k=500)


X_train_kbest = kbest.fit_transform(X_train_scaled_df, y_train_balanced)

# Transform the test data to select the same features as for the training data
X_test_kbest = kbest.transform(X_test_scaled_df)


selected_features = kbest.get_support(indices=True)
X_train_kbest_df = pd.DataFrame(X_train_kbest, columns=[X_train_scaled_df.columns[i] for i in selected_features])
X_test_kbest_df = pd.DataFrame(X_test_kbest, columns=[X_train_scaled_df.columns[i] for i in selected_features])


In [None]:
X_train_kbest_df

Unnamed: 0,ac,account,action,actual,admir,advis,ae,age,aight,alert,...,xma,xpwk,xxxxxxxxx,yeah,yer,your,yr,zed,raw_word_count,new_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.2125
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128655,0.1375
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035088,0.0625
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.2125
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076023,0.1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.2000
7688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181287,0.2250
7689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.1875
7690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122807,0.1625


In [None]:
X_test_kbest_df

Unnamed: 0,ac,account,action,actual,admir,advis,ae,age,aight,alert,...,xma,xpwk,xxxxxxxxx,yeah,yer,your,yr,zed,raw_word_count,new_word_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029240,0.0500
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.1750
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0875
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076023,0.1000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140351,0.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0500
1111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040936,0.0750
1112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187135,0.2875
1113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134503,0.2375


###Model after Chi-squared test feature selection

In [None]:
svm_classifier = SVC(kernel='linear')

svm_classifier.fit(X_train_kbest_df, y_train_balanced)

y_pred2 = svm_classifier.predict(X_test_kbest_df)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print("Accuracy:", accuracy_score(y_test, y_pred2))

[[963  16]
 [ 11 125]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       979
           1       0.89      0.92      0.90       136

    accuracy                           0.98      1115
   macro avg       0.94      0.95      0.94      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy: 0.9757847533632287


###Conclusion

Model without any feature selection yeilded highest accuracy of 98.1 followed by model after Chisquared feature selection (97.5) and then followed by model after variance selection feature selection (92.2)