In [115]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 


from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords

In [116]:
# Download the NLTK data packages used by this notebook.
# 'stopwords' backs the stopwords.words('english') lookup in the text cleaning step.
nltk.download('stopwords')
# NOTE(review): the cells visible here tokenize with str.split and never call a
# punkt-based tokenizer — confirm whether this download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 📄 Read CSV in Python using Pandas


In [117]:
# Load the raw dataset and preview its first five rows.
# NOTE(review): the path is relative, so the notebook must be started from the
# directory that contains spam.csv.
df = pd.read_csv("spam.csv")
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [118]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(5572, 5)

In [119]:
# Column labels taken from the CSV header; used to decide which columns to drop.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

#### Now we are going to delete some unnecessary columns

In [120]:
# Remove the three unnamed trailing columns, keeping only v1 and v2.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already absent, so re-running this
# cell no longer raises KeyError once the columns have been dropped.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [121]:
# Give the two remaining columns descriptive names: v1 holds the ham/spam label
# and v2 holds the message body. The rename mutates df in place.
df.rename(columns={'v1' : 'target', 'v2':'text'},inplace=True)
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [122]:
# Count missing values per column.
df.isna().sum()

target    0
text      0
dtype: int64

In [123]:
# Count rows that exactly repeat an earlier row (same target and same text).
df.duplicated().sum()

np.int64(403)

In [124]:
# Keep the first occurrence of every duplicated row and discard the repeats.
# This mutates df in place; the original row index labels are retained.
df.drop_duplicates(keep='first',inplace=True)

In [125]:
# Shape after de-duplication, to confirm how many rows were removed.
df.shape

(5169, 2)

### Data Preprocessing

In [126]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the string class labels in `target` to integer codes.
le = LabelEncoder()

In [127]:
# Replace the string labels with integer codes; in the previews, rows that were
# 'ham' show 0 and rows that were 'spam' show 1.
df['target'] = le.fit_transform(df['target'])
# Class balance after encoding.
df['target'].value_counts()

target
0    4516
1     653
Name: count, dtype: int64

### Feature Engineering

In [128]:
from nltk.stem.porter import PorterStemmer
import string


In [129]:
# Single Porter stemmer instance shared by every call to transformation_text.
ps = PorterStemmer()

In [130]:

def transformation_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. strip every non-alphanumeric character from each token and drop
         tokens that become empty;
      3. drop English stop words (NLTK list);
      4. stem the remaining tokens with the shared Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" when nothing survives).

    Notes
    -----
    Punctuation is stripped *before* the stop-word check, so a contraction
    such as "don't" is compared as "dont" and is kept (see ``dont`` in the
    preview output). NOTE(review): confirm whether that is intended.
    """
    # Build the stop-word set once per call. The original evaluated
    # stopwords.words('english') for every token, re-loading the list and
    # doing a linear search each time.
    stop_words = set(stopwords.words('english'))

    cleaned = []
    for raw_token in text.lower().split():
        # Keep only letters and digits from the token.
        word = ''.join(ch for ch in raw_token if ch.isalnum())
        if word and word not in stop_words:
            cleaned.append(ps.stem(word))

    return " ".join(cleaned)


In [131]:
df['transformed_text'] = df['text'].apply(transformation_text)
df.head()


Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [132]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF vectorizer whose vocabulary is capped at 500 terms via max_features.
tfid = TfidfVectorizer(max_features = 500)

In [133]:
# Vectorize the cleaned messages into a dense feature array and pull out the labels.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed in the next section, so its vocabulary and IDF weights are
# learned from the held-out messages too — consider splitting first and fitting
# on the training portion only.
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values


## Train and Test Split

In [134]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [136]:
# Candidate models, each with fixed hyper-parameters.

# Single estimators
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

# Ensembles: 50 estimators each, seeded with random_state=2
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)


In [137]:
# Display name -> estimator, in the order the models are evaluated and reported.
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    Adaboost=abc,
    Bgc=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

## Model Evaluation


In [138]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator and score it on the held-out data.

    Parameters
    ----------
    clfs : estimator
        A single model exposing ``fit`` and ``predict`` (despite the plural name).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels to score against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed with ``accuracy_score`` and
        ``precision_score`` on the test predictions.
    """
    clfs.fit(X_train, y_train)
    predicted = clfs.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)


In [139]:
# Fit and score every candidate model, collecting the metrics in dictionary order.
accuracy_scores = []
precision_scores = []
# The loop variable is `clf`, not `clfs`: reusing `clfs` rebound the dictionary
# name to the last estimator, so re-running this cell (or using `clfs` later)
# failed because an estimator has no .items().
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9661508704061895
Precision:  0.9401709401709402

For:  KNN
Accuracy:  0.9284332688588007
Precision:  0.9848484848484849

For:  NB
Accuracy:  0.9690522243713733
Precision:  0.9649122807017544

For:  DT
Accuracy:  0.937137330754352
Precision:  0.9010989010989011

For:  LR
Accuracy:  0.9671179883945842
Precision:  0.956140350877193

For:  RF
Accuracy:  0.9700193423597679
Precision:  0.9652173913043478

For:  Adaboost
Accuracy:  0.9245647969052224
Precision:  0.875

For:  Bgc
Accuracy:  0.9593810444874274
Precision:  0.9

For:  ETC
Accuracy:  0.9709864603481625
Precision:  0.9285714285714286

For:  GBDT
Accuracy:  0.9468085106382979
Precision:  0.9368421052631579

For:  xgb
Accuracy:  0.9690522243713733
Precision:  0.9416666666666667
