In [41]:
!pip install pandas
!pip install nltk



In [42]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

In [43]:
import chardet
with open('Dataset/spam.csv/spam.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']
df = pd.read_csv('Dataset/spam.csv/spam.csv', encoding=encoding)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [44]:
#renaming the necessary columns for processing
df= df[['v2','v1']]
df.rename(columns={'v2':'messages','v1':'labels'},inplace=True)


In [45]:

df.head()

Unnamed: 0,messages,labels
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [46]:
#preprocessing 
df.isnull().sum()

messages    0
labels      0
dtype: int64

In [47]:
nltk.download('stopwords')
STOPWORDS=set(stopwords.words('english'))
def clean_text(text):
    text=text.lower()
    text=re.sub(r'^0-9a-zA-Z', ' ',text)
    text=re.sub(r'\s+',' ',text)
    text=" ".join(word for word in text.split() if word not in STOPWORDS)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
#clean the messages
df['clean_text']=df['messages'].apply(clean_text)
df.head()

Unnamed: 0,messages,labels,clean_text
0,"Go until jurong point, crazy.. Available only ...",ham,"go jurong point, crazy.. available bugis n gre..."
1,Ok lar... Joking wif u oni...,ham,ok lar... joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor... u c already say...
4,"Nah I don't think he goes to usf, he lives aro...",ham,"nah think goes usf, lives around though"


In [49]:
x=df['clean_text']
y=df['labels']

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

def classify(model,x,y):
    x_train,x_test,y_train,y_test= train_test_split(x,y,train_size=0.25,random_state=42,shuffle=True,stratify=y)
    pipeline_model=Pipeline([('vect',CountVectorizer()),
                             ('tfidf',TfidfTransformer()),
                             ('clf',model)])
    pipeline_model.fit(x_train,y_train)
    print('Accuracy:',pipeline_model.score(x_test,y_test)*100)

    y_pred= pipeline_model.predict(x_test)
    print(classification_report(y_test,y_pred))

In [68]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
classify(model,x,y)

Accuracy: 92.22301986121082
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      3619
        spam       0.98      0.43      0.60       560

    accuracy                           0.92      4179
   macro avg       0.95      0.71      0.78      4179
weighted avg       0.93      0.92      0.91      4179



In [69]:
 from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
classify(model,x,y) 

Accuracy: 94.28092845178271
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97      3619
        spam       1.00      0.57      0.73       560

    accuracy                           0.94      4179
   macro avg       0.97      0.79      0.85      4179
weighted avg       0.95      0.94      0.94      4179



In [71]:
from sklearn.svm import SVC
model=SVC()
classify(model,x,y)

Accuracy: 96.4584828906437
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3619
        spam       0.99      0.74      0.85       560

    accuracy                           0.96      4179
   macro avg       0.98      0.87      0.91      4179
weighted avg       0.97      0.96      0.96      4179



In [72]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
classify(model,x,y)

Accuracy: 96.4584828906437
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      3619
        spam       1.00      0.74      0.85       560

    accuracy                           0.96      4179
   macro avg       0.98      0.87      0.91      4179
weighted avg       0.97      0.96      0.96      4179

