In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline




In [2]:
from wordcloud import WordCloud
import nltk 
from nltk.corpus import stopwords


In [3]:
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/georgian70/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/georgian70/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv('spam.csv')

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [7]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)

In [9]:
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## DATA PREPROCESSING

In [10]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.duplicated().sum()
len(df)

5572

In [12]:
df = df.drop_duplicates(keep='first')
len(df)

5169

## FEATURE ENGINEERING

In [13]:
from nltk.stem.porter import PorterStemmer
import string
ps = PorterStemmer()

In [14]:
from nltk.tokenize import word_tokenize
def transform_text(text):
  text = text.lower()

  text = word_tokenize(text)
  res = []
  for str in text:
    if str.isalnum():
      res.append(str)

  text = res[:]
  res.clear()

  for str in text:
    if str not in stopwords.words('english') and str not in string.punctuation:
      res.append(str)
  
  text = res[:]
  res.clear()

  for str in text:
    res.append(ps.stem(str))
  print(res)

  return " ".join(res)



In [15]:
df['transform_text'] = df['text'].apply(transform_text)

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']
['ok', 'lar', 'joke', 'wif', 'u', 'oni']
['free', 'entri', '2', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', 'text', 'fa', '87121', 'receiv', 'entri', 'question', 'std', 'txt', 'rate', 'c', 'appli', '08452810075over18']
['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say']
['nah', 'think', 'goe', 'usf', 'live', 'around', 'though']
['freemsg', 'hey', 'darl', '3', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chg', 'send', 'rcv']
['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent']
['per', 'request', 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', '9', 'copi', 'friend', 'callertun']
['winner', 'valu', 'network', 'custom', 'select', 'receivea', 'prize', 'reward', 'claim', 'call', 'claim', 'code', 'kl341', 'valid', '12', 'hour']
['mobil', '11',

In [16]:
df.head()

Unnamed: 0,target,text,transform_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features=500)  

In [18]:
x = tfid.fit_transform(df['transform_text']).toarray()
y=df['target'].values

## Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

## Model Training

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
svc=SVC(kernel='sigmoid', gamma=1.0)
knc=KNeighborsClassifier()
mnb=MultinomialNB()
dtc=DecisionTreeClassifier(max_depth=5)
lrc=LogisticRegression(solver='liblinear', penalty='l1')
rfc=RandomForestClassifier(n_estimators=50, random_state=2)
abc=AdaBoostClassifier(n_estimators=50, random_state=2)
bc=BaggingClassifier(n_estimators=50, random_state=2)
etc=ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt=GradientBoostingClassifier(n_estimators=50, random_state=2)

In [22]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
}

## MODEL EVALUATION

In [23]:
from sklearn.metrics import accuracy_score, precision_score

def train_model(clfs, x_train, x_test, y_train, y_test):
  clfs.fit(x_train, y_train)
  y_pred=clfs.predict(x_test)
  current_accuracy=accuracy_score(y_pred, y_test)
  current_precision=precision_score(y_pred, y_test)
  return current_accuracy, current_precision

In [24]:
accuracy_scores = []
precision_scores = []
for name, clfs in clfs.items():
  accuracy, precision = train_model(clfs,x_train,x_test,y_train,y_test)
  print(f"\nModel: {name}\tAccuracy: {accuracy}\tPrecision: {precision}\n")
  accuracy_scores.append(accuracy)
  precision_scores.append(precision)




Model: SVC	Accuracy: 0.9671179883945842	Precision: 0.8115942028985508


Model: KNN	Accuracy: 0.9274661508704062	Precision: 0.45652173913043476


Model: NB	Accuracy: 0.9709864603481625	Precision: 0.8115942028985508


Model: DT	Accuracy: 0.9390715667311412	Precision: 0.6014492753623188


Model: LR	Accuracy: 0.9632495164410058	Precision: 0.7536231884057971


Model: RF	Accuracy: 0.9700193423597679	Precision: 0.8260869565217391


Model: Adaboost	Accuracy: 0.9235976789168279	Precision: 0.5


Model: Bgc	Accuracy: 0.9622823984526112	Precision: 0.8043478260869565


Model: ETC	Accuracy: 0.9709864603481625	Precision: 0.855072463768116


Model: GBDT	Accuracy: 0.9497098646034816	Precision: 0.6739130434782609

