# Model Building

## Import Libraries 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw SMS dataset from the project-relative Data/ folder.
# The file is read as ISO-8859-1 (presumably because it is not valid UTF-8 — TODO confirm).
df = pd.read_csv('Data/spam.csv',encoding = "ISO-8859-1")
# Show the first rows as a quick sanity check of the raw columns.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder

# Build the cleaned frame as one non-mutating chain instead of a series of
# `inplace=True` calls. `errors='ignore'` on the drop (and the fact that
# `rename` silently skips missing keys) makes this cell safe to re-run:
# previously a second execution raised KeyError because the 'Unnamed: *'
# columns had already been removed from `df`.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .rename(columns={'v1': 'Target', 'v2': 'Text'})
      .drop_duplicates(keep='first')
)

# Encode the string labels as integers (LabelEncoder assigns codes in sorted
# label order).
enc = LabelEncoder()
df['Target'] = enc.fit_transform(df['Target'])

# Simple length features: number of characters, word tokens and sentences
# per message. The tokenizers need the NLTK tokenizer data to be installed.
df['Charachters'] = df['Text'].apply(len)
df['Words'] = df['Text'].apply(lambda x: len(nltk.word_tokenize(x)))
df['Sentences'] = df['Text'].apply(lambda x: len(nltk.sent_tokenize(x)))

# Stemmer instance shared by `preprocess` below.
ps = PorterStemmer()
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess(text):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps: lowercase -> NLTK word tokenisation -> keep only alphanumeric
    tokens -> drop English stop words -> Porter-stem each remaining token.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" if nothing survives).
    """
    # The stop-word list used to be re-read (and linearly searched) for every
    # single token; it is now loaded once and looked up in O(1).
    stop_words = _english_stop_words()

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() cannot consist of punctuation characters,
    # so the former `not in string.punctuation` test could never reject
    # anything and has been dropped.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(stems)



# Run every raw message through `preprocess` and keep the cleaned,
# stemmed text in its own column for vectorisation.
df['Processed'] = df['Text'].map(preprocess)



## Convert text in Processed column into vectors

## Finding out whether TF-IDF or Bag of Words is better

#### As we know, Naive Bayes often gives the best results (or, let's say, it is the preferred choice) for textual data

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation, capped at the 3000 highest-ranked terms.
tf = TfidfVectorizer(
    max_features=3000,
)

# Plain bag-of-words counts with default settings (no vocabulary cap).
cv = CountVectorizer()

In [5]:
# Vectorise the processed text both ways: X1 holds TF-IDF weights, X2 holds raw term counts.
# .toarray() converts the sparse results to dense arrays (presumably because some of the
# models evaluated later, e.g. GaussianNB, need dense input — TODO confirm).
# NOTE(review): both vectorizers are fitted on the full corpus here, before the
# train/test split further down, so test-set vocabulary / IDF statistics leak
# into the features. Consider fitting on the training rows only.
X1 = tf.fit_transform(df['Processed']).toarray()
X2 = cv.fit_transform(df['Processed']).toarray()

In [6]:
# Label vector: the 'Target' column of df, shared by both feature matrices.
y = df['Target']

In [7]:
from sklearn.model_selection import train_test_split

# Split both feature matrices and the labels in ONE call. train_test_split
# applies the same shuffled indices to every array it is given, so the TF-IDF
# rows, the count rows and the labels are guaranteed to stay aligned.
# Previously two separate calls each assigned y_train / y_test, and alignment
# relied on both calls happening to produce the same permutation.
(X1_train, X1_test,
 X2_train, X2_test,
 y_train, y_test) = train_test_split(X1, X2, y, test_size=0.2, random_state=42)

## Check which algorithms perform the best

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,precision_score

# Single seed shared by every estimator that accepts one, so that a
# Restart & Run All reproduces the comparison tables below.
random_state = 42

model1 = DecisionTreeClassifier(random_state=random_state,max_depth=5)
# The AdaBoost wrapper needs its own seed: without it the ensemble was the one
# seedable model left unseeded, even though its base tree had a seed.
model2 = AdaBoostClassifier(
    DecisionTreeClassifier(criterion="entropy",max_depth=5,random_state=random_state),
    random_state=random_state,
)
model3 = RandomForestClassifier(n_estimators=50,random_state=random_state)
model4 = GradientBoostingClassifier(random_state=random_state)
# max_iter raised well above the default (presumably to avoid lbfgs convergence warnings).
model5 = LogisticRegression(random_state=random_state, solver='lbfgs', max_iter=10000)
model6 = XGBClassifier(random_state = random_state)
model7 = SVC(random_state=random_state)
model8 = BaggingClassifier(n_estimators=50,random_state=random_state)
# The naive Bayes variants are constructed with defaults; no seed is passed.
model9 = GaussianNB()
model10 = BernoulliNB()
model11 = MultinomialNB()
model12 = ExtraTreesClassifier(n_estimators=50,random_state=random_state)

def make_classification(X_train,X_test,y_train,y_test):
    """Fit every candidate model on one feature set and rank them.

    Each of the module-level prototypes ``model1`` .. ``model12`` is cloned,
    the clone is fitted on the training data and scored on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Training / test feature matrices.
    y_train, y_test : array-like
        Training / test labels.

    Returns
    -------
    classifiers : list
        The fitted estimators, in the same order as the prototypes
        (DT, ADB, RF, GBC, LR, XGB, SVC, Bagging, GNB, BNB, MNB, ETC).
    results : pandas.DataFrame
        One row per model with 'Accuracy Score' and 'Precision Score'
        (both in percent) and 'Models' (short name), sorted by precision,
        then accuracy, descending.
    """
    # Local import so this cell does not depend on another cell for `clone`.
    from sklearn.base import clone

    # Keep each short name next to its estimator so the labels in the result
    # table cannot drift out of step with the model order.
    named_models = [
        ('DT', model1),
        ('ADB', model2),
        ('RF', model3),
        ('GBC', model4),
        ('LR', model5),
        ('XGB', model6),
        ('SVC', model7),
        ('Bagging', model8),
        ('GNB', model9),
        ('BNB', model10),
        ('MNB', model11),
        ('ETC', model12),
    ]

    classifiers = []
    rows = []
    for name, prototype in named_models:
        # Fit a clone, not the shared prototype: otherwise a later call with a
        # different feature matrix would refit (and silently overwrite) the
        # estimators returned by an earlier call.
        clf = clone(prototype)
        clf.fit(X_train, y_train)
        y_preds = clf.predict(X_test)

        classifiers.append(clf)
        rows.append({
            'Accuracy Score': accuracy_score(y_test, y_preds) * 100,
            'Precision Score': precision_score(y_test, y_preds) * 100,
            'Models': name,
        })

    results_df = pd.DataFrame(rows, columns=['Accuracy Score', 'Precision Score', 'Models'])

    # Precision is the primary sort key, accuracy breaks ties.
    results = (results_df.sort_values(by=['Precision Score','Accuracy Score'], ascending=False)
               .reset_index(drop=True))

    return classifiers,results

In [11]:
# Evaluate all candidate models on the TF-IDF features (X1) and show the ranked table.
classifiers, results = make_classification(X1_train, X1_test, y_train, y_test)
results 

Unnamed: 0,Accuracy Score,Precision Score,Models
0,97.485493,99.173554,SVC
1,97.29207,99.159664,MNB
2,97.678917,97.637795,ETC
3,97.195358,97.540984,RF
4,96.22824,96.491228,GBC
5,97.29207,96.062992,BNB
6,95.841393,94.736842,LR
7,96.615087,94.354839,ADB
8,96.905222,92.481203,XGB
9,96.22824,88.405797,Bagging


In [12]:
# Same evaluation on the count-vector features (X2).
# Note: this rebinds `classifiers` and `results`, replacing the TF-IDF run's values.
classifiers, results = make_classification(X2_train, X2_test, y_train, y_test)
results

Unnamed: 0,Accuracy Score,Precision Score,Models
0,96.711799,100.0,RF
1,97.485493,99.173554,ETC
2,97.098646,98.319328,SVC
3,97.775629,97.65625,LR
4,96.615087,96.610169,BNB
5,92.45648,93.506494,DT
6,96.905222,92.481203,XGB
7,95.16441,89.915966,GBC
8,95.647969,89.0625,Bagging
9,97.388781,88.815789,MNB


## Let's see if a voting classifier can give a better combined result

In [18]:
# Fresh, unfitted base estimators for the ensembles below: the three models
# with the highest precision on the count-vector features.
# probability=True lets SVC produce the class probabilities soft voting needs.
svc = SVC(probability=True, random_state=42)
RF = RandomForestClassifier(random_state=42, n_estimators=50)
etc = ExtraTreesClassifier(random_state=42, n_estimators=50)

In [19]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the SVC, random forest and extra-trees estimators,
# trained on the count-vector features.
voting = VotingClassifier(
    voting='soft',
    estimators=[
        ('svm', svc),
        ('RF', RF),
        ('et', etc),
    ],
)
voting.fit(X2_train, y_train)

# Score the ensemble on the held-out count-vector rows.
y_pred = voting.predict(X2_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred))

Accuracy 0.9787234042553191
Precision 1.0


## Let's try stacking

In [20]:
from sklearn.ensemble import StackingClassifier

# The original cell referenced `mnb`, which no visible cell defines, so it
# raised NameError on a fresh kernel. A new MultinomialNB (imported with the
# other naive Bayes classes) is used as the naive Bayes base learner instead.
# The final estimator is seeded so that repeated runs give the same scores.
clf = StackingClassifier(
    estimators=[('svm', svc), ('nb', MultinomialNB()), ('et', etc)],
    final_estimator=RandomForestClassifier(random_state=42),
)

clf.fit(X2_train,y_train)
y_pred = clf.predict(X2_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

Accuracy 0.9825918762088974
Precision 0.9568345323741008


In [21]:
import pickle


def _save_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle even if dumping fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


# Persist the fitted count vectorizer together with the models trained on its
# output, so new text can be transformed the same way at inference time.
_save_pickle(cv, 'Countvectorizer.pkl')
_save_pickle(voting, 'Votingmodel.pkl')

# The module-level `RF` object is never fitted: VotingClassifier and
# StackingClassifier fit clones of the estimators they are given. Pickling
# `RF` directly therefore saved an unfitted model. Save the fitted random
# forest that lives inside the voting ensemble instead (same hyper-parameters,
# trained on the count-vector training rows).
_save_pickle(voting.named_estimators_['RF'], 'RFmodel.pkl')