In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
import tensorflow as tf

In [None]:
import nltk
# Fetch every NLTK resource the text-cleaning step relies on.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
nltk.download('punkt')      # tokenizer data used by nltk.word_tokenize
nltk.download('stopwords')  # word lists read through nltk.corpus.stopwords
from nltk.stem.porter import PorterStemmer
import string

In [None]:
# Load the raw spam dataset from a path relative to the working directory.
# NOTE(review): no encoding is passed, so the file is decoded with pandas' default (UTF-8) —
# confirm the CSV really is UTF-8 encoded.
df = pd.read_csv("../DATA/spam.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Keep only the two columns used in the rest of the notebook (v1 and v2).
df=df[['v1','v2']]
df.head()

In [None]:
# (rows, columns) of the frame after the column selection.
df.shape

In [None]:
# Give the columns descriptive names: v1 becomes `target`, v2 becomes `text`.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()

In [None]:
# Remove repeated rows, keeping the first occurrence of each, then re-count
# the duplicates to confirm none remain.
df=df.drop_duplicates(keep='first')
df.duplicated().sum()

In [None]:
# Class balance: number of rows per value of `target`.
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Share of each class. The wedge labels come from the counts' own index, so each
# wedge is named after the value it represents rather than relying on the
# ordering value_counts() happens to produce.
target_counts = df['target'].value_counts()
plt.pie(target_counts, labels=target_counts.index, autopct="%0.2f")
plt.show()

In [None]:
#Label encoding
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['target']=encoder.fit_transform(df['target'])
#ham=0,spam=1

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from the raw text of every row whose target is 1 (spam).
spam_text = df.loc[df['target'] == 1, 'text'].str.cat(sep=" ")
wc = WordCloud(background_color='white', min_font_size=10, height=500, width=500)
sarcastic_wc = wc.generate(spam_text)
plt.figure(figsize=(16, 8))
plt.imshow(sarcastic_wc)
print("Most used words in spam messages")

In [None]:
# Word cloud built from the raw text of every row whose target is 0 (non-spam).
ham_text = df.loc[df['target'] == 0, 'text'].str.cat(sep=" ")
wc = WordCloud(background_color='white', min_font_size=10, height=500, width=500)
sarcastic_wc = wc.generate(ham_text)
plt.figure(figsize=(16, 8))
plt.imshow(sarcastic_wc)
print("Most used words in non spam messages")

In [None]:
from nltk.corpus import stopwords

In [None]:
ps=PorterStemmer()
# Built once: the stop-word list used to be re-evaluated for every single token,
# and a frozenset gives constant-time membership tests instead of a list scan.
STOP_WORDS = frozenset(stopwords.words('english'))

def transform_text(text):
    """Normalise one message for modelling.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words, and Porter-stem whatever remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    # isalnum() already rejects punctuation-only tokens, so a separate
    # string.punctuation test would never filter anything further.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in STOP_WORDS]
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
# Clean every message with transform_text and store the result in a new column.
df['transformed_text'] = df['text'].map(transform_text)
df.head()

In [None]:
# The raw text is no longer needed once the cleaned column exists.
df = df.drop(columns='text')

In [None]:
# Preview the frame after the raw text column has been removed.
df.head()

In [None]:
import tensorflow_hub as hub  # supplies hub.load; no earlier cell binds the name `hub`

# Cleaned messages as a plain Python list, in DataFrame row order.
text = df["transformed_text"].tolist()

# Load the Universal Sentence Encoder (v4) from TF Hub.
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Encode every cleaned message with the loaded model.
embeddings = model(text)

# Peek at the first two embeddings.
embeddings[:2]

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,f1_score,recall_score
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [None]:

# Features: the sentence embeddings as a NumPy array. Labels: the encoded target column.
X = np.asarray(embeddings)
y = df["target"].tolist()
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# One instance of each candidate classifier to be compared below.
# random_state is fixed on the five ensemble models.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')  # the l1 penalty needs a solver that supports it
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)

In [None]:
# Short display name -> classifier instance, in the order they will be evaluated.
clfs = dict(
    SVC=svc,
    KN=knc,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
)

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` computed from ``y_test`` and the
        predictions made for ``X_test``.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    # Same four metrics, in the order callers unpack them.
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_test, predicted) for score in metric_fns)

In [None]:
# Fit and score every classifier in `clfs`, echoing each model's metrics and
# collecting them in four parallel lists (one entry per classifier, in dict order).
accuracy_scores, precision_scores, recall, f1_scores = [], [], [], []

for name, clf in clfs.items():

    current_accuracy, current_precision, current_recall, current_f1 = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )

    print("\n\nFor ",name)
    print("Accuracy  - ",current_accuracy)
    print("Precision - ",current_precision)
    print('Recall    - ',current_recall)
    print('F1 Score  - ',current_f1)

    # Append each metric to its matching list.
    for bucket, value in (
        (accuracy_scores, current_accuracy),
        (precision_scores, current_precision),
        (recall, current_recall),
        (f1_scores, current_f1),
    ):
        bucket.append(value)

In [None]:
# One row per classifier with its four metrics, ordered from highest to lowest precision.
performance_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall,
    'F1_score': f1_scores,
})
performance_df = performance_df.sort_values('Precision', ascending=False)
performance_df

In [None]:
# scikit-learn spells this class with a "z"; the "s" spelling is not exported
# by sklearn.model_selection and makes the import fail.
from sklearn.model_selection import RandomizedSearchCV

In [None]:

from sklearn.model_selection import GridSearchCV  # used below but imported by no earlier cell

# 5-fold cross-validated search over k (1..30) and the neighbour weighting scheme,
# scored by accuracy and run on all available cores.
param_grid = {'n_neighbors':range(1,31), 'weights': ['uniform','distance']}
grid_search = GridSearchCV(knc, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# NOTE(review): the search is fitted on the full X / y, which includes the rows
# held out earlier as X_test — confirm that is intended.
grid_search.fit(X, y)

In [None]:
# Report the winning hyperparameter combination and its mean cross-validated score.
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)