### Import Libraries

In [None]:
#Import libraries
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string

In [None]:
import os
#list every file found under the input directory, one full path per line
for folder, _, files in os.walk('../input'):
    for entry in files:
        print(os.path.join(folder, entry))

In [None]:
#read the csv file into a dataframe
#dataset source: https://www.kaggle.com/karthickveerakumar/spam-filter/version/1#emails.csv
df = pd.read_csv("../input/spam-filter/emails.csv")

In [None]:
#show the first 10 rows of the dataframe
df.head(10)

In [None]:
#show the dataframe dimensions as (number of rows, number of columns)
df.shape

In [None]:
#print a summary of the dataframe (columns, non-null counts, dtypes)
df.info()

In [None]:
#show one cell of the dataframe: row 100 (by position), first column
#a single .iloc[row, col] lookup is used because chaining [0] on the row Series
#relies on positional integer indexing through [], which pandas has deprecated
print(df.iloc[100, 0])


In [None]:
df.isnull().sum() #counts the null values in each column; this is good practice so that missing values can be handled during data preparation

In [None]:
#string.punctuation is a pre-defined string constant that contains all the ASCII punctuation characters
punct= string.punctuation

In [None]:
punct

In [None]:
#nltk.download('stopwords')
#the stopwords corpus from the Natural Language Toolkit (NLTK) contains stop-word lists for many major languages

In [None]:
#the sample messages in this notebook are written in English (and the e-mail dataset
#is presumably English too - confirm), so the English stop-word list is the matching one;
#the Italian list would leave common English words such as "is" or "the" in place
stop_words = set(stopwords.words("english"))

In [None]:
stop_words

### Clean the DataFrame

In [None]:
#now we remove all punctuations and stop words
df

In [None]:
def remove_punct(text: str, chars: str = string.punctuation) -> str:
    """Return ``text`` with every punctuation character removed.

    Args:
        text: The string to clean.
        chars: Characters to strip out. Defaults to ``string.punctuation``,
            so one-argument calls remove the standard ASCII punctuation.

    Returns:
        A new string made of the characters of ``text`` that are not in
        ``chars``, in their original order.
    """
    # A deletion table lets str.translate drop all unwanted characters in a
    # single pass instead of testing and re-joining them one by one.
    return text.translate(str.maketrans("", "", chars))
    

In [None]:
#apply remove_punct to each value of the 'text' column and keep the result in df1
df1 = df['text'].apply(remove_punct)

In [None]:
df1

In [None]:
#now we remove all stop words
def remove_stop(text, stops=None):
    """Split ``text`` on whitespace and drop the stop words.

    Args:
        text: The string to tokenize.
        stops: Collection of lower-case stop words to discard. When omitted,
            the notebook-level ``stop_words`` set is used, so existing
            one-argument calls behave as before.

    Returns:
        A list of the remaining tokens, in order and with their original
        casing; only the comparison against the stop words is lower-cased.
    """
    if stops is None:
        stops = stop_words
    return [word for word in text.split() if word.lower() not in stops]

In [None]:
#apply remove_stop to each punctuation-free text; every element of df2 is a list of tokens
df2 = df1.apply(remove_stop)

In [None]:
df2 #at this point we have a list of tokens to analyze

In [None]:
def remove_everything(text):
    """Clean ``text`` in two steps: strip punctuation, then drop stop words.

    Returns whatever ``remove_stop`` produces for the punctuation-free text.
    """
    without_punct = remove_punct(text)
    tokens = remove_stop(without_punct)
    return tokens

#this is definitely not the best coding - it is just here to show the progression

In [None]:
df['text'].apply(remove_everything)

### Example of transforming a string into a count vector

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
#three short sample messages used to demonstrate the vectorizer
message = 'KEA is a university, /////located in Nørrebro I love burger burger'
message2 = "are great or burger, burger"
message3 = 'hey i love berries and burger love burger a lot burger'
#print the first sample message
print(message)

In [None]:
vectorizer = CountVectorizer(analyzer=remove_everything)
#create a CountVectorizer that uses remove_everything as its analyzer,
#i.e. as the function that turns each message into a list of tokens


In [None]:
vect_fit_trans = vectorizer.fit_transform([message,message2,message3]) 
#fit_transform learns the vocabulary from the list of messages and returns their token-count matrix

In [None]:
vectorizer.get_feature_names_out()
#shows all the features (tokens) extracted, in the vectorizer's sorted vocabulary order
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it

In [None]:
print(vect_fit_trans)
#the result is a sparse matrix; each printed entry has the form (message index, feature index) count
#this is the bag-of-words representation

### Transform the entire df

In [None]:
transformvc = CountVectorizer(analyzer=remove_everything)

In [None]:
#learn the vocabulary from every e-mail text and build the token-count matrix for the whole dataset
FiTrannsformvc = transformvc.fit_transform(df['text'])

In [None]:
#list the vocabulary learned from the whole dataset
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it
transformvc.get_feature_names_out()


### Split the Dataset

In [None]:
#now we split the dataset 80% for training and 20% for test
from sklearn.model_selection import train_test_split


In [None]:
#features are the token counts, labels are the 'spam' column; a fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(FiTrannsformvc, df['spam'], test_size=0.20, random_state=75)

### Train the model

In [None]:
from sklearn.naive_bayes import MultinomialNB
#create a multinomial Naive Bayes classifier and fit it on the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# I used Naive Bayes, as it is one of the most popular algorithms for spam detection
# https://www.quora.com/What-are-the-popular-ML-algorithms-for-email-spam-detection

In [None]:
X_train.shape #check the training shape; train & test must have the same number of columns (features) - we fit_transform before the split, so both come from the same matrix

In [None]:
X_test.shape

In [None]:
print(classifier.predict(X_train))
print(y_train.values)

In [None]:
#evaluate the model on the training data (the same rows it was fitted on)
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
train_report = classification_report(y_train, pred)
train_matrix = confusion_matrix(y_train, pred)
train_accuracy = accuracy_score(y_train, pred)
print(train_report)
print('Confusion Matrix: \n', train_matrix)
print()
print('Accuracy: ', train_accuracy)

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt

In [None]:
#Print the predictions
print('Predicted value: ',classifier.predict(X_test))

#Print Actual Label
print('Actual value: ',y_test.values)

In [None]:
#evaluate the model on the held-out test data set
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
test_report = classification_report(y_test, pred)
test_matrix = confusion_matrix(y_test, pred)
test_accuracy = accuracy_score(y_test, pred)
print(test_report)
print('Confusion Matrix: \n', test_matrix)
print()
print('Accuracy: ', test_accuracy)

In [None]:
#only the last expression of a notebook cell is displayed, so print both values explicitly
print(type(X_test))
print(X_test.shape)


### Try on a personal email

In [None]:
realmsg = '''
INSERT A MESSAGE HERE
'''
print(realmsg)

In [None]:
#vectorize the new message with the already-fitted vectorizer (transform only, so the learned vocabulary is reused)
realvector = transformvc.transform([realmsg])

In [None]:
#only the last expression of a notebook cell is displayed, so print both values explicitly
print(type(realvector))
print(realvector.shape)


In [None]:
#predict_proba returns one row per message with one probability per class;
#predict returns the predicted label itself
confidence = classifier.predict_proba(realvector)
prediction = classifier.predict(realvector)


In [None]:
print(confidence[0,1])

In [None]:
#confidence[0,1] is the probability of the second class for the message
#(presumably spam == 1 - confirm with classifier.classes_)
spam_probability = confidence[0,1]
if spam_probability > 0.8:
    print("This e-mail is Spam")
elif spam_probability > 0.5:
    #covers everything above 0.5 up to and including 0.8; the former "< 0.8" test
    #let a probability of exactly 0.8 fall through to the "legit" branch
    print("This e-mail seems to be Spam, check it")
else:
    print("This e-mail is legit")

In [None]:
print(prediction)