# Spam Detection Using Naive Bayes Classification

## Imports

In [None]:
# import required libraries
import matplotlib.pyplot as plt # plotting
import nltk # for natural language processing tasks
import numpy as np # numerical computing library
import pandas as pd # for data visualization, preprocessing and wrangling
import seaborn as sns # for graphing and visualization

%matplotlib inline

In [None]:
# import the sklearn functions and classes needed for this notebook
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels

## Load Data

In [None]:
# Read the SMS spam collection into a DataFrame.
# The data was obtained from Kaggle.
# Kaggle dataset link: https://www.kaggle.com/uciml/sms-spam-collection-dataset
data = pd.read_csv('data/spam.csv')

# Encode the label column as integers: 1 where the label is 'spam', 0 otherwise.
data['target'] = np.where(data['target'].eq('spam'), 1, 0)
print('No of rows:', data.shape[0])
data.head(10)

## Peek into Spam Messages and Non-spam Messages

In [None]:
# Show 10 randomly chosen rows whose target is 0 (non-spam).
data.loc[data['target'] == 0].sample(n=10)

In [None]:
# Show 10 randomly chosen rows whose target is 1 (spam).
data.loc[data['target'] == 1].sample(n=10)

## Split Data into Training and Test Set

In [None]:
# Hold out part of the messages for evaluation. No test_size is passed, so
# train_test_split uses its default proportion; random_state=0 makes the
# shuffle reproducible between runs.
messages, labels = data['text'], data['target']
X_train, X_test, Y_train, Y_test = train_test_split(messages, labels, random_state=0)
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

## Feature Extraction

In [None]:
# Extract bag-of-words features: counts of unigrams and bigrams.
# The vocabulary is learned from the training messages only, and the same
# fitted vectorizer is reused later to transform unseen text.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X_train)

# Read the shape straight from the sparse matrix. Calling .toarray() first
# would allocate a dense (n_messages x n_ngrams) array only to measure it.
X_train_vectorized.shape

## Model Creation

In [None]:
# Build a multinomial Naive Bayes classifier (alpha=0.1 is its smoothing
# parameter) and train it on the vectorized training messages.
model = MultinomialNB(alpha=0.1)
model.fit(X=X_train_vectorized, y=Y_train)

## Predictions on Test Dataset

In [None]:
# Vectorize the held-out messages with the vocabulary fitted on the training
# set, then let the trained model label each of them.
X_test_vectorized = vectorizer.transform(X_test)
predictions = model.predict(X_test_vectorized)

## Evaluation

### Accuracy

In [None]:
# Accuracy on the test set: percentage of predictions that equal the true label.
n_correct = (predictions == Y_test).sum()
print("Accuracy:", 100 * n_correct / len(predictions), '%')

## See Performance on Real-Life Examples

In [None]:
# Recent official emails, classified with the trained model
# (per the encoding used for the target column, 1 = spam and 0 = not spam).
official_emails = [
    "Dear ABC, Thank you very much for sharing these files with us. I will request your help if need be. Regards, XYZ",
    "Hi ABC, Thanks for putting together the note on admin code. It’s great first draft! It covers many important aspects I wanted to have a good understanding about!",
    "Dear ABC, Thanks for the kind reply. The paper seems very interesting, will discuss more when we meet. Referring to our previous conversation, it would be great to know about your work on mapping the administrative units. Furthermore, we would appreciate knowing about the depth of administrative level mapping and the methodology. I have copied ABC DEF (GHI) in our team who is working on a similar task. ABC can reflect on the technical perspectives as the conversation progress. We are looking forward to hearing from you and mutually benefit from the data if your convenience permits. Best regards, XYZ",
    "Hi ABC and DEF, A gentle reminder that we're very interested in hearing more about your comparisons between the HRSL dataset, WorldPop and other similar datasets. We're planning on doing a small desk review specific to Nepal when we have time and would appreciate the opportunity to start from where you all left off. We are of course more than happy to keep any unpublished research findings you share internal to the World Bank, and share back the results of our review.",
    "Dear all – here’s a room for us if you need it tomorrow morning. Best regards – XYZ",
]
model.predict(vectorizer.transform(official_emails))

In [None]:
# Recent personal emails, classified with the trained model
# (per the encoding used for the target column, 1 = spam and 0 = not spam).
personal_emails = [
    "Thank you, ABC. Can you also share your updated GitHub and LinkedIn profile? It helps to have personal/college projects in GitHub with proper documentation. As you are a fresher, employers would be willing to see your personal/college projects. Also, share a competitive programming profile if any.",
    "XYZ, That would be great! Please do let me know — I am an avid learner as you know and I LOVE attending lecture series and learning new things. Very much appreciate your offer. Thank you!! See you very soon indeed. -DEF-",
    "Hi ABC, I wish I was in Kathmandu so that we could have in-person discussion. However, will you be available for hangout call sometime next week? Let me know of your availability. We can talk more about your interest and future plans and discuss the options. -XYZ",
    "Hi y’all, Making quick introductions between python + QGIS Atlas lovers in Kathmandu. ABC, XYZ is looking at your code now and seems pretty comfortable with it. I told him he can write you with any questions — hope that’s OK. I’ll buy you some momos by way of thanks. Best, DEF",
    "Heyyy hiiiiii... Long time... Remember me? 😊😂 How are you? How's it going there...? What are you upto? :)",
]
model.predict(vectorizer.transform(personal_emails))

In [None]:
# Recent spam messages received in my official mail, classified with the
# trained model (per the target encoding, 1 = spam and 0 = not spam).
spam_samples = [
    "get free discount in plane tickets",
    "free recharge card offer",
    "girls are waiting to chat with you",
    "1-month unlimited calls offer Activate now",
    "congratulation, you became today's lucky winner",
    "Jelie wants your phone number",
]
model.predict(vectorizer.transform(spam_samples))

In [None]:
# Contrast cases: messages worded similarly to the spam samples, to see how
# the trained model labels them.
contrast_messages = [
    "Jelie wants your email",
    "can you please share your phone number?",
]
model.predict(vectorizer.transform(contrast_messages))

In [None]:
# Scratch cell with nothing to run; a stray bare identifier here would raise
# NameError when the notebook is executed top to bottom.
pass