In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.models import Word2Vec


In [2]:
# Load the labelled message dataset and preview its first rows.
df = pd.read_csv('mail_data.csv')
preview = df.head()
print(preview)

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
# Total missing cells across every column (per-column counts summed again).
total_nulls = df.isna().sum().sum()
print(f'The number of null values in dataset is {total_nulls}')

The number of null values in dataset is 0


In [4]:
## No null values were found in the dataset

## Data Preprocessing

In [5]:
# Count rows that are exact repeats of an earlier row.
duplicate_count = df.duplicated().sum()
print(f'The number of duplicate values in dataset is {duplicate_count}')

The number of duplicate values in dataset is 415


In [6]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Reassignment is used instead of `inplace=True`: it has no performance
# benefit and hides mutation. Row labels are left as they were (no index
# reset), so label-based lookups of surviving rows keep working.
df = df.drop_duplicates()

# Sanity check: no duplicates should remain.
remaining_duplicates = df.duplicated().sum()
print(f'The number of duplicate values in dataset is {remaining_duplicates}')

The number of duplicate values in dataset is 0


In [7]:
# Per-column count of missing values after de-duplication.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [8]:
# Observe the first few messages in full (looked up by index label 0, 1, 2).
for row_label in (0, 1, 2):
    print(df['Message'][row_label])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Ok lar... Joking wif u oni...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


## Text Preprocessing

In [9]:
## Normalize case: convert every message to lowercase
lowercased = df['Message'].str.lower()
df['Message'] = lowercased
df.head()

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [10]:
# Strip hashtag / mention markers. These are literal characters, so regex
# matching is switched off explicitly.
df['Message'] = df['Message'].str.replace('#', '', regex=False)
df['Message'] = df['Message'].str.replace('@', '', regex=False)

# Remove URLs.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which made the original call a no-op.
# The pattern is also unanchored (no leading `^`) so links anywhere in a
# message are removed, not only ones at the very start. It consumes the URL up
# to the next whitespace and leaves the rest of the message intact.
df['Message'] = df['Message'].str.replace(r'https?://\S+', '', regex=True)

In [11]:
# Remove punctuation: build one translation table that deletes every
# character in string.punctuation, then apply it to each message.
punctuation_table = str.maketrans('', '', string.punctuation)
df['Message'] = df['Message'].str.translate(punctuation_table)

df.head()

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [12]:
# English stop words as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Rebuild each message from the whitespace-separated tokens that are not stop words.
# NOTE(review): punctuation was stripped in the previous step, so stop words
# that contain an apostrophe (e.g. "don't") can no longer match tokens such as
# "dont" - confirm whether that is intended.
df['Message'] = df['Message'].apply(
    lambda message: ' '.join(token for token in message.split() if token not in stop_words)
)

df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [13]:
# Handling chat words: abbreviation (upper-case key) -> expanded phrase.
# NOTE(review): the expansions contain stop words and an apostrophe
# ("i don't know"); when this runs after stop-word and punctuation removal
# those are reintroduced - confirm the intended stage order.
chat_words = {
    "LOL": "laugh out loud",
    "BRB": "be right back",
    "OMG": "oh my god",
    "IDK": "i don't know",
    "BTW": "by the way",
}


def chat_conversion(text):
    """Expand known chat abbreviations in ``text``.

    The text is split on whitespace; each token is upper-cased and looked up
    in ``chat_words``. Matching tokens are replaced by their expansion, all
    others are kept unchanged. Tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Message to process.

    Returns
    -------
    str
        The message with abbreviations expanded.
    """
    return ' '.join(chat_words.get(token.upper(), token) for token in text.split())

# Expand chat abbreviations in every message, one element at a time.
df['Message'] = df['Message'].map(chat_conversion)

df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [23]:
import nltk
import os
import nltk.data

# Keep NLTK resources in an `nltk_data` folder under the current working directory.
nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')

# Register that folder with NLTK's search path once, so re-running the cell
# does not append the same entry again.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the punkt tokenizer only when it cannot be found on the search
# path. This avoids a network round-trip on every run and lets the notebook
# re-run offline once the resource has been fetched.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', download_dir=nltk_data_path)


[nltk_data] Downloading package punkt to
[nltk_data]     c:\Users\deeps\Desktop\Python_Project\Spam-Email-
[nltk_data]     Prediction\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
