#Importing the dependencies

In [174]:
import numpy as mp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [175]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# loaded by the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [176]:
# Show the English stopword list; these are the words that stemming() filters
# out of every message.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#Data Collection and Pre-Processing

Loading the data from the csv file to a pandas dataframe

In [177]:
# Load the labelled messages into a DataFrame.
# NOTE(review): '/content/mail_data.csv' is an absolute path (presumably a Colab
# working directory) — adjust it when running the notebook elsewhere.
raw_mail_data = pd.read_csv('/content/mail_data.csv')

Printing first 5 data points as a sample

In [178]:
# Preview the first rows of the raw data (Category and Message columns).
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking the dimensions of dataset

In [161]:
# (number of rows, number of columns) of the raw data.
raw_mail_data.shape

(5572, 2)

Replacing the null values with an empty string

In [179]:
# Keep every non-null cell as it is and put an empty string wherever the raw
# frame holds a null value; the result is a new frame, raw_mail_data is untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), other='')

Label Encoding

- Labelling spam mail as 0
- Labelling ham mail as 1

In [180]:
# Overwrite the text labels in place with their numeric codes:
# 'spam' becomes 0 first, then 'ham' becomes 1.
for label, code in (('spam', 0), ('ham', 1)):
  mail_data.loc[mail_data['Category'] == label, 'Category'] = code

Checking whether labels are correctly encoded or not

In [128]:
# After label encoding, Category should hold 0 (spam) / 1 (ham) instead of text.
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Separating the data into text and labels

In [181]:
# X: the message texts (model input); Y: the encoded labels (model target).
X, Y = mail_data['Message'], mail_data['Category']

In [182]:
# Inspect the message texts selected from the Message column.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [183]:
# Inspect the encoded labels selected from the Category column (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Stemming:

Stemming is the process of reducing a word to its Root Word

In [184]:
# Single PorterStemmer instance; stemming() uses it to stem every word.
port_stem = PorterStemmer()

In [185]:

# Built once: evaluating stopwords.words('english') inside the per-word filter
# repeats the lookup for every single word and tests membership against a list.
# A frozenset gives the same filtering result with constant-time membership.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(content):
  """Clean a message and reduce its words to their Porter stems.

  Every character that is not an ASCII letter is replaced by a space, the text
  is lower-cased and split on whitespace, English stopwords are dropped, and
  each remaining word is stemmed with ``port_stem``.

  Args:
    content: The message text (a string).

  Returns:
    The stemmed words joined by single spaces (an empty string when no word
    is left after filtering).
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  words = letters_only.lower().split()
  stemmed_words = [port_stem.stem(word) for word in words if word not in ENGLISH_STOPWORDS]
  return ' '.join(stemmed_words)

In [186]:
# Stem from the untouched Message column rather than from X itself, so that
# re-running this cell does not stem already-stemmed text a second time.
X = mail_data['Message'].apply(stemming)

In [187]:
# Preview the messages after stemming() has been applied.
X.head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: Message, dtype: object

#Splitting data into train data and test data

In [188]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

Checking how the data is split

In [189]:
# Sizes of the full data, the training split and the test split, in that order.
print(X.shape,X_train.shape,X_test.shape)

(5572,) (4457,) (1115,)


#Transforming the text data to feature vectors that can be used as input to the logistic regression

In [200]:
# TF-IDF vectorizer; min_df=1 is passed explicitly as the minimum document
# frequency for a term to be kept.
feature_extraction = TfidfVectorizer(min_df=1)
# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages, so the test split never
# influences the learned features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Convert Y_train and Y_test values to integers

In [201]:
# The label Series carry dtype object after encoding; cast both splits to
# integers before they are handed to the model and the metric.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [202]:
# Print the training feature matrix; the output lists entries as
# "(row, column)  value".
print(X_train_features)

  (0, 3928)	0.7453404696134551
  (0, 3114)	0.5073237155975544
  (0, 2600)	0.43253928370575323
  (1, 2803)	0.16484914990688038
  (1, 2071)	0.2534627479935268
  (1, 2182)	0.24719223864006135
  (1, 1685)	0.2780012649696141
  (1, 167)	0.20816138236838677
  (1, 929)	0.1541510527691971
  (1, 2861)	0.24530975527811102
  (1, 3380)	0.17075875314921124
  (1, 1696)	0.29364542876489014
  (1, 1862)	0.6013599758872271
  (1, 1043)	0.30067998794361356
  (1, 1943)	0.22560043980505834
  (1, 2600)	0.156837747278195
  (2, 3257)	0.4580408031569388
  (2, 2043)	0.40276230538791946
  (2, 1376)	0.4720046829277976
  (2, 4893)	0.6365475059225227
  (3, 1790)	0.5844375752109064
  (3, 5607)	0.8114386733953441
  (4, 693)	0.14278681981742308
  (4, 3985)	0.2030382887378411
  (4, 704)	0.3169987527250405
  :	:
  (4455, 1438)	0.3343759341093223
  (4455, 4096)	0.3204701470174546
  (4455, 4596)	0.22188151274446055
  (4455, 5068)	0.28403482065051106
  (4455, 4478)	0.2207167011215539
  (4455, 145)	0.24912874691887366
  (4455

#Training the model

In [203]:
# Logistic regression classifier created without arguments, i.e. with the
# library's default settings.
model = LogisticRegression()

Training the Logistic Regression model with training values

In [204]:
# Fit the classifier on the TF-IDF features and integer labels of the training split.
model.fit(X_train_features,Y_train)

#Model Evaluation

prediction on training data

In [205]:
# Predict labels for the messages the model was trained on and compare them
# with the true training labels.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [206]:
# Report the fraction of training messages that were classified correctly.
print("Accuracy on training data : ",accuracy_on_training_data)

Accuracy on training data :  0.97083239847431


prediction on test data

In [207]:
# Predict labels for the held-out messages and compare them with the true
# test labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [208]:
# Report the fraction of held-out test messages that were classified correctly.
print("Accuracy on test data : ",accuracy_on_test_data)

Accuracy on test data :  0.967713004484305


#Building a Predictive System

In [215]:
input_mail = ["Free Free Free"]

# Apply the same cleaning and stemming that the training messages went through.
# The vectorizer's vocabulary was learned from stemmed text, so raw words would
# otherwise fail to match their stemmed vocabulary entries.
input_mail_stemmed = [stemming(mail) for mail in input_mail]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail_stemmed)

# making prediction
prediction = model.predict(input_data_features)
print(prediction)

# Labels were encoded as 1 = ham, 0 = spam.
if prediction[0] == 1:
  print('Ham mail')
else:
  print('Spam mail')

[0]
Spam mail
