In [1]:
#mport libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# loading the dataset to a pandas DataFrame
df = pd.read_csv('mail_data.csv')

In [5]:
# print the first 5 rows of the dataframe
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#streaming Function
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stre(dataset,col):
    corpus = []
    for i in range(0, len(dataset)):
        review = re.sub('[^a-zA-Z0-9]',' ', str(dataset[col][i]))
        review = review.lower()
        review = review.split()

        review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [21]:
#streaming apply on data
y=stre(df,"Message")
y=pd.DataFrame(y,columns=['mail'])

In [36]:
y

Unnamed: 0,mail
0,go jurong point crazi avail bugi n great world...
1,ok lar joke wif u oni
2,free entri 2 wkli comp win fa cup final tkt 21...
3,u dun say earli hor u c alreadi say
4,nah think goe usf live around though
...,...
5567,2nd time tri 2 contact u u 750 pound prize 2 c...
5568,b go esplanad fr home
5569,piti mood suggest
5570,guy bitch act like interest buy someth els nex...


In [37]:
# separating the data & label
X = y
Y = df['Category']

In [38]:
#data cleaning function
import string
def remove_sp(news):
    news=re.sub('\[.*?\]',' ',news)
    news = re.sub('@[^\s]+','',news)
    news = re.sub('http[^\s]+','',news)
    news=re.sub('\n',' ',news)
    news=re.sub('\w*\d\w*',' ',news)
    news=re.sub('[%s]'%re.escape(string.punctuation)," ",news)
    return news

In [43]:
#clean data
X = X.fillna('')
X['mail'] = X['mail'].apply(lambda x: remove_sp(x) if pd.notnull(x) else x)


In [46]:
X

Unnamed: 0,mail
0,go jurong point crazi avail bugi n great world...
1,ok lar joke wif u oni
2,free entri wkli comp win fa cup final tkt ...
3,u dun say earli hor u c alreadi say
4,nah think goe usf live around though
...,...
5567,time tri contact u u pound prize claim...
5568,b go esplanad fr home
5569,piti mood suggest
5570,guy bitch act like interest buy someth els nex...


In [48]:
#Apply and save weights of TfidfVectorizer
import joblib
tf=TfidfVectorizer()
tf_x=tf.fit_transform(X['mail']).toarray()
joblib.dump(tf, 'tfidf_vectorizer.pkl')
tf_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [49]:

#convert to pandas data frame
tf_x=pd.DataFrame(tf_x)
tf_x


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6124,6125,6126,6127,6128,6129,6130,6131,6132,6133
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Label Encoding

In [84]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

In [94]:
Y

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [86]:
#apply train test split
x_train,x_test,y_train,y_test=train_test_split(tf_x,Y,test_size=0.2)

In [88]:
#call model
ld=LogisticRegression(random_state=3)

In [90]:
#fit model on data
model=ld.fit(x_train,y_train)

In [91]:
joblib.dump(model,'nlppr.joblib')


['nlppr.joblib']

In [92]:
#Accuracy on validation data
pre=model.predict(x_test)
accuracy_score(y_test, pre)

0.9641255605381166

In [93]:
#cros validation score
from sklearn.model_selection import cross_val_score

#Your dataset is randomly divided into 5 roughly equal parts or "folds."

scores = cross_val_score(model,x_train,y_train, cv=5)
scores.mean()

0.9560251241865592

In [95]:
#python main.py -value "ong point, crazy.. Available only in bugis n great world la e buffe"