### Email spam filtering

### Load Data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('sms.tsv',delimiter='\t',names=['label','Messages'])

In [3]:
data.head()

Unnamed: 0,label,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.shape

(5572, 2)

In [5]:
data['label']=data['label'].map({'ham':0,'spam':1})

In [6]:
data.head()

Unnamed: 0,label,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

### clean and Prepare data

In [8]:
import re

def clean(x):
    
    #remove all html tags from data
    #remove all numbers from data
    #remove all special chars from data
    #etc..
    s = re.sub('[^A-Za-z]',' ',x)  #to replace everything except A-Z or a-z
    s = re.sub('\s+',' ',s)
    s = s.strip()
    
    return s.lower()

In [9]:
data['Messages'] = data.Messages.apply(clean)

In [10]:
X = data.Messages.values
y = data.label.values

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.25,random_state=12)

In [13]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [14]:
### Remove 'not' from stopword list

In [15]:
if 'not' in stopwords:
    stopwords.remove('not')

In [16]:
## confirm 

if 'not' in stopwords:
    print('found')
else:
    print('not found')

not found


### Transform text data into Numeric

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv1 = CountVectorizer(stop_words=stopwords)  

In [19]:
cv_train = cv1.fit_transform(xtrain).toarray()

cv_test = cv1.transform(xtest).toarray()

In [20]:
#cv1.get_feature_names()

In [21]:
cv_train.shape

(4179, 6489)

In [22]:
cv_test.shape

(1393, 6489)

In [23]:
#cv1.get_feature_names()

### Train model

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix,recall_score,precision_score

import numpy as np

### Naive bayes

In [25]:
nb = MultinomialNB()

nb.fit(cv_train,ytrain)

test_score = nb.score(cv_test,ytest)
test_score

0.9741564967695621

In [26]:
pred = nb.predict(cv_test)

In [27]:
np.bincount(ytest)

array([1197,  196], dtype=int64)

In [28]:
confusion_matrix(ytest,pred)

array([[1182,   15],
       [  21,  175]], dtype=int64)

In [29]:
recall_score(ytest,pred)

0.8928571428571429

In [30]:
precision_score(ytest,pred)

0.9210526315789473

### Logistic Regression

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
#nb = MultinomialNB()
log = LogisticRegression(C=.01,class_weight={1:3})

log.fit(cv_train,ytrain)

test_score = log.score(cv_test,ytest)
test_score

0.9540559942569993

In [33]:
log_pred = log.predict(cv_test)

In [34]:
confusion_matrix(ytest,log_pred)

array([[1182,   15],
       [  49,  147]], dtype=int64)

In [35]:
recall_score(ytest,log_pred)

0.75

In [36]:
precision_score(ytest,log_pred)

0.9074074074074074

### Evaluate model on test data

In [37]:
test = ['Get free tickets..!Win cash','hi john I will call you later']

In [38]:
cleaned_data=[]

for i in test:
    t = clean(i)
    cleaned_data.append(t)

In [39]:
cleaned_data

['get free tickets win cash', 'hi john i will call you later']

In [40]:
t1 = cv1.transform(cleaned_data)

In [41]:
t1.shape

(2, 6489)

In [42]:
nb.predict(t1)

array([1, 0], dtype=int64)

In [43]:
log.predict(t1)

array([0, 0], dtype=int64)

**Note:**

- Model has classified both samples correctly

In [44]:
############################## Trying Naive Bayes method to classify on moview review-sentiment datasets #####################

In [45]:
import pandas as pd

In [46]:
df = pd.read_csv('IMDB Dataset.csv')

In [47]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [48]:
df['sentiment']=df['sentiment'].map({'negative':0,'positive':1})

In [49]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [50]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [51]:
df.shape

(50000, 2)

In [52]:
df = df.sample(frac=1).reset_index(drop=True)

In [53]:
df.head(10)

Unnamed: 0,review,sentiment
0,Think of the ending of the Grudge 2 with the f...,0
1,Arg. The shuffling dinosaurs are back to take ...,0
2,This movie provided NOTHING new or worthwhile....,0
3,"From very long, we are seeing movies on Gandhi...",1
4,I loved this show. I think the first time I tr...,1
5,I have to say that I used to be a huge fan of ...,0
6,Final Score: 1.8 (out of 10)<br /><br />After ...,0
7,I walked out of this movie and I did this only...,0
8,This a good episode of The New Twilight Zone t...,1
9,Welcome to Collinwood is one of the most delig...,1


In [54]:
## Data Clean

In [55]:
import re

def clean(x):
    
    #remove all html tags from data
    #remove all numbers from data
    #remove all special chars from data
    #etc..
    s = re.sub('[^A-Za-z]',' ',x)  #to replace everything except A-Z or a-z
    s = re.sub('\s+',' ',s)
    s = s.strip()
    
    return s.lower()



In [56]:
df['review'] = df.review.apply(clean)

In [57]:
# data splitting

In [58]:
train_input,train_output = df.iloc[:10000,0],df.iloc[:10000,1]
test_input,test_output = df.iloc[10000:15000,0],df.iloc[10000:15000,1]

In [59]:
train_input.shape

(10000,)

In [60]:
test_input.shape

(5000,)

In [61]:
## Transform data into numeric form

In [62]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [63]:
### Remove 'not' from stopword list

In [64]:
if 'not' in stopwords:
    stopwords.remove('not')

In [65]:
## confirm 

if 'not' in stopwords:
    print('found')
else:
    print('not found')

not found


In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
cv1 = CountVectorizer(stop_words=stopwords)  

In [68]:
cv_train = cv1.fit_transform(train_input).toarray()

cv_test = cv1.transform(test_input).toarray()

In [69]:
#cv1.get_feature_names()

In [70]:
cv_train.shape

(10000, 51025)

In [71]:
cv_test.shape

(5000, 51025)

In [72]:
#cv1.get_feature_names()

In [73]:
ytrain.shape

(4179,)

In [74]:
## Train model

In [75]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix,recall_score,precision_score

import numpy as np

In [76]:
nb = MultinomialNB()

nb.fit(cv_train,train_output)

MultinomialNB()

In [77]:
test_score = nb.score(cv_test,test_output)
test_score

0.8474

In [78]:
pred = nb.predict(cv_test)

In [79]:
np.bincount(test_output)

array([2467, 2533], dtype=int64)

In [80]:
test_output.value_counts()

1    2533
0    2467
Name: sentiment, dtype: int64

In [81]:
confusion_matrix(test_output,pred)

array([[2149,  318],
       [ 445, 2088]], dtype=int64)

In [82]:
recall_score(test_output,pred)

0.8243189893407027

In [83]:
precision_score(test_output,pred)

0.8678304239401496

In [84]:
#Evaluate model on sample dataset

In [85]:
test = ['The Movie@# wasss AmaZIng','It had %$Boring ScripT $%#']

In [86]:
cleaned_data=[]

for i in test:
    t = clean(i)
    cleaned_data.append(t)

In [87]:
cleaned_data

['the movie wasss amazing', 'it had boring script']

In [88]:
t1 = cv1.transform(cleaned_data)

In [89]:
t1.shape

(2, 51025)

In [90]:
nb.predict(t1)

array([1, 0], dtype=int64)