In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [97]:
# Load the raw message dataset from a CSV in the working directory.
# The printed frame below shows two text columns: Category and Message.
df = pd.read_csv('maill_data.csv')

In [98]:
# Show the loaded frame; pandas elides the middle rows in the printed view.
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571     spam     WINNER!! you have won the lottery of $1200000.

[5572 rows x 2 columns]


In [99]:
# Build the working copy: keep every non-null cell as is and substitute a
# single space for any missing value.
data = df.where(df.notna(), other=' ')

In [100]:
# Preview the first ten rows of the working frame.
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [101]:
# Print the column dtypes and non-null counts of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [102]:
# Display the (rows, columns) size of the working frame.
data.shape

(5572, 2)

In [103]:
# Encode the labels in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int later on.
data.loc[data['Category'].eq('spam'), 'Category'] = 0
data.loc[data['Category'].eq('ham'), 'Category'] = 1

In [104]:
# Show the frame after label encoding: Category now holds 0 (spam) or 1 (ham).
print(data)

     Category                                            Message
0           1  Go until jurong point, crazy.. Available only ...
1           1                      Ok lar... Joking wif u oni...
2           0  Free entry in 2 a wkly comp to win FA Cup fina...
3           1  U dun say so early hor... U c already then say...
4           1  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567        0  This is the 2nd time we have tried 2 contact u...
5568        1               Will ü b going to esplanade fr home?
5569        1  Pity, * was in mood for that. So...any other s...
5570        1  The guy did some bitching but I acted like i'd...
5571        0     WINNER!! you have won the lottery of $1200000.

[5572 rows x 2 columns]


In [105]:
# Separate the message text (model input) from the encoded label (target).
X, Y = data['Message'], data['Category']

In [106]:
# Show the message texts that will be used as model input.
print(X)


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571       WINNER!! you have won the lottery of $1200000.
Name: Message, Length: 5572, dtype: object


In [107]:
# Show the encoded labels (0 = spam, 1 = ham); the dtype is still object here.
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    0
Name: Category, Length: 5572, dtype: object


In [108]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [109]:
# Report the sizes of the full, training and test message sets, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [110]:
# Report the sizes of the full, training and test label sets, one per line.
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [111]:
# Convert the message text into TF-IDF feature vectors, the numeric input for
# the logistic regression classifier trained below.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# fit_transform learns the vocabulary and IDF weights from the training
# messages and transforms them in one step. The test messages go through
# transform only, so they are encoded with what was learned from training.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels were encoded as 0/1 but are still stored with object dtype;
# cast them to integers before fitting and scoring.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [112]:
# Inspect the sparse TF-IDF training matrix: each stored entry is printed as
# (row, column) coordinates followed by its weight.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34777 stored elements and shape (4457, 7433)>
  Coords	Values
  (0, 2330)	0.38783870336935383
  (0, 3812)	0.34780165336891333
  (0, 2225)	0.413103377943378
  (0, 4458)	0.4168658090846482
  (0, 5415)	0.6198254967574347
  (1, 3812)	0.17419952275504033
  (1, 3047)	0.2503712792613518
  (1, 1992)	0.33036995955537024
  (1, 2957)	0.33036995955537024
  (1, 2759)	0.3226407885943799
  (1, 1840)	0.2784903590561455
  (1, 919)	0.22871581159877646
  (1, 2747)	0.3398297002864083
  (1, 2958)	0.3398297002864083
  (1, 3326)	0.31610586766078863
  (1, 3186)	0.29694482957694585
  (1, 4082)	0.18880584110891163
  (2, 6603)	0.6056811524587518
  (2, 2405)	0.45287711070606745
  (2, 3157)	0.4107239318312698
  (2, 408)	0.509272536051008
  (3, 7416)	0.8100020912469564
  (3, 2871)	0.5864269879324768
  (4, 2871)	0.41872147309323743
  (4, 488)	0.2899118421746198
  :	:
  (4454, 2856)	0.47210665083641806
  (4454, 2247)	0.47210665083641806
  (4455, 4458)	0.24

In [113]:
# Create the logistic regression classifier; no arguments are passed, so the
# library defaults apply.
Model= LogisticRegression()


In [114]:
# Train the classifier on the TF-IDF training features and integer labels.
Model.fit(X_train_features,Y_train)

In [115]:
# Score the fitted model on the same messages it was trained on.
prediction_on_training_data = Model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [116]:
# Report the fraction of training messages classified correctly.
print('Accuracy of training data:',  accuracy_on_training_data)

Accuracy of training data: 0.9679156383217411


In [117]:
# Score the fitted model on the held-out test messages.
prediction_on_testing_data = Model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_testing_data,
)

In [118]:
# Report the fraction of held-out test messages classified correctly.
print('Accuracy of testing data:', accuracy_on_testing_data)

Accuracy of testing data: 0.9668161434977578


In [123]:
# Classify one hand-written message. It must be wrapped in a list because the
# vectorizer expects an iterable of documents.
input_mail = [
    ' Important information for 02 user. Today is your lucky day! 2 find out why , log onto http://www.urawinner.com there is a fantastic surprise awaiting you !',
]

# Encode the message with the vocabulary fitted on the training data, then
# ask the trained model for its label.
input_mail_features = feature_extraction.transform(input_mail)
prediction = Model.predict(input_mail_features)

In [124]:
# Show the raw predicted label array, then a readable verdict.
# Label 1 means ham (not spam); any other label is reported as spam.
print(prediction)
print('IT IS NOT A SPAM' if prediction[0] == 1 else 'IT IS A SPAM')

[0]
IT IS A SPAM
