In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
raw_mail_data=pd.read_csv("/content/email_data.csv")
print(raw_mail_data)

      category                                            message
0     not spam  report for review. Figure meeting society adul...
1     not spam  shipping confirmation. According this among br...
2     not spam    meeting reminder. None surface mind say Mr own.
3     not spam  your recent order. Case performance if mind be...
4     not spam  vacation notice. Use far about bed. Team make ...
...        ...                                                ...
5495      spam  Vote only standard soon career usually off. do...
5496  not spam  event invitation. True especially popular succ...
5497      spam  Worker give thank. guaranteed income claim you...
5498      spam  Station shoulder check deal. win money debt re...
5499  not spam  meeting reminder. Her help risk attack girl. S...

[5500 rows x 2 columns]


In [8]:
mail_data=raw_mail_data.where((pd.notnull(raw_mail_data)),"")
mail_data.head(5)


Unnamed: 0,category,message
0,not spam,report for review. Figure meeting society adul...
1,not spam,shipping confirmation. According this among br...
2,not spam,meeting reminder. None surface mind say Mr own.
3,not spam,your recent order. Case performance if mind be...
4,not spam,vacation notice. Use far about bed. Team make ...


In [9]:
mail_data.shape

(5500, 2)

In [None]:
#Label Encding spam as 0 and not spam as 1
mail_data.loc[mail_data['category']=='spam','category',]=0
mail_data.loc[mail_data['category']=="not spam",'category']=1

In [None]:
# separating text and label
x=mail_data['message']
y=mail_data['category']
x

Unnamed: 0,message
0,report for review. Figure meeting society adul...
1,shipping confirmation. According this among br...
2,meeting reminder. None surface mind say Mr own.
3,your recent order. Case performance if mind be...
4,vacation notice. Use far about bed. Team make ...
...,...
5495,Vote only standard soon career usually off. do...
5496,event invitation. True especially popular succ...
5497,Worker give thank. guaranteed income claim you...
5498,Station shoulder check deal. win money debt re...


In [None]:
#splitting data into training data and test data
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.2,random_state=3)
print(x.shape)
print(x_test.shape)

(5500,)
(4400,)


In [None]:
#feature extraction 
#transform the text data to feature vectors for logistic regression 
feature_extraction=TfidfVectorizer(min_df=1, stop_words='english',lowercase=True)

In [20]:
x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)


In [None]:
#convert y_train and Y_test to integers
y_train=y_train.astype("int")
y_test=y_test.astype("int")

In [22]:
print(x_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13232 stored elements and shape (1100, 1231)>
  Coords	Values
  (0, 992)	0.36469573759875085
  (0, 208)	0.36469573759875085
  (0, 339)	0.43085508061404665
  (0, 61)	0.41707700505252276
  (0, 207)	0.41707700505252276
  (0, 78)	0.4477180822532249
  (1, 78)	0.32893946878738883
  (1, 1106)	0.20109628366237822
  (1, 586)	0.31125841134242843
  (1, 695)	0.32893946878738883
  (1, 1075)	0.322399967390537
  (1, 606)	0.2161511165724572
  (1, 698)	0.2161511165724572
  (1, 872)	0.21177707698437384
  (1, 1141)	0.20467956496242193
  (1, 220)	0.1997253105538421
  (1, 488)	0.13193889788056992
  (1, 1220)	0.13016991883936735
  (1, 87)	0.38339666828474017
  (1, 1186)	0.35503481751778415
  (1, 197)	0.12191606406727937
  (2, 850)	0.19996849322360427
  (2, 1140)	0.1776837406154488
  (2, 351)	0.2546218309845667
  (2, 266)	0.2546218309845667
  :	:
  (1098, 812)	0.3101328985556748
  (1098, 963)	0.2894045484261612
  (1098, 1212)	0.31540553617673855
 

In [None]:
#training the model
#logical regression 
model= LogisticRegression()
model.fit(x_train_features,y_train)

In [None]:
#evaluating the trained model
#prediction on trained model
prediction_data=model.predict(x_train_features)
accuracy_data=accuracy_score(y_train,prediction_data)
print("Accuracy of the training data: ",accuracy_data)

Accuracy of the training data:  1.0


In [None]:
#prediction on test model
prediction_test=model.predict(x_test_features)
accuracy_test=accuracy_score(y_test,prediction_test)
print("Accuracy of the training data: ",accuracy_test)


Accuracy of the training data:  1.0


In [None]:
#building a predictive system
input_mail=["Even my brother is not like to speak with me. They treat me like aids patent."]
#convert text to feature vectors
input_data_features=feature_extraction.transform(input_mail)

#making predictions
prediction=model.predict(input_data_features)
print(prediction)


[1]
