<a href="https://colab.research.google.com/github/Rolexx11/project_01/blob/main/Spam_E_Mail_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a id="Import"></a>
<p style="background-color: #008080; font-family: 'Arial', sans-serif; color: #FFFFFF; font-size:160%; text-align:center; padding: 15px 25px; margin-top: 25px; border: 4px solid #20B2AA; border-left: 15px solid #FF6347; border-right: none; border-top: none; border-bottom: none; box-shadow: 0px 5px 10px rgba(0, 0, 0, 0.2);">Import Libs & Load dataset</p>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [None]:
# Read the mail dataset (CSV) into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Display the loaded DataFrame (long frames are truncated in the rendered output)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# checking the number of rows and columns in the dataframe;
# .shape is a (rows, columns) tuple
df.shape

(5572, 2)

Label Encoding

In [None]:
# Encode the labels in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int later, after the split.
is_spam = df['Category'] == 'spam'
is_ham = df['Category'] == 'ham'
df.loc[is_spam, 'Category'] = 0
df.loc[is_ham, 'Category'] = 1
df

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


spam  -  0

ham  -  1

In [None]:
# Split the frame into the message texts (features, X) and their labels (targets, Y)
X, Y = df['Message'], df['Category']

In [None]:
# Inspect the message texts that serve as model input
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
# Inspect the encoded labels (spam = 0, ham = 1)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Splitting the data into training data & test data

<a id="TrainTestSplit"></a>
<p style="background-color: #008080; font-family: 'Arial', sans-serif; color: #FFFFFF; font-size:160%; text-align:center; padding: 15px 25px; margin-top: 25px; border: 4px solid #20B2AA; border-left: 15px solid #FF6347; border-right: none; border-top: none; border-bottom: none; box-shadow: 0px 5px 10px rgba(0, 0, 0, 0.2);">Train Test Split</p>

In [None]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the size of the full message set, then of the train and test splits
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


## Feature Extraction

In [None]:
# Convert the raw text into TF-IDF feature vectors that can be used as input
# to the Logistic Regression model. The vectorizer is fitted on the training
# messages only; the test messages are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(binary=True)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# The label column has object dtype, so cast both label splits to integers
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Inspect the sparse TF-IDF matrix built from the training messages
print(X_train_vec)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 59275 stored elements and shape (4457, 7701)>
  Coords	Values
  (0, 5687)	0.18065415760880438
  (0, 6888)	0.08450763080336647
  (0, 7474)	0.2068160316031145
  (0, 258)	0.22637078421372198
  (0, 7396)	0.24262542722266756
  (0, 7437)	0.18812242367597887
  (0, 7471)	0.14464036825485033
  (0, 6773)	0.10426535652900155
  (0, 354)	0.33718090071346507
  (0, 2805)	0.33718090071346507
  (0, 7555)	0.2310692701909831
  (0, 2106)	0.2886801782762419
  (0, 1271)	0.14345310305440925
  (0, 3364)	0.32148331772220545
  (0, 5980)	0.16704978726648753
  (0, 6460)	0.18065415760880438
  (0, 694)	0.3017067020214299
  (0, 2568)	0.23233872828920074
  (0, 5999)	0.21579023185218274
  (1, 7471)	0.15345712397145647
  (1, 3369)	0.23788700384827438
  (1, 6304)	0.28771182434641
  (1, 4854)	0.13675532425050171
  (1, 4976)	0.1691979803531827
  (1, 3603)	0.12054493078419984
  :	:
  (4452, 7535)	0.32456891254667847
  (4452, 2502)	0.37405731881436316
  (4452, 75

In [None]:
# Inspect the raw training messages selected by the split
print(X_train)

1978    Reply to win £100 weekly! Where will the 2006 ...
3989    Hello. Sort of out in town already. That . So ...
3935     How come guoyang go n tell her? Then u told her?
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 4457, dtype: object


In [None]:
# Inspect the full, unsplit message Series for comparison with the training subset
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


## Logistic Regression

In [None]:
# Create a Logistic Regression classifier; no arguments are passed, so
# scikit-learn's default hyperparameters are used
model = LogisticRegression()

In [None]:
# training the Logistic Regression model on the TF-IDF vectors of the training
# messages and their integer labels
model.fit(X_train_vec, Y_train)

<a id="Accuracy"></a>
<p style="background-color: #008080; font-family: 'Arial', sans-serif; color: #FFFFFF; font-size:160%; text-align:center; padding: 15px 25px; margin-top: 25px; border: 4px solid #20B2AA; border-left: 15px solid #FF6347; border-right: none; border-top: none; border-bottom: none; box-shadow: 0px 5px 10px rgba(0, 0, 0, 0.2);">Accuracy Score on the Training & Testing Data
</p>

In [None]:
# Predict on the same data the model was fitted on, then measure the fraction
# of predictions that match the true training labels
prediction_on_training_data = model.predict(X_train_vec)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the accuracy measured on the training split
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9744222571236257


In [None]:
# Predict on the held-out test messages, then measure the fraction of
# predictions that match the true test labels
prediction_on_test_data = model.predict(X_test_vec)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the accuracy measured on the held-out test split
print('Accuracy on test data : ', accuracy_on_testing_data)

Accuracy on test data :  0.9730941704035875


## Building a Predictive System

In [None]:
# Sample message to classify, wrapped in a list because the vectorizer expects
# an iterable of documents. It begins with the same text as dataset row 2,
# which is labelled spam.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [None]:
# Convert the raw text with the vectorizer that was fitted on the training
# data, then classify it with the trained model.
input_mail_vec = vectorizer.transform(input_mail)
prediction = model.predict(input_mail_vec)

print(prediction)

# The labels were encoded as spam -> 0 and ham -> 1, so a prediction of 0
# means spam. `prediction` is an array with one entry per input message;
# only the first (and only) entry is checked here.
if prediction[0] == 0:
    print('Spam Mail')
else:
    print('Ham Mail')

[0]
Ham Mail
