## Importing libraries

In [1]:
import numpy as np
import pandas as pd

## Importing data 

In [2]:
email = pd.read_csv('spam_email_detection.csv')
print(email)

        v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will �_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


## Basic checks on data

In [3]:
email.shape

(5572, 2)

In [4]:
email.isnull().any()

v1    False
v2    False
dtype: bool

In [5]:
email.isnull().sum()

v1    0
v2    0
dtype: int64

In [6]:
email.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Label Encoding 

## Converting ham = 0 and spam = 1

In [7]:
email.loc[email['v1'] == 'ham', 'v1'] = 0
email.loc[email['v1'] == 'spam', 'v1'] = 1

In [8]:
email.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Seperating Dependent variable and independent variables

In [9]:
X = email['v2']
y = email['v1']

In [10]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will �_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object


In [11]:
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: object


## Importing sklearn library functions

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Splitting training and testing sets

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


## Converting Text data to Numerical value

In [14]:
f_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

X_train_feature = f_extraction.fit_transform(X_train)
X_test_feature = f_extraction.transform(X_test)

In [15]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [16]:
print(X_train)

1978    No I'm in the same boat. Still here at my moms...
3989    (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935       They r giving a second chance to rahul dengra.
4078       O i played smash bros  &lt;#&gt;  religiously.
4086    PRIVATE! Your 2003 Account Statement for 07973...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: v2, Length: 4457, dtype: object


In [17]:
print(X_train_feature)

  (0, 4502)	0.4658046386365619
  (0, 3199)	0.348722265231364
  (0, 7389)	0.348722265231364
  (0, 1702)	0.3431839629173582
  (0, 4397)	0.4528381701109944
  (0, 1371)	0.4658046386365619
  (1, 0)	0.2654936554684193
  (1, 1645)	0.3059746053542906
  (1, 6419)	0.2953742837684993
  (1, 4515)	0.3059746053542906
  (1, 419)	0.28715203556385105
  (1, 4273)	0.2953742837684993
  (1, 4986)	0.1937920260229529
  (1, 2652)	0.3059746053542906
  (1, 1533)	0.2015782058421696
  (1, 6275)	0.269833648032668
  (1, 3617)	0.2804339696184593
  (1, 3129)	0.3059746053542906
  (1, 1187)	0.26161139982801973
  (2, 2184)	0.5102109014477275
  (2, 5331)	0.5102109014477275
  (2, 1670)	0.35156722029872034
  (2, 5750)	0.3962151014046925
  (2, 3050)	0.44585171875646595
  (3, 5464)	0.4829129976175997
  :	:
  (4451, 5719)	0.3358090891373877
  (4451, 4668)	0.3478605253385091
  (4452, 3389)	0.4536077050510107
  (4452, 3410)	0.4833413012939851
  (4452, 1576)	0.3576443319642905
  (4452, 1777)	0.3311324953642251
  (4452, 5978)	0.3

## Importing Logistic Regression Algorithm and fitting it into Training dataset

In [18]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

In [19]:
log_reg.fit(X_train_feature, y_train)

## Model Predicting

In [20]:
y_pred = log_reg.predict(X_test_feature)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

## Importing metrics to evaluate the model

In [21]:
from sklearn.metrics import accuracy_score, precision_score

In [22]:
print('accuracy_score : ', accuracy_score(y_test,y_pred)*100)
print('precision_score : ', precision_score(y_test,y_pred)*100)

accuracy_score :  95.24663677130046
precision_score :  97.0873786407767


## Input mail to evaluate model

In [25]:
input_email = ['SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info']

input_data_feature = f_extraction.transform(input_email)

pred = log_reg.predict(input_data_feature)

print(pred)

[1]
