In [1]:
mkdir project1

In [2]:
pwd

'/content'

In [3]:
cd project1

/content/project1


In [4]:
import numpy as np      #for creating numpy arrays
import pandas as pd   #for dataframe creation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer   #to transform text into the numerical data
from sklearn.linear_model import LogisticRegression     #for classification
from sklearn.metrics import accuracy_score    #for evaluations

# **Data Collection And Data Preprocessing**

In [5]:
df = pd.read_csv('/content/project1/mail_data.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [7]:
#replacing the null values with a null string
maildata = df.where((pd.notnull(df)), '')
maildata

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
maildata.shape

(5572, 2)

# **Label Encoding**
Labelling a spam mail as 0 and ham mail as 1.

In [9]:
maildata.loc[maildata['Category'] == 'spam', 'Category'] = 0
maildata.loc[maildata['Category'] == 'ham', 'Category'] = 1

In [10]:
#Separating the data into text and labels so that we can feed it to our machine learning model to train it
X = maildata['Message']
Y = maildata['Category']

In [11]:
X   #input for our data(Features)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [12]:
Y     #Label for our data(output data)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

# **Performing Train-Test Split**

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=3)

In [14]:
print(X.shape)
print(Y.shape)
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(5572,)
(5572,)
(4457,) (1115,)
(4457,) (1115,)


# **Performing Feature Extraction**
Transforming or Converting our Text data into numerical data so that we can feed it to the Logistic Classifier model.

In [16]:
#TfidfVectorizer
#1. Calculating how many times each word is repeated
#2. Based on the calculated value or score it will predict the output
#3. It will ignore all the stop words present in the english
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english')

#converting our traindata or text data into features vectors/arrays and fitting it into the TfidfVectorizer
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [17]:
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

In [18]:
print(X_train_features)   #same X_train data which is converted into numerical values as machine understands numerical values well as compared with text data

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# **Training & Evaluating the Logistic Regression Model**

In [19]:
model = LogisticRegression()
model.fit(X_train_features, Y_train)

In [20]:
#Evaluating our model

ypred = model.predict(X_train_features)
accuracyScore = accuracy_score(Y_train, ypred)

In [21]:
print('Accuracy of our model is: ', accuracyScore)

Accuracy of our model is:  0.9670181736594121


In [22]:
#Evaluating our model

ypredonTestdata = model.predict(X_test_features)
accuracyScoretestdata = accuracy_score(Y_test, ypredonTestdata)

In [23]:
print('Accuracy of our model on Test Data is: ', accuracyScoretestdata)

Accuracy of our model on Test Data is:  0.9659192825112107


In [26]:
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
#converting the text to feature vectors
inputdatafeature = feature_extraction.transform(input_mail)

predict = model.predict(inputdatafeature)
print(predict)

if predict == 1:
  print('Ham Mail')

else:
  print('Spam Mail')

[0]
Spam Mail
