In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [11]:
# Load the labelled messages from CSV.
# The location defaults to the original hard-coded path, but it can be
# redirected by setting the MAIL_DATA_PATH environment variable so the
# notebook is not tied to one machine's D: drive.
data_path = os.environ.get('MAIL_DATA_PATH', 'D:/Music/Music/mail_data.csv')
data = pd.read_csv(data_path)
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [12]:
# Count missing entries per column (isna is the canonical spelling of isnull)
data.isna().sum()

Category    0
Message     0
dtype: int64

In [13]:
# Encode the Category labels as integers: spam -> 0, ham -> 1.
label_map = {'spam': 0, 'ham': 1}

# Re-run guard: once the column already holds 0/1, mapping it again would turn
# every label into NaN, because the integers are not keys of label_map.
if not data['Category'].isin(list(label_map.values())).all():
    data['Category'] = data['Category'].map(label_map)

# .map() converts any label missing from label_map to NaN; fail here with a
# clear message instead of letting NaN labels reach model training.
if data['Category'].isnull().any():
    raise ValueError(
        "Category contains values other than 'spam'/'ham'; "
        "reload the data before encoding the labels."
    )
data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [14]:
# Separate the message text (model input) from the Category labels (target)
X, Y = data['Message'], data['Category']
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [15]:
# Display the label Series taken from data['Category']
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: int64

In [16]:
# Keep 80% of the rows for training and hold out the rest for testing;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=1,
)

In [18]:
# Sanity-check the split sizes: full set, training part, test part
print(*(part.shape for part in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


In [29]:
# Display the training labels produced by train_test_split
Y_train

1642    1
2899    1
480     1
3485    1
157     1
       ..
905     1
5192    1
3980    1
235     0
5157    1
Name: Category, Length: 4457, dtype: int64

In [24]:
# Convert the message text to numerical TF-IDF features
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectoriser on the training messages only, then apply that same
# fitted vectoriser to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [25]:
# Display the raw training messages (the text that was passed to fit_transform)
X_train

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
480     When're you guys getting back? G said you were...
3485    Tell my  bad character which u Dnt lik in me. ...
157                           I'm leaving my house now...
                              ...                        
905     We're all getting worried over here, derek and...
5192    Oh oh... Den muz change plan liao... Go back h...
3980    CERI U REBEL! SWEET DREAMZ ME LITTLE BUDDY!! C...
235     Text & meet someone sexy today. U can find a d...
5157                              K k:) sms chat with me.
Name: Message, Length: 4457, dtype: object

In [27]:
# Print the TF-IDF training matrix returned by fit_transform; judging by the
# output, each line is a (row, column) index pair followed by the stored weight.
print(X_train_features)

  (0, 4346)	0.3238008504874723
  (0, 2353)	0.2545072203817634
  (0, 6695)	0.3616065178053154
  (0, 5895)	0.415102954745324
  (0, 2719)	0.299459437576315
  (0, 3822)	0.37729728742748087
  (0, 3789)	0.4750235197588447
  (0, 3321)	0.2638802854739516
  (1, 4343)	0.6555659308129219
  (1, 1858)	0.5163195438969705
  (1, 3365)	0.5510421389942982
  (2, 4267)	0.531599749449541
  (2, 6215)	0.43979370278404856
  (2, 6597)	0.40097414833733686
  (2, 5672)	0.32606636481997364
  (2, 3025)	0.3502912545366897
  (2, 3185)	0.3663054742561573
  (3, 4447)	0.18080236341909536
  (3, 3084)	0.14346439189216004
  (3, 2877)	0.30030357190007717
  (3, 5515)	0.16439483489485024
  (3, 6922)	0.13381964389308706
  (3, 7080)	0.19700844583868773
  (3, 5535)	0.30030357190007717
  (3, 7398)	0.19877707762085306
  :	:
  (4454, 397)	0.31848634658760416
  (4454, 4027)	0.2561192223695296
  (4454, 6409)	0.2511086901671169
  (4454, 7382)	0.23350338191116915
  (4454, 4578)	0.28626353932821713
  (4455, 6600)	0.3164025961524856
  (4

In [30]:
# Train a logistic-regression classifier (default settings) on the TF-IDF
# training features; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train_features, Y_train)
model

LogisticRegression()

In [33]:
# Score the fitted model on the held-out test features
y_pred = model.predict(X_test_features)
score = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Accuracy on test data: {:.2f}%".format(100 * score))

Accuracy on test data: 97.04%


In [35]:
# Score the fitted model on the features it was trained on, for comparison
# with the test accuracy
y_pred1 = model.predict(X_train_features)
score = accuracy_score(y_true=Y_train, y_pred=y_pred1)
print("Accuracy on train data: {:.2f}%".format(100 * score))

Accuracy on train data: 96.81%


In [42]:
# Predictive system: classify a single unseen message
input_mail = ["England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ãº1.20 POBOXox36504W45WQ 16+"]

# Reuse the vectoriser fitted on the training data so the feature columns
# line up with what the model was trained on.
input_data_feature = feature_extraction.transform(input_mail)
predict = model.predict(input_data_feature)

# Label 1 was assigned to 'ham' and 0 to 'spam' when Category was encoded
print('Not spam' if predict[0] == 1 else 'Spam!!!')

Not spam


In [43]:
# Classify a second unseen message with the same fitted vectoriser and model
input_mail = ["URGENT! You have won a 1 week FREE membership in our Â£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"]

input_data_feature = feature_extraction.transform(input_mail)
predict = model.predict(input_data_feature)

# Anything other than label 1 ('ham') is reported as spam
print('Spam!!!' if predict[0] != 1 else 'Not spam')

Spam!!!
