In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline 


spam_data = pd.read_csv('./spam.csv',encoding = "latin-1", engine='python')
spam_data = spam_data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
spam_data.head(5)
spam_data.describe()
spam_data.groupby('Class').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [2]:
cv_model=CountVectorizer()
word_count=cv_model.fit_transform(spam_data.Text)
word_count.shape


(5572, 8672)

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count)

df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv_model.get_feature_names(),columns=["idf_weights"])

df_idf.sort_values(by=['idf_weights'])


Unnamed: 0,idf_weights
to,2.198545
you,2.254829
the,2.689346
in,2.933605
and,2.947347
...,...
bleh,8.932542
mee,8.932542
blimey,8.932542
mirror,8.932542


In [4]:
count_vector=cv_model.transform(spam_data.Text)
tf_idf_vector=tfidf_transformer.transform(count_vector)

feature_names = cv_model.get_feature_names()
i = 0
for x in tf_idf_vector:
    i+= 1
    df = pd.DataFrame(x.T.todense(), index=feature_names, columns=["tfidf"])
    print(df.sort_values(by=["tfidf"],ascending=False))
    if i == 3:
        break



                tfidf
jurong       0.326425
amore        0.326425
buffet       0.311608
bugis        0.275765
cine         0.275765
...               ...
electricity  0.000000
elections    0.000000
election     0.000000
eldest       0.000000
ûówell       0.000000

[8672 rows x 1 columns]
             tfidf
oni       0.546588
joking    0.523646
wif       0.431601
lar       0.408299
ok        0.272120
...            ...
election  0.000000
eldest    0.000000
elaya     0.000000
elama     0.000000
ûówell    0.000000

[8672 rows x 1 columns]
                      tfidf
fa                 0.460253
entry              0.352710
08452810075over18  0.230126
2005               0.222362
21st               0.222362
...                     ...
electricity        0.000000
elections          0.000000
election           0.000000
eldest             0.000000
ûówell             0.000000

[8672 rows x 1 columns]


In [10]:
x_train, x_test, y_train, y_test = train_test_split(spam_data['Text'],spam_data['Class'],test_size=0.2)

model = Pipeline([
    ('bow',CountVectorizer()),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB())
])

model.fit(x_train,y_train)

prediction = model.predict(x_test)

print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       973
        spam       1.00      0.75      0.85       142

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [20]:
from sklearn.linear_model.logistic import LogisticRegression
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train_x = vectorizer.fit_transform(x_train)
classifier = LogisticRegression()
classifier.fit(tfidf_train_x, y_train)



tfidf_test_x = vectorizer.transform(x_test)
print(tfidf_test_x.shape)
scores = cross_val_score(classifier, tfidf_test_x, y_test, cv=5)
acc = scores.mean()
print(acc)

(1115, 7394)
0.8887892376681614


In [32]:
##Verify if the model is proper

text = ['Win $10,000', "This is important"]
output = classifier.predict(vectorizer.transform(text))

for i ,m in enumerate(text):
	print(output[i], ':', m)

spam : Win $10,000
ham : This is important
