In [8]:
###################---LIBRARIES---###################

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score


In [24]:
###################---FUNCTIONS---###################

def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [25]:
###################---IMPORTING AND SPLITTING DATA---###################

spam_data = pd.read_csv('spam.csv')

spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

X_train, X_test, y_train, y_test = train_test_split(spam_data['text'],spam_data['target'],random_state=0)



In [28]:
###################---GET TO KNOW DATA---###################

print('\033[1mSOME INFORMATION ABOUT DATA:\033[0m\n')

###---Percentage of spam texts---### 
num_spam=spam_data['target'].value_counts()[1]
num_items=len(spam_data['target'])
print('{:3.2f}% of the texts are SPAM texts.\n'.format(num_spam/num_items*100))


###---The longest word---### 
vect = CountVectorizer().fit(X_train)
vect_feat=vect.get_feature_names()
len_tok = [(w,len(w)) for w in vect_feat ]
sort_tok=sorted(len_tok,reverse=True, key=lambda x: x[1])
print('The longest word in the whole data is',sort_tok[0][0],'\n')


###---20 largest and 20 smallest tfidfs---###  
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
    
feature_names = np.array(vect.get_feature_names())
sorted_tfidf = X_train_vectorized.max(0).toarray()[0]
    
sorted_tfidf_index = sorted_tfidf.argsort()
    
smallest_df=pd.DataFrame(sorted_tfidf[sorted_tfidf_index[:20]])
smallest_df.set_index(feature_names[sorted_tfidf_index[:20]],inplace=True)
    
largest_df=pd.DataFrame(sorted_tfidf[sorted_tfidf_index[:-21:-1]])
largest_df.set_index(feature_names[sorted_tfidf_index[:-21:-1]],inplace=True)

print('The 20 words with SMALLEST tfidf Coefs:\n{}\n'.format(smallest_df.iloc[:,0]))
print('The 20 words with LARGEST tfidf Coefs: \n{}\n\n'.format(largest_df.iloc[:,0]))


###---Average number of words and digits in spam and non-spam texts---###
spam_data['char. count']= [len(sent) for sent in spam_data['text']]
spam_data['digit count']= [sum(c.isdigit() for c in s) for s in spam_data['text']]

spam=spam_data[spam_data['target']==1]
non_spam=spam_data[spam_data['target']==0]
# ------
mean_spam=np.mean(spam['char. count'])
mean_non_spam=np.mean(non_spam['char. count'])

print('''The mean nbr of words in NON-spam texts is {0:3.2f}.
The mean nbr of words in spam texts is {1:3.2f}.\n'''.format(mean_non_spam,mean_spam))
# ------
mean_dspam=np.mean(spam['digit count'])
mean_dnon_spam=np.mean(non_spam['digit count'])
    
print('''The mean nbr of digits in NON-spam texts is {0:3.2f}.
The mean nbr of digits in spam texts is {1:3.2f}.\n'''.format(mean_dnon_spam,mean_dspam))


###---Average number of non-word characters in spam and non-spam texts---### 
mean_Nnon_spam = non_spam.apply(lambda x:len(re.findall(r'\W',x[0])), axis=1).mean()
mean_Nspam = spam.apply(lambda x:len(re.findall(r'\W',x[0])), axis=1).mean()

print('''The mean nbr of non-word characters in NON-spam texts is {0:3.2f}.
The mean nbr of non-word characters in spam texts is {1:3.2f}.'''.format(mean_Nnon_spam,mean_Nspam))


[1mSOME INFORMATION ABOUT DATA:[0m

13.41% of the texts are SPAM texts.

The longest word in the whole data is com1win150ppmx3age16subscription 

The 20 words with SMALLEST tfidf Coefs:
sympathetic     0.074475
healer          0.074475
aaniye          0.074475
dependable      0.074475
companion       0.074475
listener        0.074475
athletic        0.074475
exterminator    0.074475
psychiatrist    0.074475
pest            0.074475
determined      0.074475
chef            0.074475
courageous      0.074475
stylist         0.074475
psychologist    0.074475
organizer       0.074475
pudunga         0.074475
venaam          0.074475
diwali          0.091250
mornings        0.091250
Name: 0, dtype: float64

The 20 words with LARGEST tfidf Coefs: 
146tf150p    1.000000
havent       1.000000
home         1.000000
okie         1.000000
thanx        1.000000
er           1.000000
anything     1.000000
lei          1.000000
nite         1.000000
yup          1.000000
thank        1.000000
ok   

In [33]:
###################---MODEL TESTING---###################
print('\033[1mTESTING 4 COMBINATIONS OF VECTORIZERS AND ALGORITHMS:\033[0m\n')

###---CountVectorizer and MultinomialNB to predict ytest---### 
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
NB=MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
predictions = NB.predict(vect.transform(X_test))

print('The AUC score using CountVectorizer and Multinomial Naive Bayes is {:3.2f}.\n'
      .format(roc_auc_score(y_test, predictions)))


###---TfidfVectorizer and MultinomialNB to predict ytest---### 
vect = TfidfVectorizer(min_df=3).fit(X_train)
X_train_vectorized = vect.transform(X_train)  
NB=MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
predictions = NB.predict(vect.transform(X_test))

print('The AUC score using TfidfVectorizer and Multinomial Naive Bayes is {:3.2f}.\n'
      .format(roc_auc_score(y_test, predictions)))


###---TfidfVectorizer,adding features and SVM to predict ytest---### 
vect = TfidfVectorizer(min_df=5).fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized=vect.transform(X_test)
x_len1 = X_train.apply(len)
X_train_aug = add_feature(X_train_vectorized, x_len1)
    
x_len2 = X_test.apply(len)
X_test_aug = add_feature(X_test_vectorized, x_len2)
    
SVC_model=SVC(C=10000,gamma='auto').fit(X_train_aug,y_train)
predictions = SVC_model.predict(X_test_aug)

print('The AUC score using TfidfVectorizer and SVM and adding ONE feature is {:3.2f}.\n'
      .format(roc_auc_score(y_test, predictions)))


###---TfidfVectorizer,adding features and LogisticRegression to predict ytest---### 
vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

x_len = X_train.apply(len)
X_train_aug = add_feature(X_train_vectorized, x_len)
x_digit = X_train.apply(lambda x: len(re.sub('\D','', x)))
X_train_aug2 = add_feature(X_train_aug, x_digit)

x_len2 = X_test.apply(len)
X_test_aug = add_feature(X_test_vectorized, x_len2)
x_digit2 = X_test.apply(lambda x: len(re.sub('\D','', x)))
X_test_aug2 = add_feature(X_test_aug, x_digit2)

model = LogisticRegression(C=100,solver='liblinear').fit(X_train_aug2, y_train)
predictions = model.predict(X_test_aug2)

print('The AUC score using TfidfVectorizer and Logistic Regression and adding TWO feature is {:3.2f}.\n'
      .format(roc_auc_score(y_test, predictions)))

[1mTESTING 4 COMBINATIONS OF VECTORIZERS AND ALGORITHMS:[0m

The AUC score using CountVectorizer and Multinomial Naive Bayes is 0.97.

The AUC score using TfidfVectorizer and Multinomial Naive Bayes is 0.94.

The AUC score using TfidfVectorizer and SVM and adding ONE feature is 0.96.

The AUC score using TfidfVectorizer and Logistic Regression and adding TWO feature is 0.97.



In [37]:
###################---TESTING VECTORIZING BEFORE AND AFTER SPLITTING---###################

print('\033[1mTESTING THE EFFECT OF VECTORIZING BEFORE AND AFTER SPLITTING DATA:\n\n')

###---CountVectorize betfore splitting data---###  
print('--Vectorizing BEFORE splitting--\033[0m')

X=spam_data['text']
y=spam_data['target']

#-Vectorizing first
vect = CountVectorizer(min_df=5,analyzer='char_wb',ngram_range=(2,5)).fit(X)
X_vectorized = vect.transform(X)

#-Adding features
x_len = X.apply(len)
X_aug = add_feature(X_vectorized, x_len)
x_digit = X.apply(lambda x: len(re.sub('\D','', x)))
X_aug2 = add_feature(X_aug, x_digit)
x_nonw = X.apply(lambda x: len(re.findall(r'\W',x[0])))
X_aug3 = add_feature(X_aug2, x_nonw)

#-Then splitting data
X_train, X_test, y_train, y_test = train_test_split(X_aug3,y,random_state=0)

#-Training model
model = LogisticRegression(C=100,solver='liblinear').fit(X_train, y_train)
    
features=np.array(vect.get_feature_names())
New_features=np.append(features,['length_of_doc', 'digit_count', 'non_word_char_count'])
    
predictions = model.predict(X_test)
AUC_Score=roc_auc_score(y_test, predictions)
sorted_coef_index=model.coef_[0].argsort()
coeff_min=list(New_features[sorted_coef_index[:10]])
coeff_max=list(New_features[np.sort(sorted_coef_index[-10:])[::-1]])

print('The AUC score using CountVectorizer before slitting Data {:3.2f}.\n'.format(AUC_Score))
print('The 10 words with Smallest features coefficients are:\n{}.\n'.format(coeff_min))
print('The 10 words with Largest features coefficients are:\n{}.\n\n'.format(coeff_max))



###---CountVectorize After splitting data---###  
print('\033[1m--Vectorizing AFTER splitting--\n\033[0m')

#-Splitting data first
X_train, X_test, y_train, y_test = train_test_split(spam_data['text'],spam_data['target'], 
                                                    random_state=0)

#-Then vectorizing
vect = CountVectorizer(min_df=5,analyzer='char_wb',ngram_range=(2,5)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

#-Adding features   
x_len = X_train.apply(len)
X_train_aug = add_feature(X_train_vectorized, x_len)
x_digit = X_train.apply(lambda x: len(re.sub('\D','', x)))
X_train_aug2 = add_feature(X_train_aug, x_digit)
x_nonw = X_train.apply(lambda x: len(re.findall(r'\W',x[0])))
X_train_aug3 = add_feature(X_train_aug2, x_nonw)
    
x_len2 = X_test.apply(len)
X_test_aug = add_feature(X_test_vectorized, x_len2)
x_digit2 = X_test.apply(lambda x: len(re.sub('\D','', x)))
X_test_aug2 = add_feature(X_test_aug, x_digit2)
x_nonw2 = X_test.apply(lambda x: len(re.findall(r'\W',x[0])))
X_test_aug3 = add_feature(X_test_aug2, x_nonw2)

#-Training model
model = LogisticRegression(C=100,solver='liblinear').fit(X_train_aug3, y_train)
    
features=np.array(vect.get_feature_names())
New_features=np.append(features,['length_of_doc', 'digit_count', 'non_word_char_count'])
    
predictions = model.predict(X_test_aug3)
    
AUC_Score=roc_auc_score(y_test, predictions)
sorted_coef_index=model.coef_[0].argsort()
coeff_min=list(New_features[sorted_coef_index[:10]])
coeff_max=list(New_features[np.sort(sorted_coef_index[-10:])[::-1]])

print('The AUC score using CountVectorizer before slitting Data {:3.2f}.\n'.format(AUC_Score))
print('The 10 words with Smallest features coefficients are:\n{}.\n'.format(coeff_min))
print('The 10 words with Largest features coefficients are:\n{}.\n'.format(coeff_max))

[1mTESTING THE EFFECT OF VECTORIZING BEFORE AND AFTER SPLITTING DATA:


--Vectorizing BEFORE splitting--[0m
The AUC score using CountVectorizer before slitting Data 0.98.

The 10 words with Smallest features coefficients are:
['. ', '..', '? ', ' i', ' y', ' go', ':)', 'he', ' h', ' m'].

The 10 words with Largest features coefficients are:
['digit_count', 'xt', 'ww', 'ne', 'mob', 'ia', 'co', 'ar', ' x', ' ch'].


[1m--Vectorizing AFTER splitting--
[0m
The AUC score using CountVectorizer before slitting Data 0.98.

The 10 words with Smallest features coefficients are:
['. ', '..', '? ', ' i', ' y', ' go', ':)', ' h', 'he', 'go'].

The 10 words with Largest features coefficients are:
['digit_count', 'xt', 'ww', 'ne', 'mob', 'ia', 'co', 'ar', ' x', ' ch'].

