In [1]:
import pandas as pd
import numpy as np

In [5]:
# Tab-separated file; column names are supplied explicitly via `names`.
data = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'text'],
)

In [23]:
data.head(10)

Unnamed: 0,label,text,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False
5,spam,FreeMsg Hey there darling it's been 3 week's n...,True
6,ham,Even my brother is not like to speak with me. ...,False
7,ham,As per your request 'Melle Melle (Oru Minnamin...,False
8,spam,WINNER!! As a valued network customer you have...,True
9,spam,Had your mobile 11 months or more? U R entitle...,True


In [13]:
data.groupby(['label']).count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
ham,4825
spam,747


In [41]:
from sklearn.model_selection import train_test_split

# Boolean target: True where the label is 'spam', False otherwise.
# A vectorised comparison replaces the per-row `apply(lambda ...)` and
# produces the same boolean column.
data['is_spam'] = data['label'] == 'spam'

# Raw message strings are the features; the boolean flag is the target.
X = data['text'].values
y = data['is_spam'].values

# Hold out 33% of the messages for evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [42]:
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [44]:
y

array([False, False,  True, ..., False, False, False])

# Baseline

In [86]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer

def create_baseline_pipeline():
    """Build the baseline text-classification pipeline.

    Returns:
        An unfitted ``Pipeline`` with two steps: ``'bow'``, a
        ``CountVectorizer`` constructed with default arguments, followed by
        ``'classifier'``, a ``MultinomialNB`` constructed with default
        arguments.
    """
    vectorizer = CountVectorizer()
    estimator = MultinomialNB()
    return Pipeline([('bow', vectorizer), ('classifier', estimator)])

In [73]:
# `fit` returns the fitted pipeline itself, so construction and training can be chained.
base_model = create_baseline_pipeline().fit(X_train, y_train)
base_model

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [74]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, classification_report

def print_scores(y_pred, y_test):
    """Print precision, accuracy, recall and F1 for a set of predictions.

    Note the argument order: predictions come first and the true labels
    second, while each metric is called as ``metric(y_test, y_pred)``.

    Args:
        y_pred: Predicted labels.
        y_test: True labels.
    """
    # All four metrics are computed before anything is printed.
    results = {
        'recall': recall_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }

    print(f"Precision score = {results['precision']}")
    print(f" Accuracy score = {results['accuracy']}")
    print(f"   Recall score = {results['recall']}")
    print(f"       F1 score = {results['f1']}")

In [79]:
# Predict on the held-out messages.
y_pred = base_model.predict(X_test)

# Per-class precision / recall / F1 in one table. classification_report lists
# classes in sorted label order, so False (ham) comes before True (spam) and
# `target_names` must follow that order.
print(classification_report(y_test, y_pred, digits=4, target_names=['HAM', 'SPAM']))

              precision    recall  f1-score   support

         HAM     0.9925    0.9962    0.9944      1593
        SPAM     0.9750    0.9512    0.9630       246

   micro avg     0.9902    0.9902    0.9902      1839
   macro avg     0.9837    0.9737    0.9787      1839
weighted avg     0.9902    0.9902    0.9902      1839



In [77]:
# Mistakes: list every held-out message whose prediction differs from its true label.
misclassified = (
    (predicted, actual, message)
    for predicted, actual, message in zip(y_pred, y_test, X_test)
    if predicted != actual
)

print("Predict is_spam: \treal is_spam: \tmessage:")
for predicted, actual, message in misclassified:
    print(f"{predicted}, \t{actual}, \t{message}")

Predict is_spam: 	real is_spam: 	message:
False, 	True, 	Oh my god! I've found your number again! I'm so glad, text me back xafter this msgs cst std ntwk chg £1.50
False, 	True, 	Your next amazing xxx PICSFREE1 video will be sent to you enjoy! If one vid is not enough for 2day text back the keyword PICSFREE1 to get the next video.
False, 	True, 	Babe: U want me dont u baby! Im nasty and have a thing 4 filthyguys. Fancy a rude time with a sexy bitch. How about we go slo n hard! Txt XXX SLO(4msgs)
False, 	True, 	Hello darling how are you today? I would love to have a chat, why dont you tell me what you look like and what you are in to sexy?
False, 	True, 	Do you realize that in about 40 years, we'll have thousands of old ladies running around with tattoos?
False, 	True, 	Talk sexy!! Make new friends or fall in love in the worlds most discreet text dating service. Just text VIP to 83110 and see who you could meet.
False, 	True, 	Sorry I missed your call let's talk when you have the time. 

In [83]:
# Small float matrix for the scaling experiments: the rows are 0x, 1x and 2x [1, 2, 3].
xx = np.array([
    [0.0, 0.0, 0.0],
    [1.0, 2.0, 3.0],
    [2.0, 4.0, 6.0],
])
xx

array([[0., 0., 0.],
       [1., 2., 3.],
       [2., 4., 6.]])

In [84]:
# `fit` returns the scaler itself, so it can be created and fitted in one expression.
sc = StandardScaler().fit(xx)
sc

StandardScaler(copy=True, with_mean=True, with_std=True)

In [85]:
sc.transform(xx)

array([[-1.22474487, -1.22474487, -1.22474487],
       [ 0.        ,  0.        ,  0.        ],
       [ 1.22474487,  1.22474487,  1.22474487]])

In [89]:
# Fit and transform the same matrix in a single call.
sc_n = Normalizer()
sc_n.fit_transform(xx)

array([[0.        , 0.        , 0.        ],
       [0.26726124, 0.53452248, 0.80178373],
       [0.26726124, 0.53452248, 0.80178373]])

In [93]:
from sklearn.preprocessing import normalize

normalize(xx, axis=0)

array([[0.        , 0.        , 0.        ],
       [0.4472136 , 0.4472136 , 0.4472136 ],
       [0.89442719, 0.89442719, 0.89442719]])

In [95]:
def normalize_matrix(v, axis=None):
    """Scale ``v`` to unit Euclidean (L2) norm.

    With the default ``axis=None`` the whole array is divided by a single
    norm computed over all of its elements (the Frobenius norm for a 2-D
    input). With an integer ``axis`` every 1-D slice along that axis is
    normalised independently: ``axis=1`` gives each row of a matrix unit
    length, ``axis=0`` does the same for each column.

    Args:
        v: Array-like of numbers.
        axis: ``None`` to use one global norm, or an int selecting the axis
            along which the norms are computed.

    Returns:
        A new ``numpy.ndarray`` with the same shape as ``v``. The result comes
        from true division, so integer input yields floats. Slices whose norm
        is zero are returned as zeros rather than being divided by zero, and
        the input object itself is never returned.
    """
    arr = np.asarray(v)
    # keepdims=True keeps the reduced axes as size-1 dimensions so the norms
    # broadcast against `arr` for any `axis`, including None.
    norms = np.linalg.norm(arr, axis=axis, keepdims=True)
    # Swap zero norms for 1 so all-zero slices pass through as zeros.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return arr / safe_norms

In [96]:
normalize_matrix(xx)

array([[0.        , 0.        , 0.        ],
       [0.11952286, 0.23904572, 0.35856858],
       [0.23904572, 0.47809144, 0.71713717]])