# SMS Spam Classification using Bag of Words and Random Forests

### Loading our training data

In [1]:
import numpy as np
import pickle

# Load the pickled training sentences and their labels. The `with` blocks make
# sure each file handle is closed as soon as its contents have been read.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open('./data/train_sentences.pkl', 'rb') as sentences_file:
    train_sentences = pickle.load(sentences_file)
with open('./data/train_labels.pkl', 'rb') as labels_file:
    train_labels = pickle.load(labels_file)

# Peek at the first ten samples and check that both collections have the same length.
print(train_sentences[:10])
print(train_labels[:10])
print(len(train_sentences))
print(len(train_labels))

['Probably not still going over some stuff here', 'I HAVE A DATE ON SUNDAY WITH WILL', 'Thanks 4 your continued support Your question this week will enter u in2 our draw 4 Â£100 cash Name the NEW US President txt ans to 80082', 'Dear 0776xxxxxxx Uve been invited to XCHAT This is our final attempt to contact u Txt CHAT to 86688 150pMsgrcvdHGSuite3422LandsRowW1J6HL LDN 18yrs', 'I sent my scores to sophas and i had to do secondary application for a few schools I think if you are thinking of applying do a research on cost also Contact joke ogunrinde her school is one me the less expensive ones', 'Kothi print out marandratha', 'Arun can u transfr me d amt', 'I asked you to call him now ok', 'Ringtone Club Gr8 new polys direct to your mobile every week', 'Hello Just got here st andrewsboy its a long way Its cold I will keep you posted']
[0 0 1 1 0 0 0 0 1 0]
3734
3734


### Preprocessing our data

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words vocabulary from the training sentences.
# fit() returns the fitted vectorizer, so construction and fitting can be chained.
vectorizer = CountVectorizer().fit(train_sentences)

# Encode every sentence as a row of token counts over that vocabulary.
train_data = vectorizer.transform(train_sentences)
print(train_data.shape)

(3734, 7701)


### Training our model

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Random forests rely on random bootstrap samples and feature subsets, so the
# seed is fixed to make a fresh "Restart & Run All" rebuild the same model.
model = RandomForestClassifier(random_state=42)
model.fit(train_data, train_labels)

RandomForestClassifier()

### Evaluating our model on the training set

In [4]:
# Accuracy on the training set: the fraction of samples whose predicted label
# equals the true label.
train_predictions = model.predict(train_data)
train_accuracy = np.mean(train_predictions == train_labels)
print(train_accuracy)

1.0


### Splitting between train and validation

In [5]:
# Keep the first 85% of the samples for training and hold out the rest for
# validation. The split is positional (no shuffling).
# Note: this cell rebinds train_sentences / train_labels to the shortened
# training portion, so running it again without reloading the data splits the
# already-reduced set a second time.
split_percentage = .85
split_threshold = int(split_percentage * len(train_labels))

train_sentences, val_sentences = (
    train_sentences[:split_threshold],
    train_sentences[split_threshold:],
)
train_labels, val_labels = (
    train_labels[:split_threshold],
    train_labels[split_threshold:],
)

### Redoing the preprocessing using only the vocabulary of the training samples

In [6]:
# Fit a fresh vectorizer on train_sentences only, so the vocabulary is learned
# without looking at the validation sentences.
vectorizer = CountVectorizer().fit(train_sentences)

# Encode both splits with that single training-derived vocabulary.
train_data, val_data = (
    vectorizer.transform(sentences)
    for sentences in (train_sentences, val_sentences)
)

### Retraining our model and validating it

In [7]:
# Retrain on the reduced training split. The seed is fixed so the reported
# accuracies can be reproduced on a fresh run instead of drifting from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(train_data, train_labels)

# Accuracy on the data the model was fitted on, then on the held-out validation split.
print(np.mean(model.predict(train_data) == train_labels))
print(np.mean(model.predict(val_data) == val_labels))

1.0
0.9518716577540107


### Submitting our model: final evaluation on the test set

In [8]:
# Load the pickled test sentences and labels. The `with` blocks make sure each
# file handle is closed as soon as its contents have been read.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open('./data/test_sentences.pkl', 'rb') as sentences_file:
    test_sentences = pickle.load(sentences_file)
with open('./data/test_labels.pkl', 'rb') as labels_file:
    test_labels = pickle.load(labels_file)

# Encode the test sentences with the already-fitted vectorizer so the feature
# columns line up with the ones the model was trained on.
test_data = vectorizer.transform(test_sentences)

# Test accuracy: the fraction of test samples predicted correctly.
print(np.mean(model.predict(test_data) == test_labels))

0.9690217391304348
