In [29]:
import pandas as pd
messages = pd.read_csv('SMSSpamCollection.txt',sep='\t',names = ['labels','message'])

In [30]:
messages.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data cleaning and preprocessing

In [31]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Welcome\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
corpus = []
for i in range (0,len(messages)):
    review = re.sub('[^a-zA-z]',' ',messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [34]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

### Create Bag of Words model

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500) # for Binary BOW enable binary = True

In [36]:
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5572, 2500))

#### N-grams

In [37]:
cv.vocabulary_ ## Gives the top most frequent words with their index values

{'go': np.int64(850),
 'point': np.int64(1589),
 'crazy': np.int64(473),
 'available': np.int64(142),
 'bugis': np.int64(284),
 'great': np.int64(874),
 'world': np.int64(2434),
 'la': np.int64(1118),
 'cine': np.int64(388),
 'got': np.int64(865),
 'wat': np.int64(2356),
 'ok': np.int64(1468),
 'lar': np.int64(1126),
 'joking': np.int64(1080),
 'wif': np.int64(2397),
 'oni': np.int64(1476),
 'free': np.int64(784),
 'entry': np.int64(644),
 'wkly': np.int64(2420),
 'comp': np.int64(428),
 'win': np.int64(2401),
 'fa': np.int64(689),
 'cup': np.int64(486),
 'final': np.int64(734),
 'tkts': np.int64(2183),
 'st': np.int64(1996),
 'may': np.int64(1289),
 'text': np.int64(2136),
 'receive': np.int64(1712),
 'question': np.int64(1665),
 'std': np.int64(2011),
 'txt': np.int64(2245),
 'rate': np.int64(1688),
 'apply': np.int64(98),
 'dun': np.int64(603),
 'say': np.int64(1813),
 'early': np.int64(608),
 'already': np.int64(65),
 'nah': np.int64(1402),
 'think': np.int64(2154),
 'usf': np.int6

In [None]:
cv = CountVectorizer(max_features=2500,ngram_range=(3,3)) # BOW with n_gram
## Play with each feature , max_features , n grams ((1,1) then (1,2) then (1,3), then ...) until we get desired accuracy

X = cv.fit_transform(corpus).toarray()
X
cv.vocabulary_

{'free entry wkly': np.int64(752),
 'entry wkly comp': np.int64(609),
 'wkly comp win': np.int64(2386),
 'comp win fa': np.int64(293),
 'win fa cup': np.int64(2334),
 'fa cup final': np.int64(658),
 'cup final tkts': np.int64(378),
 'final tkts st': np.int64(694),
 'st may text': np.int64(1859),
 'fa receive entry': np.int64(659),
 'entry question std': np.int64(604),
 'std txt rate': np.int64(1869),
 'txt rate apply': np.int64(2030),
 'per request melle': np.int64(1552),
 'request melle melle': np.int64(1701),
 'melle melle oru': np.int64(1323),
 'melle oru minnaminunginte': np.int64(1324),
 'oru minnaminunginte nurungu': np.int64(1505),
 'minnaminunginte nurungu vettam': np.int64(1342),
 'nurungu vettam set': np.int64(1445),
 'vettam set callertune': np.int64(2182),
 'set callertune caller': np.int64(1756),
 'callertune caller press': np.int64(127),
 'caller press copy': np.int64(124),
 'press copy friend': np.int64(1645),
 'copy friend callertune': np.int64(340),
 'winner valued net