In [1]:
import pandas as pd

In [2]:
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

df = pd.read_csv(url, sep='\t', names=['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
ps = PorterStemmer()
corpus = []

for i in range(0, len(df)):

	review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
	review = review.lower()
	review = review.split()
	review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
	review = ' '.join(review)
	corpus.append(review)

In [7]:
corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, binary=True)
X = cv.fit_transform(corpus).toarray()	

In [9]:
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
y = pd.get_dummies(df['label'], drop_first=True, dtype='int')
y.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
model = LogisticRegression()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
y_pred = model.predict(X_test)

In [14]:
acc_score = model.score(X_test, y_test)

print(f"Accuracy Score: {acc_score}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy Score: 0.9865470852017937
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.99      0.91      0.95       160

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
[[954   1]
 [ 14 146]]


### Word2Vect Implementation

In [15]:
import gensim
import gensim.downloader as api

In [18]:
wv = api.load('glove-wiki-gigaword-300')

In [79]:
wv_king = wv['king']
wv_king

array([ 0.0033901, -0.34614  ,  0.28144  ,  0.48382  ,  0.59469  ,
        0.012965 ,  0.53982  ,  0.48233  ,  0.21463  , -1.0249   ,
       -0.34788  , -0.79001  , -0.15084  ,  0.61374  ,  0.042811 ,
        0.19323  ,  0.25462  ,  0.32528  ,  0.05698  ,  0.063253 ,
       -0.49439  ,  0.47337  , -0.16761  ,  0.045594 ,  0.30451  ,
       -0.35416  , -0.34583  , -0.20118  ,  0.25511  ,  0.091111 ,
        0.014651 , -0.017541 , -0.23854  ,  0.48215  , -0.9145   ,
       -0.36235  ,  0.34736  ,  0.028639 , -0.027065 , -0.036481 ,
       -0.067391 , -0.23452  , -0.13772  ,  0.33951  ,  0.13415  ,
       -0.1342   ,  0.47856  , -0.1842   ,  0.10705  , -0.45834  ,
       -0.36085  , -0.22595  ,  0.32881  , -0.13643  ,  0.23128  ,
        0.34269  ,  0.42344  ,  0.47057  ,  0.479    ,  0.074639 ,
        0.3344   ,  0.10714  , -0.13289  ,  0.58734  ,  0.38616  ,
       -0.52238  , -0.22028  , -0.072322 ,  0.32269  ,  0.44226  ,
       -0.037382 ,  0.18324  ,  0.058082 ,  0.26938  ,  0.3620

In [16]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review] # lemmatization
    review = ' '.join(review)
    corpus.append(review)

In [17]:
corpus[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [18]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess

In [19]:
words = []

for sent in corpus:
    sent_token = sent_tokenize(sent)
    for word in sent_token:
        words.append(simple_preprocess(word)) # lowercase token, ignore too large and short word
  

In [20]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [21]:
model = gensim.models.Word2Vec(sentences=words)

In [22]:
model.wv.index_to_key # vocabulary

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [23]:
print(model.corpus_count) # total words in corpus
print(model.epochs) # window size

5569
5


In [24]:
model.wv.similar_by_word('happy')

[('day', 0.9990846514701843),
 ('great', 0.9989356398582458),
 ('hey', 0.9988958239555359),
 ('well', 0.9988702535629272),
 ('many', 0.9988437294960022),
 ('wish', 0.9988350868225098),
 ('dear', 0.9988294243812561),
 ('all', 0.9988254308700562),
 ('nice', 0.9988113641738892),
 ('thing', 0.9987800121307373)]

### AVG Word2vec

In [25]:
import numpy as np

In [26]:
def avg_wv(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis=0)

In [None]:
from tqdm import tqdm # it creates a progress bar

In [None]:
X = []

for i in tqdm(range(len(words))): 
    X.append(avg_wv(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:00<00:00, 16347.12it/s]


In [36]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [43]:
print(X[0].shape)
X[0]

(100,)


array([-0.18391152,  0.23180181,  0.12256391,  0.07520144,  0.09520505,
       -0.49501598,  0.17815961,  0.46365026, -0.2504326 , -0.12962274,
       -0.16646124, -0.36546502, -0.05671956,  0.08284429,  0.19456837,
       -0.17300412,  0.10310265, -0.31546947, -0.08730933, -0.51290375,
        0.20021464,  0.08718891,  0.06904654, -0.2047303 , -0.04657201,
       -0.02796936, -0.18880048, -0.19759065, -0.28427643,  0.04063643,
        0.30889264,  0.01605377,  0.10380767, -0.19036251, -0.10148354,
        0.38228852,  0.06855105, -0.11377273, -0.12459046, -0.45392814,
        0.10028882, -0.21765634, -0.18553649, -0.00824959,  0.14978711,
        0.00647108, -0.10125791, -0.02886489,  0.21415435,  0.13741037,
        0.14799315, -0.2018372 , -0.03415288,  0.06064118, -0.04272933,
        0.07599185,  0.1540493 ,  0.00951668, -0.3496187 ,  0.16907373,
        0.01580786,  0.15774168,  0.00718291, -0.10724404, -0.2935957 ,
        0.2516545 ,  0.09547785,  0.21944068, -0.3376528 ,  0.38