In [0]:
%tensorflow_version 2.x
import tensorflow as tf
#from tf.keras.models import Sequential
#from tf.keras.layers import Dense
import os
import io

tf.__version__

TensorFlow 2.x selected.


'2.1.0'

# Download Data

In [0]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                  origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)

# Unzip the file into a folder
!unzip $path_to_zip -d data

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Archive:  /root/.keras/datasets/smsspamcollection.zip
  inflating: data/SMSSpamCollection  
  inflating: data/readme             


In [0]:
# optional step - helps if colab gets disconnected
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Test data reading
lines = io.open('/content/drive/My Drive/colab-data/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

# Pre-Process Data

In [0]:
spam_dataset = []
count = 0
for line in lines:
  label, text = line.split('\t')
  if label.lower().strip() == 'spam':
    spam_dataset.append((1, text.strip()))
    count += 1
  else:
    spam_dataset.append(((0, text.strip())))

print(spam_dataset[0])
print("Spam: ", count)

(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')
Spam:  747


# Data Normalization

In [0]:
import pandas as pd 

In [0]:
df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

In [0]:
import re

# Normalization functions

def message_length(x):
  # returns total number of characters
  return len(x)

def num_capitals(x):
  _, count = re.subn(r'[A-Z]', '', x) # only works in english
  return count

def num_punctuation(x):
  _, count = re.subn(r'\W', '', x)
  return count



In [0]:
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)

In [0]:
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [0]:
train=df.sample(frac=0.8,random_state=42) #random state is a seed value
test=df.drop(train.index)

In [0]:
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439
std,0.339359,11.405424,14.602023,59.346407
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,35.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [0]:
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,1115.0,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157,19.172197
std,0.346116,12.731059,15.694599,61.807655,14.644223
min,0.0,0.0,0.0,2.0,1.0
25%,0.0,1.0,8.0,36.0,9.0
50%,0.0,2.0,15.0,61.0,15.0
75%,0.0,4.0,28.0,123.0,28.0
max,1.0,127.0,195.0,790.0,196.0


# Model Building

In [0]:
# Basic 1-layer neural network model for evaluation
def make_model(input_dims=3, num_units=12):
  model = tf.keras.Sequential()

  # Adds a densely-connected layer with 12 units to the model:
  model.add(tf.keras.layers.Dense(num_units, 
                                  input_dim=input_dims, 
                                  activation='relu'))

  # Add a sigmoid layer with a binary output unit:
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', 
                metrics=['accuracy'])
  return model

In [0]:
x_train = train[['Length', 'Punctuation', 'Capitals']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals']]
y_test = test[['Spam']]

In [0]:
x_train

Unnamed: 0,Length,Punctuation,Capitals
3690,25,4,1
3527,161,48,107
724,40,7,1
3370,69,17,3
468,37,8,1
...,...,...,...
3280,444,114,44
3186,65,14,50
3953,81,23,2
2768,38,8,2


In [0]:
model = make_model()

In [0]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

Train on 4459 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0dcb47f278>

In [0]:
model.evaluate(x_test, y_test)



[0.24846312586769395, 0.89596415]

In [0]:
y_train_pred = model.predict_classes(x_train)

In [0]:
# confusion matrix
tf.math.confusion_matrix(tf.constant(y_train.Spam), 
                         y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3831,   36],
       [ 367,  225]], dtype=int32)>

In [0]:
sum(y_train_pred)

array([261], dtype=int32)

In [0]:
y_test_pred = model.predict_classes(x_test)
tf.math.confusion_matrix(tf.constant(y_test.Spam), y_test_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[950,  10],
       [106,  49]], dtype=int32)>

# Tokenization and Stop Word Removal

In [0]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world']

In [0]:
!pip install stanfordnlp



In [0]:
import stanfordnlp as snlp

In [0]:
en = snlp.download('en') 

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:15<00:00, 15.0MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [0]:
en = snlp.Pipeline(lang='en')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand

In [0]:
tokenized = en(sentence)



In [0]:
len(tokenized.sentences[0].tokens)

7

In [0]:
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

Go
until
jurong
point
,
crazy
..
<End of Sentence>
Available
only
in
bugis
n
great
world
<End of Sentence>


## Dependency Parsing Example

In [0]:
en2 = snlp.Pipeline(lang='en')
pr2 = en2("Hari went to school")
for snt in pr2.sentences:
  for word in snt.tokens:
    print(word)
  print("<End of Sentence>")

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand



## Japanese Tokenization Example

In [0]:
jp = snlp.download('ja') 

Using the default treebank "ja_gsd" for language "ja".
Would you like to download the models for: ja_gsd now? (Y/n)
y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: ja_gsd
Download location: /root/stanfordnlp_resources/ja_gsd_models.zip


100%|██████████| 219M/219M [01:09<00:00, 3.60MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/ja_gsd_models.zip
Extracting models file for: ja_gsd
Cleaning up...Done.


In [0]:
jp = snlp.Pipeline(lang='ja')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd_tokenizer.pt', 'lang': 'ja', 'shorthand': 'ja_gsd', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd.pretrain.pt', 'lang': 'ja', 'shorthand': 'ja_gsd', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd_lemmatizer.pt', 'lang': 'ja', 'shorthand': 'ja_gsd', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/ja_gsd_models/ja_gsd.pretrain.pt', 'lang': 'ja', 'shorthand

In [0]:
jp_line = jp("選挙管理委員会")



In [0]:
for snt in jp_line.sentences:
  for word in snt.tokens:
    print(word.text)

選挙
管理
委員会


# Adding Word Count Feature 

In [0]:
def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = sum( [ len(sentence.tokens) for sentence in doc.sentences] )
  return count


In [0]:
#en = snlp.Pipeline(lang='en', processors='tokenize')
df['Words'] = df['Message'].apply(word_counts)

In [0]:
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [0]:
#train=df.sample(frac=0.8,random_state=42) #random state is a seed value
#test=df.drop(train.index)

train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)


In [0]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)


In [0]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

Train on 4459 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f70dddae9e8>

In [0]:
model.evaluate(x_test, y_test)



[0.1985733597936117, 0.9336323]

## Stop Word Removal

In [0]:
!pip install stopwordsiso



In [0]:
import stopwordsiso as stopwords

stopwords.langs()

In [0]:
sorted(stopwords.stopwords('en'))

In [0]:
en_sw = stopwords.stopwords('en')

def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          count += 1
  return count

In [0]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

In [0]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)
#model = make_model(input_dims=3)

model.fit(x_train, y_train, epochs=10, batch_size=10)

Train on 4459 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0dbc5ed748>

## POS Based Features

In [0]:
en = snlp.Pipeline(lang='en')

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand



In [0]:
def print_pos(doc):
    text = ""
    for sentence in doc.sentences:
        for token in sentence.tokens:
            text += token.words[0].text + "/" + \
                    token.words[0].upos + " "
        text += "\n"
    return text

In [0]:
print(print_pos(pos))

Yo/PRON you/PRON around/ADV ?/PUNCT 
A/DET friend/NOUN of/ADP mine/PRON 's/PART lookin/NOUN ./PUNCT 



In [0]:
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw and \
        token.words[0].upos not in ['PUNCT', 'SYM']:
          count += 1
  return count

In [0]:
print(word_counts(txt), word_counts_v3(txt))

6 4




In [0]:
train['Test'] = 0
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.312178,0.0
std,0.339359,11.405424,14.602023,59.346407,8.019288,0.0
min,0.0,0.0,0.0,2.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,0.0
50%,0.0,2.0,15.0,61.0,7.0,0.0
75%,0.0,4.0,27.0,122.0,13.0,0.0
max,1.0,129.0,253.0,910.0,147.0,0.0


In [0]:
def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    totals += len(sentence.tokens)  # (1)
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          if token.words[0].upos not in ['PUNCT', 'SYM']:
            count += 1.
          else:
            non_word += 1.
  non_word = non_word / totals
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

In [0]:
x = train[:10]
x.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,14.4,18.3,72.7,8.6,3.0
std,0.0,32.948445,14.772723,50.36103,10.383748,0.0
min,0.0,1.0,4.0,23.0,2.0,3.0
25%,0.0,1.0,7.25,37.75,3.0,3.0
50%,0.0,1.5,13.0,57.0,4.0,3.0
75%,0.0,9.0,23.75,88.0,10.5,3.0
max,0.0,107.0,48.0,161.0,36.0,3.0


In [0]:
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)
train.describe()



In [0]:
test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)
test.describe()

In [0]:
z = pd.concat([x, tmp], axis=1)
z.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.17,6.07,20.2,87.94,10.09,3.0,7.16,0.139329
std,0.377525,12.565188,14.162834,58.070633,8.405259,0.0,6.442786,0.083979
min,0.0,0.0,3.0,17.0,1.0,3.0,0.0,0.0
25%,0.0,1.0,9.0,44.0,4.0,3.0,2.75,0.090909
50%,0.0,2.0,16.0,69.5,7.0,3.0,5.0,0.140873
75%,0.0,5.0,28.0,130.25,13.0,3.0,10.0,0.181818
max,1.0,107.0,76.0,297.0,50.0,3.0,36.0,0.4


In [0]:
z.loc[z['Spam']==0].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,0.0,4.674699,18.253012,75.891566,8.457831,3.0,5.662651,0.143853
std,0.0,12.864309,14.667737,55.824859,8.011122,0.0,5.77268,0.088803
min,0.0,0.0,3.0,17.0,1.0,3.0,0.0,0.0
25%,0.0,1.0,8.0,38.5,3.5,3.0,2.0,0.096875
50%,0.0,2.0,12.0,55.0,6.0,3.0,4.0,0.142857
75%,0.0,3.0,26.5,107.0,12.0,3.0,7.5,0.2
max,0.0,107.0,76.0,297.0,50.0,3.0,36.0,0.4


In [0]:
z.loc[z['Spam']==1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,1.0,12.882353,29.705882,146.764706,18.058824,3.0,14.470588,0.117241
std,0.0,8.328283,4.779706,21.545096,5.189781,0.0,4.199965,0.050962
min,1.0,2.0,17.0,74.0,9.0,3.0,8.0,0.035714
25%,1.0,7.0,27.0,146.0,13.0,3.0,11.0,0.068966
50%,1.0,12.0,30.0,155.0,20.0,3.0,14.0,0.125
75%,1.0,20.0,33.0,157.0,22.0,3.0,17.0,0.15625
max,1.0,31.0,36.0,162.0,26.0,3.0,23.0,0.1875


In [0]:
aa = [word_counts_v3(y) for y in x['Message']]

In [0]:
ab = pd.DataFrame(aa)
ab.describe()

Unnamed: 0,Words_NoPunct,Punct
count,100.0,100.0
mean,7.16,0.139329
std,6.442786,0.083979
min,0.0,0.0
25%,2.75,0.090909
50%,5.0,0.140873
75%,10.0,0.181818
max,36.0,0.4


# Lemmatization

In [0]:

text = "Stemming is aimed at reducing vocabulary and aid un-derstanding of" +\
       " morphological processes. This helps people un-derstand the" +\
       " morphology of words and reduce size of corpus."

lemma = en(text)

In [0]:
lemmas = ""
for sentence in lemma.sentences:
        for token in sentence.tokens:
            lemmas += token.words[0].lemma +"/" + \
                    token.words[0].upos + " "
        lemmas += "\n"

print(lemmas)

# TF-IDF Based Model


In [0]:
# if not installed already
!pip install sklearn

## Count Vectorization

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names()

In [0]:
X.toarray()

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X.toarray())

In [0]:
query = vectorizer.transform(["apple and bananas"])

cosine_similarity(X, query)

## TF-IDF Vectorization

In [0]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

pd.DataFrame(tfidf.toarray(), 
             columns=vectorizer.get_feature_names())

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf = TfidfVectorizer(binary=True)
X = tfidf.fit_transform(train['Message']).astype('float32')
X_test = tfidf.transform(test['Message']).astype('float32')

In [0]:
X_test

<1115x7709 sparse matrix of type '<class 'numpy.float32'>'
	with 13364 stored elements in Compressed Sparse Row format>

In [0]:
from keras.utils import np_utils

model2 = make_model(7709)  # to match tf-idf dimensions
lb = LabelEncoder()
y = lb.fit_transform(y_train)
dummy_y_train = np_utils.to_categorical(y)
model2.fit(X.toarray(), y_train, epochs=10, batch_size=10)

  y = column_or_1d(y, warn=True)


Train on 4459 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff29bba6668>

In [0]:
model2.evaluate(X_test.toarray(), y_test)



[0.0515256610174868, 0.9838565]

In [0]:
train.loc[train.Spam == 1].describe() 

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,592.0,592.0,592.0,592.0,592.0
mean,1.0,15.320946,29.086149,138.856419,29.511824
std,0.0,11.635105,7.083572,28.07998,7.474256
min,1.0,0.0,2.0,13.0,3.0
25%,1.0,7.0,26.0,132.0,26.0
50%,1.0,14.0,30.0,149.0,30.0
75%,1.0,21.0,34.0,157.0,35.0
max,1.0,128.0,49.0,197.0,49.0


# Word Vectors

In [0]:
!pip install gensim

In [0]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
model_w2v = api.load("word2vec-google-news-300")

In [0]:
model_w2v.most_similar("cookies",topn=10)

In [0]:
model_w2v.doesnt_match(["USA","Canada","India","Tokyo"])  

In [0]:
king = model_w2v['king']
man = model_w2v['man']
woman = model_w2v['woman']

queen = king - man + woman  
model_w2v.similar_by_vector(queen)