In [1]:
%tensorflow_version 2.x
import tensorflow as tf
#from tf.keras.models import Sequential
#from tf.keras.layers import Dense
import os
import io

tf.__version__

'2.3.0'

# Download Data

In [2]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                  origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)

# Unzip the file into a folder
!unzip $path_to_zip -d data

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Archive:  /root/.keras/datasets/smsspamcollection.zip
  inflating: data/SMSSpamCollection  
  inflating: data/readme             


In [3]:
# optional step - helps if colab gets disconnected
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Test data reading
# lines = io.open('/content/drive/My Drive/colab-data/SMSSpamCollection').read().strip().split('\n')
lines = io.open('/content/data/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

# Pre-Process Data

In [5]:
spam_dataset = []
count = 0
for line in lines:
  label, text = line.split('\t')
  if label.lower().strip() == 'spam':
    spam_dataset.append((1, text.strip()))
    count += 1
  else:
    spam_dataset.append(((0, text.strip())))

print(spam_dataset[0])
print("Spam: ", count)

(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')
Spam:  747


# Data Normalization

In [6]:
import pandas as pd 

In [7]:
df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

In [8]:
import re

# Normalization functions

def message_length(x):
  # returns total number of characters
  return len(x)

def num_capitals(x):
  _, count = re.subn(r'[A-Z]', '', x) # only works in english
  return count

def num_punctuation(x):
  _, count = re.subn(r'\W', '', x)
  return count



In [9]:
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)

In [10]:
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [11]:
train=df.sample(frac=0.8,random_state=42) #random state is a seed value
test=df.drop(train.index)

In [12]:
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439
std,0.339359,11.405424,14.602023,59.346407
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,35.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [13]:
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157
std,0.346116,12.731059,15.694599,61.807655
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,28.0,123.0
max,1.0,127.0,195.0,790.0


# Model Building

In [14]:
# Basic 1-layer neural network model for evaluation
def make_model(input_dims=3, num_units=12):
  model = tf.keras.Sequential()

  # Adds a densely-connected layer with 12 units to the model:
  model.add(tf.keras.layers.Dense(num_units, 
                                  input_dim=input_dims, 
                                  activation='relu'))

  # Add a sigmoid layer with a binary output unit:
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', 
                metrics=['accuracy'])
  return model

In [15]:
x_train = train[['Length', 'Punctuation', 'Capitals']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals']]
y_test = test[['Spam']]

In [16]:
x_train

Unnamed: 0,Length,Punctuation,Capitals
3690,25,4,1
3527,161,48,107
724,40,7,1
3370,69,17,3
468,37,8,1
...,...,...,...
3280,444,114,44
3186,65,14,50
3953,81,23,2
2768,38,8,2


In [17]:
model = make_model()

In [18]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f38f97857f0>

In [19]:
model.evaluate(x_test, y_test)



[0.26615792512893677, 0.8914798498153687]

In [20]:
y_train_pred = model.predict_classes(x_train)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [21]:
# confusion matrix
tf.math.confusion_matrix(tf.constant(y_train.Spam), 
                         y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3723,  144],
       [ 290,  302]], dtype=int32)>

In [22]:
sum(y_train_pred)

array([446], dtype=int32)

In [23]:
y_test_pred = model.predict_classes(x_test)
tf.math.confusion_matrix(tf.constant(y_test.Spam), y_test_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[923,  37],
       [ 84,  71]], dtype=int32)>

# Tokenization and Stop Word Removal

In [24]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world']

In [28]:
!pip install stanza  # StanfordNLP has become https://github.com/stanfordnlp/stanza/

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |█▍                              | 10kB 9.1MB/s eta 0:00:01[K     |██▉                             | 20kB 1.7MB/s eta 0:00:01[K     |████▎                           | 30kB 2.2MB/s eta 0:00:01[K     |█████▊                          | 40kB 2.5MB/s eta 0:00:01[K     |███████▏                        | 51kB 2.0MB/s eta 0:00:01[K     |████████▋                       | 61kB 2.3MB/s eta 0:00:01[K     |██████████                      | 71kB 2.5MB/s eta 0:00:01[K     |███████████▌                    | 81kB 2.7MB/s eta 0:00:01[K     |█████████████                   | 92kB 2.9MB/s eta 0:00:01[K     |██████████████▍                 | 102kB 2.8MB/s eta 0:00:01[K     |███████████████▉                | 112kB 2.8MB/s eta 0:00:01[K     |█████████████████▎              | 122kB 2.8MB/s eta 0:00:

In [29]:
import stanza

In [30]:
en = stanza.download('en') 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 8.26MB/s]                    
2020-10-14 04:13:38 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/default.zip: 100%|██████████| 428M/428M [06:14<00:00, 1.14MB/s]
2020-10-14 04:20:01 INFO: Finished downloading models and saved to /root/stanza_resources.


In [31]:
en = stanza.Pipeline(lang='en')

2020-10-14 04:20:02 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-10-14 04:20:02 INFO: Use device: gpu
2020-10-14 04:20:02 INFO: Loading: tokenize
2020-10-14 04:20:12 INFO: Loading: pos
2020-10-14 04:20:13 INFO: Loading: lemma
2020-10-14 04:20:13 INFO: Loading: depparse
2020-10-14 04:20:14 INFO: Loading: sentiment
2020-10-14 04:20:15 INFO: Loading: ner
2020-10-14 04:20:16 INFO: Done loading processors!


In [32]:
sentence

'Go until jurong point, crazy.. Available only in bugis n great world'

In [33]:
tokenized = en(sentence)

In [41]:
len(tokenized.sentences)

2

In [35]:
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

Go
until
jurong
point
,
crazy
..
<End of Sentence>
Available
only
in
bugis
n
great
world
<End of Sentence>


## Dependency Parsing Example

In [36]:
en2 = stanza.Pipeline(lang='en')
pr2 = en2("Hari went to school")
for snt in pr2.sentences:
  for word in snt.tokens:
    print(word)
  print("<End of Sentence>")

2020-10-14 04:20:48 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-10-14 04:20:48 INFO: Use device: gpu
2020-10-14 04:20:48 INFO: Loading: tokenize
2020-10-14 04:20:48 INFO: Loading: pos
2020-10-14 04:20:49 INFO: Loading: lemma
2020-10-14 04:20:49 INFO: Loading: depparse
2020-10-14 04:20:50 INFO: Loading: sentiment
2020-10-14 04:20:51 INFO: Loading: ner
2020-10-14 04:20:52 INFO: Done loading processors!


[
  {
    "id": 1,
    "text": "Hari",
    "lemma": "Hari",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 2,
    "deprel": "nsubj",
    "misc": "start_char=0|end_char=4",
    "ner": "S-PERSON"
  }
]
[
  {
    "id": 2,
    "text": "went",
    "lemma": "go",
    "upos": "VERB",
    "xpos": "VBD",
    "feats": "Mood=Ind|Tense=Past|VerbForm=Fin",
    "head": 0,
    "deprel": "root",
    "misc": "start_char=5|end_char=9",
    "ner": "O"
  }
]
[
  {
    "id": 3,
    "text": "to",
    "lemma": "to",
    "upos": "ADP",
    "xpos": "IN",
    "head": 4,
    "deprel": "case",
    "misc": "start_char=10|end_char=12",
    "ner": "O"
  }
]
[
  {
    "id": 4,
    "text": "school",
    "lemma": "school",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "head": 2,
    "deprel": "obl",
    "misc": "start_char=13|end_char=19",
    "ner": "O"
  }
]
<End of Sentence>


## Japanese Tokenization Example

In [37]:
jp = stanza.download('ja') 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 10.2MB/s]                    
2020-10-14 04:21:10 INFO: Downloading default packages for language: ja (Japanese)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/ja/default.zip: 100%|██████████| 220M/220M [05:35<00:00, 656kB/s] 
2020-10-14 04:26:50 INFO: Finished downloading models and saved to /root/stanza_resources.


In [38]:
jp = stanza.Pipeline(lang='ja')

2020-10-14 04:26:50 INFO: Loading these models for language: ja (Japanese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2020-10-14 04:26:50 INFO: Use device: gpu
2020-10-14 04:26:50 INFO: Loading: tokenize
2020-10-14 04:26:50 INFO: Loading: pos
2020-10-14 04:26:51 INFO: Loading: lemma
2020-10-14 04:26:51 INFO: Loading: depparse
2020-10-14 04:26:52 INFO: Done loading processors!


In [39]:
jp_line = jp("選挙管理委員会")

In [40]:
for snt in jp_line.sentences:
  for word in snt.tokens:
    print(word.text)

選挙
管理
委員会


# Adding Word Count Feature 

In [42]:
def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = sum( [ len(sentence.tokens) for sentence in doc.sentences] )
  return count


In [43]:
#en = snlp.Pipeline(lang='en', processors='tokenize')
df['Words'] = df['Message'].apply(word_counts)

In [44]:
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,5574.0,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488,19.03319
std,0.340699,11.683233,14.825994,59.841746,13.96163
min,0.0,0.0,0.0,2.0,1.0
25%,0.0,1.0,8.0,36.0,9.0
50%,0.0,2.0,15.0,61.0,15.0
75%,0.0,4.0,27.0,122.0,28.0
max,1.0,129.0,253.0,910.0,209.0


In [45]:
#train=df.sample(frac=0.8,random_state=42) #random state is a seed value
#test=df.drop(train.index)

train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)


In [46]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)


In [47]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3856f2fc50>

In [48]:
model.evaluate(x_test, y_test)



[0.264749675989151, 0.8968609571456909]

## Stop Word Removal

In [49]:
!pip install stopwordsiso

Collecting stopwordsiso
[?25l  Downloading https://files.pythonhosted.org/packages/3e/03/4c5f24b654bb9459f81aa5c1b60b094b804286b99dca9f2e116c9eb01ac8/stopwordsiso-0.6.1-py3-none-any.whl (73kB)
[K     |████▌                           | 10kB 19.4MB/s eta 0:00:01[K     |█████████                       | 20kB 1.7MB/s eta 0:00:01[K     |█████████████▍                  | 30kB 2.1MB/s eta 0:00:01[K     |█████████████████▉              | 40kB 2.4MB/s eta 0:00:01[K     |██████████████████████▎         | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████████████▊     | 61kB 2.3MB/s eta 0:00:01[K     |███████████████████████████████▏| 71kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.3MB/s 
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1


In [50]:
import stopwordsiso as stopwords

stopwords.langs()

{'af',
 'ar',
 'bg',
 'bn',
 'br',
 'ca',
 'cs',
 'da',
 'de',
 'el',
 'en',
 'eo',
 'es',
 'et',
 'eu',
 'fa',
 'fi',
 'fr',
 'ga',
 'gl',
 'gu',
 'ha',
 'he',
 'hi',
 'hr',
 'hu',
 'hy',
 'id',
 'it',
 'ja',
 'ko',
 'ku',
 'la',
 'lt',
 'lv',
 'mr',
 'ms',
 'nl',
 'no',
 'pl',
 'pt',
 'ro',
 'ru',
 'sk',
 'sl',
 'so',
 'st',
 'sv',
 'sw',
 'th',
 'tl',
 'tr',
 'uk',
 'ur',
 'vi',
 'yo',
 'zh',
 'zu'}

In [51]:
sorted(stopwords.stopwords('en'))

["'ll",
 "'tis",
 "'twas",
 "'ve",
 '10',
 '39',
 'a',
 "a's",
 'able',
 'ableabout',
 'about',
 'above',
 'abroad',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'ad',
 'added',
 'adj',
 'adopted',
 'ae',
 'af',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'ag',
 'again',
 'against',
 'ago',
 'ah',
 'ahead',
 'ai',
 "ain't",
 'aint',
 'al',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amid',
 'amidst',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'ao',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'aq',
 'ar',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'arpa',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks',
 'associated

In [52]:
en_sw = stopwords.stopwords('en')

def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          count += 1
  return count

In [53]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

In [54]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)
#model = make_model(input_dims=3)

model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3832aa7d30>

## POS Based Features

In [55]:
en = stanza.Pipeline(lang='en')

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

2020-10-14 04:51:48 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-10-14 04:51:48 INFO: Use device: gpu
2020-10-14 04:51:48 INFO: Loading: tokenize
2020-10-14 04:51:48 INFO: Loading: pos
2020-10-14 04:51:49 INFO: Loading: lemma
2020-10-14 04:51:49 INFO: Loading: depparse
2020-10-14 04:51:50 INFO: Loading: sentiment
2020-10-14 04:51:51 INFO: Loading: ner
2020-10-14 04:51:51 INFO: Done loading processors!


In [56]:
def print_pos(doc):
    text = ""
    for sentence in doc.sentences:
        for token in sentence.tokens:
            text += token.words[0].text + "/" + \
                    token.words[0].upos + " "
        text += "\n"
    return text

In [57]:
print(print_pos(pos))

Yo/PRON you/PRON around/ADV ?/PUNCT 
A/DET friend/NOUN of/ADP mine/PRON 's/PART lookin/NOUN ./PUNCT 



In [58]:
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw and \
        token.words[0].upos not in ['PUNCT', 'SYM']:
          count += 1
  return count

In [59]:
print(word_counts(txt), word_counts_v3(txt))

6 4


In [60]:
train['Test'] = 0
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.326979,0.0
std,0.339359,11.405424,14.602023,59.346407,8.016488,0.0
min,0.0,0.0,0.0,2.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,0.0
50%,0.0,2.0,15.0,61.0,7.0,0.0
75%,0.0,4.0,27.0,122.0,13.0,0.0
max,1.0,129.0,253.0,910.0,147.0,0.0


In [61]:
def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    totals += len(sentence.tokens)  # (1)
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          if token.words[0].upos not in ['PUNCT', 'SYM']:
            count += 1.
          else:
            non_word += 1.
  non_word = non_word / totals
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

In [62]:
x = train[:10]
x.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,14.4,18.3,72.7,8.6,0.0
std,0.0,32.948445,14.772723,50.36103,10.068653,0.0
min,0.0,1.0,4.0,23.0,2.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0
50%,0.0,1.5,13.0,57.0,4.0,0.0
75%,0.0,9.0,23.75,88.0,10.75,0.0
max,0.0,107.0,48.0,161.0,35.0,0.0


In [63]:
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.326979,0.0,6.535995,0.147763
std,0.339359,11.405424,14.602023,59.346407,8.016488,0.0,5.679984,0.094337
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,0.0,3.0,0.090909
50%,0.0,2.0,15.0,61.0,7.0,0.0,5.0,0.142857
75%,0.0,4.0,27.0,122.0,13.0,0.0,9.0,0.2
max,1.0,129.0,253.0,910.0,147.0,0.0,54.0,0.666667


In [64]:
test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Words_NoPunct,Punct
count,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157,9.623318,6.700448,0.152936
std,0.346116,12.731059,15.694599,61.807655,8.303803,5.887786,0.101909
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,36.0,4.0,3.0,0.096774
50%,0.0,2.0,15.0,61.0,7.0,4.0,0.142857
75%,0.0,4.0,28.0,123.0,14.0,10.0,0.2
max,1.0,127.0,195.0,790.0,83.0,45.0,1.0


In [66]:
z = pd.concat([x, train_tmp], axis=1)
z.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,10.0,10.0,10.0,10.0,10.0,10.0,4459.0,4459.0
mean,0.0,14.4,18.3,72.7,8.6,0.0,6.535995,0.147763
std,0.0,32.948445,14.772723,50.36103,10.068653,0.0,5.679984,0.094337
min,0.0,1.0,4.0,23.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0,3.0,0.090909
50%,0.0,1.5,13.0,57.0,4.0,0.0,5.0,0.142857
75%,0.0,9.0,23.75,88.0,10.75,0.0,9.0,0.2
max,0.0,107.0,48.0,161.0,35.0,0.0,54.0,0.666667


In [67]:
z.loc[z['Spam']==0].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,14.4,18.3,72.7,8.6,0.0,5.5,0.151479
std,0.0,32.948445,14.772723,50.36103,10.068653,0.0,7.412452,0.063396
min,0.0,1.0,4.0,23.0,2.0,0.0,1.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0,2.0,0.130721
50%,0.0,1.5,13.0,57.0,4.0,0.0,2.0,0.166667
75%,0.0,9.0,23.75,88.0,10.75,0.0,6.75,0.2
max,0.0,107.0,48.0,161.0,35.0,0.0,25.0,0.208333


In [68]:
z.loc[z['Spam']==1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,
std,,,,,,,,
min,,,,,,,,
25%,,,,,,,,
50%,,,,,,,,
75%,,,,,,,,
max,,,,,,,,


In [69]:
aa = [word_counts_v3(y) for y in x['Message']]

In [70]:
ab = pd.DataFrame(aa)
ab.describe()

Unnamed: 0,Words_NoPunct,Punct
count,10.0,10.0
mean,5.5,0.151479
std,7.412452,0.063396
min,1.0,0.0
25%,2.0,0.130721
50%,2.0,0.166667
75%,6.75,0.2
max,25.0,0.208333


# Lemmatization

In [71]:

text = "Stemming is aimed at reducing vocabulary and aid un-derstanding of" +\
       " morphological processes. This helps people un-derstand the" +\
       " morphology of words and reduce size of corpus."

lemma = en(text)

In [72]:
lemmas = ""
for sentence in lemma.sentences:
        for token in sentence.tokens:
            lemmas += token.words[0].lemma +"/" + \
                    token.words[0].upos + " "
        lemmas += "\n"

print(lemmas)

stemming/NOUN be/AUX aim/VERB at/SCONJ reduce/VERB vocabulary/NOUN and/CCONJ aid/NOUN un/NOUN -/PUNCT derstanding/NOUN of/ADP morphological/ADJ process/NOUN ./PUNCT 
this/PRON help/VERB people/NOUN un/NOUN -/PUNCT derstand/VERB the/DET morphology/NOUN of/ADP word/NOUN and/CCONJ reduce/VERB size/NOUN of/ADP corpus/NOUN ./PUNCT 



# TF-IDF Based Model


In [73]:
# if not installed already
!pip install sklearn



In [79]:
corpus = [
          "I like fruits. Fruits like bananas",
          "I love bananas but eat an apple",
          "An apple a day keeps the doctor away"
]


## Count Vectorization

In [80]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names()

['an',
 'apple',
 'away',
 'bananas',
 'but',
 'day',
 'doctor',
 'eat',
 'fruits',
 'keeps',
 'like',
 'love',
 'the']

In [81]:
X.toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1]])

In [82]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X.toarray())

array([[1.        , 0.13608276, 0.        ],
       [0.13608276, 1.        , 0.3086067 ],
       [0.        , 0.3086067 , 1.        ]])

In [83]:
query = vectorizer.transform(["apple and bananas"])

cosine_similarity(X, query)

array([[0.23570226],
       [0.57735027],
       [0.26726124]])

## TF-IDF Vectorization

In [84]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

pd.DataFrame(tfidf.toarray(), 
             columns=vectorizer.get_feature_names())

Unnamed: 0,an,apple,away,bananas,but,day,doctor,eat,fruits,keeps,like,love,the
0,0.0,0.0,0.0,0.230408,0.0,0.0,0.0,0.0,0.688081,0.0,0.688081,0.0,0.0
1,0.321267,0.321267,0.0,0.321267,0.479709,0.0,0.0,0.479709,0.0,0.0,0.0,0.479709,0.0
2,0.275785,0.275785,0.411797,0.0,0.0,0.411797,0.411797,0.0,0.0,0.411797,0.0,0.0,0.411797


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf = TfidfVectorizer(binary=True)
X = tfidf.fit_transform(train['Message']).astype('float32')
X_test = tfidf.transform(test['Message']).astype('float32')

In [90]:
X.shape

(4459, 7741)

In [93]:
from keras.utils import np_utils

_, cols = X.shape
model2 = make_model(cols)  # to match tf-idf dimensions
lb = LabelEncoder()
y = lb.fit_transform(y_train)
dummy_y_train = np_utils.to_categorical(y)
model2.fit(X.toarray(), y_train, epochs=10, batch_size=10)

  y = column_or_1d(y, warn=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f37d1fb7828>

In [94]:
model2.evaluate(X_test.toarray(), y_test)



[0.05765564367175102, 0.9838564991950989]

In [95]:
train.loc[train.Spam == 1].describe() 

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0
mean,1.0,15.320946,29.086149,138.856419,18.469595,0.0,14.25,0.138386
std,0.0,11.635105,7.083572,28.07998,6.085607,0.0,4.701046,0.064732
min,1.0,0.0,2.0,13.0,2.0,0.0,2.0,0.0
25%,1.0,7.0,26.0,132.0,14.0,0.0,11.0,0.096774
50%,1.0,14.0,30.0,149.0,19.0,0.0,14.0,0.137931
75%,1.0,21.0,34.0,157.0,23.0,0.0,18.0,0.176471
max,1.0,128.0,49.0,197.0,33.0,0.0,27.0,0.333333


# Word Vectors

In [1]:
# memory limit may be exceeded. Try deleting some objects before running this next section
# or copy this section to a different notebook.
!pip install gensim



In [2]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api


In [3]:
api.info()

{'corpora': {'20-newsgroups': {'checksum': 'c92fd4f6640a86d5ba89eaad818a9891',
   'description': 'The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.',
   'fields': {'data': '',
    'id': 'original id inferred from folder name',
    'set': "marker of original split (possible values 'train' and 'test')",
    'topic': 'name of topic (20 variant of possible values)'},
   'file_name': '20-newsgroups.gz',
   'file_size': 14483581,
   'license': 'not found',
   'num_records': 18846,
   'parts': 1,
   'read_more': ['http://qwone.com/~jason/20Newsgroups/'],
   'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/20-newsgroups/__init__.py',
   'record_format': 'dict'},
  '__testing_matrix-synopsis': {'checksum': '1767ac93a089b43899d54944b07d9dc5',
   'description': '[THIS IS ONLY FOR TESTING] Synopsis of the movie matrix.',
   'file_name': '__testing_matrix-synopsis.gz',
   'parts': 1,
   're

In [4]:
model_w2v = api.load("word2vec-google-news-300")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
model_w2v.most_similar("cookies",topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('cookie', 0.745154082775116),
 ('oatmeal_raisin_cookies', 0.6887780427932739),
 ('oatmeal_cookies', 0.662139892578125),
 ('cookie_dough_ice_cream', 0.6520504951477051),
 ('brownies', 0.6479344964027405),
 ('homemade_cookies', 0.6476464867591858),
 ('gingerbread_cookies', 0.6461867690086365),
 ('Cookies', 0.6341644525527954),
 ('cookies_cupcakes', 0.6275068521499634),
 ('cupcakes', 0.6258294582366943)]

In [6]:
model_w2v.doesnt_match(["USA","Canada","India","Tokyo"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'Tokyo'

In [7]:
king = model_w2v['king']
man = model_w2v['man']
woman = model_w2v['woman']

queen = king - man + woman  
model_w2v.similar_by_vector(queen)

  if np.issubdtype(vec.dtype, np.int):


[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.6454660892486572),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676948547363),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376776456832886),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]