In [2]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Binarizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from google.colab import drive
import matplotlib.pylab as plt
import scipy.sparse as sparse

## **Bag of words Sklearn**

In [3]:
drive.mount("/content/drive")
DATA_PATH = "/content/drive/Othercomputers/Mi portátil/Master/GitHub/twsm-PERSONAL"
infile = open(DATA_PATH+'/Data/Stemmed.pkl','rb')
stemmed = pickle.load(infile)

Mounted at /content/drive


In [20]:
stemmed[0]

'car wonder enlighten car saw dai door sport car look late earli call bricklin door small addit bumper separ rest bodi know tellm model engin spec year product car histori info funki look car mail thank'

In [4]:
vec_abs= CountVectorizer(max_df=0.95, min_df=0.05) #Absolute frequency
vec_rel = TfidfVectorizer(max_df=0.95, min_df=0.05, use_idf=False, norm='l1') # Relative frequency
vec_tf= TfidfVectorizer(max_df=0.95, min_df=0.05, smooth_idf=False) #Tf-IDF frequency

In [22]:
#Absolute
stemmed_abs=vec_abs.fit_transform(stemmed)
#plt.spy(stemmed_abs, markersize=4)
print(stemmed_abs[0])

  (0, 29)	5
  (0, 231)	1
  (0, 49)	1
  (0, 124)	2
  (0, 28)	1
  (0, 191)	1
  (0, 111)	1
  (0, 61)	1
  (0, 239)	1
  (0, 101)	1
  (0, 128)	1
  (0, 208)	1


In [23]:
#Relative
stemmed_rel=vec_rel.fit_transform(stemmed)
print(stemmed_rel[0])

  (0, 29)	0.29411764705882354
  (0, 231)	0.058823529411764705
  (0, 49)	0.058823529411764705
  (0, 124)	0.11764705882352941
  (0, 28)	0.058823529411764705
  (0, 191)	0.058823529411764705
  (0, 111)	0.058823529411764705
  (0, 61)	0.058823529411764705
  (0, 239)	0.058823529411764705
  (0, 101)	0.058823529411764705
  (0, 128)	0.058823529411764705
  (0, 208)	0.058823529411764705


In [24]:
#TF
stemmed_tf=vec_tf.fit_transform(stemmed)
print(stemmed_tf[0])

  (0, 208)	0.12199712560617264
  (0, 128)	0.13921222964904834
  (0, 101)	0.1732318806371618
  (0, 239)	0.12184290375366709
  (0, 61)	0.17193922784588114
  (0, 111)	0.09734612404276298
  (0, 191)	0.17674803309806392
  (0, 28)	0.14901283059698367
  (0, 124)	0.23670601956135015
  (0, 49)	0.14009491139304706
  (0, 231)	0.16984281993048347
  (0, 29)	0.8508200247961197


In [8]:
#dic = {idx: val for idx, val in enumerate(stemmed_tf, start = 1)}

In [9]:
# Two-gram absolute transformer (min=max=2 words)
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_df=2, min_df=0)

stemmed_tf_bigram = bigram_vectorizer.fit_transform(stemmed)
bigrams = bigram_vectorizer.get_feature_names()
print(bigrams)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
print(len(bigrams))

696839


In [11]:
print(bigrams[0:20])

['aa aa', 'aa aiaa', 'aa american', 'aa busi', 'aa develop', 'aa wvgpd', 'aaa batteri', 'aaa better', 'aaa bronx', 'aaa cal', 'aaa catcher', 'aaa contract', 'aaa david', 'aaa deion', 'aaa effect', 'aaa essenti', 'aaa fido', 'aaa glare', 'aaa go', 'aaa histori']


In [12]:
#Binarizer
transformer = Binarizer().fit(stemmed_abs)  # fit does nothing.
transformer

transformer.transform(stemmed_abs)
print(stemmed_abs[0:1])

  (0, 29)	5
  (0, 231)	1
  (0, 49)	1
  (0, 124)	2
  (0, 28)	1
  (0, 191)	1
  (0, 111)	1
  (0, 61)	1
  (0, 239)	1
  (0, 101)	1
  (0, 128)	1
  (0, 208)	1


## **Bag of words Gensim**

In [19]:
corpus_gen = [doc.split() for doc in stemmed]
corpus_gen[0]

['car',
 'wonder',
 'enlighten',
 'car',
 'saw',
 'dai',
 'door',
 'sport',
 'car',
 'look',
 'late',
 'earli',
 'call',
 'bricklin',
 'door',
 'small',
 'addit',
 'bumper',
 'separ',
 'rest',
 'bodi',
 'know',
 'tellm',
 'model',
 'engin',
 'spec',
 'year',
 'product',
 'car',
 'histori',
 'info',
 'funki',
 'look',
 'car',
 'mail',
 'thank']

The doc.split() is separating all the words in a different row of a list.

In [36]:
id2word = Dictionary(corpus_gen)

In [37]:
id2word.filter_extremes(no_below=566, no_above=0.95)
print(id2word)

Dictionary(240 unique tokens: ['call', 'car', 'dai', 'engin', 'info']...)


It's filtering for words that appear more than 566 times. There are 240 words that fulfill this filter

In [38]:
#i
print(id2word.token2id)

{'call': 0, 'car': 1, 'dai': 2, 'engin': 3, 'info': 4, 'know': 5, 'look': 6, 'mail': 7, 'small': 8, 'thank': 9, 'wonder': 10, 'year': 11, 'answer': 12, 'base': 13, 'card': 14, 'edu': 15, 'experi': 16, 'final': 17, 'gui': 18, 'messag': 19, 'number': 20, 'report': 21, 'send': 22, 'actual': 23, 'advanc': 24, 'anybodi': 25, 'better': 26, 'bit': 27, 'email': 28, 'expect': 29, 'feel': 30, 'good': 31, 'got': 32, 'great': 33, 'heard': 34, 'help': 35, 'life': 36, 'like': 37, 'line': 38, 'machin': 39, 'mayb': 40, 'new': 41, 'opinion': 42, 'peopl': 43, 'plai': 44, 'post': 45, 'price': 46, 'probabl': 47, 'question': 48, 'read': 49, 'real': 50, 'recent': 51, 'start': 52, 'take': 53, 'time': 54, 'us': 55, 'wai': 56, 'address': 57, 'articl': 58, 'chip': 59, 'com': 60, 'far': 61, 'inform': 62, 'person': 63, 'phone': 64, 'point': 65, 'pretti': 66, 'requir': 67, 'stuff': 68, 'system': 69, 'thing': 70, 'write': 71, 'wrote': 72, 'check': 73, 'mean': 74, 'possibl': 75, 'right': 76, 'set': 77, 'softwar': 78

In [39]:
#ii
print(id2word.token2id.keys())

dict_keys(['call', 'car', 'dai', 'engin', 'info', 'know', 'look', 'mail', 'small', 'thank', 'wonder', 'year', 'answer', 'base', 'card', 'edu', 'experi', 'final', 'gui', 'messag', 'number', 'report', 'send', 'actual', 'advanc', 'anybodi', 'better', 'bit', 'email', 'expect', 'feel', 'good', 'got', 'great', 'heard', 'help', 'life', 'like', 'line', 'machin', 'mayb', 'new', 'opinion', 'peopl', 'plai', 'post', 'price', 'probabl', 'question', 'read', 'real', 'recent', 'start', 'take', 'time', 'us', 'wai', 'address', 'articl', 'chip', 'com', 'far', 'inform', 'person', 'phone', 'point', 'pretti', 'requir', 'stuff', 'system', 'thing', 'write', 'wrote', 'check', 'mean', 'possibl', 'right', 'set', 'softwar', 'tell', 'understand', 'world', 'ye', 'agre', 'allow', 'apr', 'believ', 'come', 'consid', 'control', 'cost', 'cours', 'exist', 'follow', 'given', 'govern', 'hand', 'hard', 'hope', 'idea', 'john', 'kill', 'make', 'need', 'non', 'power', 'reason', 'result', 'sai', 'second', 'state', 'support', 't

In [40]:
#iii
print(id2word.dfs)

{1: 697, 10: 702, 2: 1361, 6: 2208, 0: 1116, 8: 602, 5: 3524, 3: 670, 11: 2043, 4: 651, 7: 1388, 9: 2036, 17: 576, 21: 601, 20: 1174, 16: 627, 22: 740, 19: 819, 14: 703, 13: 895, 12: 767, 18: 595, 15: 5726, 48: 1775, 52: 1229, 36: 725, 56: 2218, 41: 2924, 39: 609, 27: 1002, 40: 806, 25: 567, 29: 581, 34: 726, 46: 613, 38: 990, 37: 3848, 51: 629, 47: 1116, 32: 1223, 30: 705, 26: 1215, 33: 965, 31: 2334, 42: 1059, 43: 2549, 55: 2514, 53: 638, 50: 928, 44: 769, 23: 1095, 35: 1621, 24: 650, 28: 775, 45: 1800, 49: 1488, 54: 2822, 72: 808, 71: 6081, 58: 4988, 59: 582, 61: 958, 68: 571, 66: 651, 67: 729, 65: 1570, 57: 632, 64: 716, 62: 1112, 60: 3759, 69: 599, 70: 2070, 63: 1244, 81: 1023, 82: 772, 74: 1484, 80: 729, 78: 824, 73: 650, 76: 1852, 77: 990, 75: 1206, 79: 1210, 109: 865, 99: 953, 85: 2841, 100: 743, 105: 963, 102: 1108, 90: 587, 103: 2175, 89: 926, 95: 846, 107: 686, 104: 748, 92: 829, 110: 1354, 87: 1668, 86: 1434, 97: 847, 111: 964, 83: 662, 96: 787, 98: 771, 91: 1025, 108: 1287

i. Is printing the words with their corresponding index in the dictionary

ii. Is printing only the words

iii. Is printing how many documents have the corresponding word

In [43]:
#iiii
corpus1=[id2word.doc2bow(doc) for doc in corpus_gen]
corpus1[0]

[(0, 1),
 (1, 5),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1)]

iiii. doc2bow converts document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples. It's the absolute frequency.

In [45]:
#v
corpus2=[[(token[0],(token[1]/sum(n for _, n in doc))) for token in doc]
for doc in corpus1]
corpus2[0]

[(0, 0.058823529411764705),
 (1, 0.29411764705882354),
 (2, 0.058823529411764705),
 (3, 0.058823529411764705),
 (4, 0.058823529411764705),
 (5, 0.058823529411764705),
 (6, 0.11764705882352941),
 (7, 0.058823529411764705),
 (8, 0.058823529411764705),
 (9, 0.058823529411764705),
 (10, 0.058823529411764705),
 (11, 0.058823529411764705)]

v. It's the Bag-of-words: Relative frequency

In [46]:
#vi
corpus3=[[(token[0],1) for token in doc] for doc in corpus1]
corpus3[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1)]

vi. Represents the Bag-of-words: One-hot-encoding approach

In [47]:
#vii
tfidf=TfidfModel(dictionary=id2word, normalize=True)
corpus4=[tfidf[id2word.doc2bow(doc)] for doc in corpus_gen]
corpus4[0]

[(0, 0.14397605792925167),
 (1, 0.8661756897052311),
 (2, 0.1316396218024463),
 (3, 0.17569085919746014),
 (4, 0.17747902617106662),
 (5, 0.07250388928623666),
 (6, 0.20312674067620154),
 (7, 0.13041858058126876),
 (8, 0.18234302967391078),
 (9, 0.1066043895592759),
 (10, 0.17279083266070747),
 (11, 0.10639104965460928)]

vii. Represents the Bag-of-words: TF-IDF frequency approach

## **Ngrams**

In [48]:
Ngrams = CountVectorizer(ngram_range=(2, 2), max_df=0.95, min_df=0.05)

In [50]:
#Ngrams transformation
Ngrams_transf = Ngrams.fit_transform(stemmed)
print(Ngrams_transf[0:5])

  (3, 1)	1
  (3, 2)	1
