# Transforming html to txt

<p>First, we need to extract the text from the pages. For this, we gonna use the Beautiful Soup parser in each page from local directory - the positives and negatives. We extract from paragraphs of html and append the results. Some pages has irrelevant information, like ads - we will try to remove after this.</p>
<p>After append the strings, we append the result (1 or 0), 1 if is a relevant page and 0 otherwise.</p>

In [2]:
import os
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from string import punctuation
import nltk

In [3]:
nltk.download('stopwords')
nltk.download('rslp')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lavinia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /home/lavinia/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [4]:
path = "pages/positivos"

parsedPositive = []
parsed = []
y = []
stemmer = nltk.stem.RSLPStemmer()

for filename in os.listdir(path):
    fullpath = os.path.join(path, filename)
    if fullpath.endswith('.html'):
        page = soup(open(fullpath),'html.parser')
        p_tags = page.find_all('p')
        s = ""
        for x in p_tags:
            s += x.text + " "
        for i in range(1,6):
            h_tags = page.find_all('h'+str(i))
            for x in h_tags:
                s += x.text + " "
        s += page.title.string + " "
        dictionary = list(punctuation)
        dictionary.append('placeholder')
        #dictionary += nltk.corpus.stopwords.words('portuguese')
        l = [i for i in s if not i.isdigit() and i not in dictionary]
        s = ''.join(l)
        new_text = ""
        s = s.split()
        for i in s:
            if i not in nltk.corpus.stopwords.words('portuguese'):
                new_text += i + " "
        new_text = new_text.lower()
        parsed.append(new_text);
        parsedPositive.append(new_text);
        y.append(1);

In [5]:
path = "pages/negativos"

parsedNegative = []

for filename in os.listdir(path):
    fullpath = os.path.join(path, filename)
    if fullpath.endswith('.html'):
        page = soup(open(fullpath),'html.parser')
        p_tags = page.find_all('p')
        s = ""
        for x in p_tags:
            s += x.text + " "
        for i in range(1,6):
            h_tags = page.find_all('h'+str(i))
            for x in h_tags:
                s += x.text + " "
        s += page.title.string + " "
        dictionary = list(punctuation)
        dictionary.append('placeholder')
        #dictionary += nltk.corpus.stopwords.words('portuguese')
        l = [i for i in s if not i.isdigit() and i not in dictionary]
        s = ''.join(l)
        new_text = ""
        s = s.split()
        for i in s:
            if i not in nltk.corpus.stopwords.words('portuguese'):
                new_text += i + " "
        new_text = new_text.lower()
        parsed.append(new_text);
        y.append(0);

In [3]:
def parse_samples(path, parsed_list, y, type_of_sample):
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)
        if full_path.endswith('.html'):
            s = get_text_from_html(full_path)
            parsed_list.append(s)
            y.append(type_of_sample)

def get_text_from_html(path, remove_stopwords = True, stemming_enabled = False):
    f = open(path)
    soup = BeautifulSoup(f, 'html.parser')
    p_tags = soup.find_all('p')
    s = []
    for x in p_tags:
        s += x.text.split(' ')
    for i in range(1,6):
        h_tags = soup.find_all('h'+str(i))
        for x in h_tags:
            s += x.text.split(' ')
    s += soup.title.string.split(' ')
    f.close()
    dictionary = list(punctuation)
    dictionary.append('placeholder')
    if remove_stopwords:
        dictionary += list(nltk.corpus.stopwords.words('portuguese'))
    s = [i.lower() for i in s]
    l = [i for i in s if not i.isdigit() and i not in dictionary and len(i) > 0]
    if stemming_enabled:
        stemmer = nltk.stem.RSLPStemmer()
        l = [stemmer.stem(i) for i in l if len(i) > 0]
    s = ' '.join(l)
    return s

# Feature Extraction and Classifier
<p>After getting the parsed text and put the tag on each test, we gonna use SkLearn to make the feature extraction with Bag of Words (CountVectorizer), transforming on frequencies with TF-IDF(TfidfTransformer) and using different classifiers. We also using the pipeline to reduce the steps and be more easy to do.</p>

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

## Used Classifiers

<p>SGClassifier -> Stochastic Gradient Descent</p>
<p>MultinomialNB -> Naive Bayes</p>
<p>DecisionTreeClassifier -> Arvore de Decisao</p>
<p>LogisticRegression -> Logistic Regression</p>
<p>MLPClassifier -> Multilayer Perceptron Classifier</p>
<p>SVC -> Support Vector Classification</p>

In [44]:
skf = StratifiedKFold(n_splits=15)
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf2 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge',penalty='l1', max_iter=1000, tol=None))])
text_clf3 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', DecisionTreeClassifier())])
text_clf4 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])
text_clf5 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MLPClassifier())])
text_clf6 = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC())])


In [45]:
vectorizer = TfidfVectorizer()
vectorizer.fit(parsed)
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

{'grudado': 5468, 'delicada': 3163, 'bojudos': 1452, 'estarão': 4343, 'cobrir': 2238, 'postei': 8489, 'tabstopspt': 10401, 'grelhadas': 5438, 'ficando': 4755, 'finalizar': 4799, 'ótima': 11498, 'boca': 1442, 'experimentar': 4488, 'terminarão': 10592, 'duradoura': 3749, 'escumadeira': 4167, 'modelas': 7143, 'revela': 9493, 'bagunça': 1155, 'ostras': 7794, 'miha': 7034, 'lenir': 6382, 'juntarem': 6190, 'clássicos': 2211, 'artesanal': 918, 'varie': 11121, 'vendidos': 11193, 'intercalar': 5991, 'rico': 9509, 'copo': 2700, 'amo': 608, 'aprovadissimo': 805, 'petropolitano': 8242, 'região': 9310, 'sangue': 9755, 'mas': 6853, 'pinoles': 8315, 'glucose': 5315, 'descascado': 3281, 'puxei': 8885, 'brothers': 1586, 'uvapassa': 11077, 'dou': 3691, 'intuição': 6024, 'bordo': 1510, 'adicionei': 228, 'torradinhos': 10749, 'banda': 1195, 'con': 2477, 'oleo': 7701, 'estragar': 4368, 'fiapos': 4747, 'acharam': 112, 'permitir': 8190, 'butantan': 1625, 'abóbora': 67, 'virgem': 11305, 'forno': 4946, 'remete

In [46]:
acuracia_tot = []
tempo_tot = []
precisao_tot = []
recall_tot = []

for i in range(1,16):
    print ('Epoch: ', i)
    X_train, X_test, y_train, y_test = train_test_split(parsed, y, test_size=0.3, random_state=None)
    classifiers = [text_clf, text_clf2, text_clf3, text_clf4, text_clf5, text_clf6]
    acuracia = []
    tempo = []
    precisao = []
    recall = []
    for classifier in classifiers:
        time1 = time.time()
        classifier.fit(X_train,y_train)
        time2 = time.time()
        y_pred = classifier.predict(X_test)
        score = classifier.score(X_test, y_test)
        precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        rec = recall_score(y_test, y_pred,  average='weighted', labels=np.unique(y_pred))
        acuracia.append(score)
        tempo.append(time2-time1)
        precisao.append(precision)
        recall.append(rec)
    acuracia_tot.append(acuracia)
    tempo_tot.append(tempo)
    precisao_tot.append(precisao)
    recall_tot.append(recall)

Epoch:  1
Epoch:  2
Epoch:  3
Epoch:  4
Epoch:  5
Epoch:  6
Epoch:  7
Epoch:  8
Epoch:  9
Epoch:  10
Epoch:  11
Epoch:  12
Epoch:  13
Epoch:  14
Epoch:  15


In [47]:
col = ['SGC', 'Naive Bayes', 'DecisionTree', 'LogisticRegression', 'MLP', 'SVC']
acuracia_pd = pd.DataFrame(acuracia_tot, columns = col)
acuracia_pd

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.916084,0.944056,0.920746,0.941725,0.939394,0.568765
1,0.892774,0.93007,0.899767,0.925408,0.923077,0.610723
2,0.90676,0.944056,0.909091,0.941725,0.937063,0.606061
3,0.876457,0.927739,0.911422,0.920746,0.920746,0.538462
4,0.916084,0.939394,0.93007,0.939394,0.927739,0.582751
5,0.892774,0.923077,0.892774,0.920746,0.932401,0.564103
6,0.899767,0.946387,0.916084,0.932401,0.918415,0.589744
7,0.916084,0.951049,0.934732,0.944056,0.944056,0.617716
8,0.934732,0.946387,0.902098,0.934732,0.941725,0.599068
9,0.916084,0.958042,0.913753,0.941725,0.95338,0.58042


In [48]:
print(np.mean(acuracia_pd))

SGC                   0.908469
Naive Bayes           0.941414
DecisionTree          0.912044
LogisticRegression    0.936131
MLP                   0.934732
SVC                   0.590054
dtype: float64


In [49]:
tempo_pd = pd.DataFrame(tempo_tot, columns = col)
tempo_pd

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.242331,1.294594,0.342698,0.211186,20.763797,1.362801
1,0.177272,1.287236,0.287124,0.282831,21.56827,1.440693
2,0.174865,1.28729,0.434256,0.204044,26.630507,1.396608
3,0.171772,0.919509,0.298811,0.184703,22.257866,1.79493
4,0.288602,1.313635,0.439535,0.185574,18.858614,1.383149
5,0.196293,1.010186,0.319564,0.189848,19.518736,1.357934
6,0.176002,0.990752,0.314707,0.187774,18.568867,1.712948
7,0.17585,0.983686,0.322348,0.187178,20.521147,1.397529
8,0.173885,0.975903,0.299511,0.187084,20.60644,1.376766
9,0.172386,0.980472,0.244623,0.183773,20.707637,1.392814


In [50]:
print(np.mean(tempo_pd))

SGC                    0.191540
Naive Bayes            1.130086
DecisionTree           0.328870
LogisticRegression     0.204795
MLP                   22.107545
SVC                    1.514832
dtype: float64


In [51]:
precisao_pd = pd.DataFrame(precisao_tot, columns = col)
precisao_pd

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.918147,0.944067,0.92093,0.941773,0.939394,0.568765
1,0.892393,0.931779,0.90096,0.929086,0.925732,0.610723
2,0.907239,0.944759,0.908891,0.94306,0.940197,0.606061
3,0.884324,0.927774,0.911555,0.921007,0.921007,0.538462
4,0.917441,0.939394,0.930219,0.940396,0.9295,0.582751
5,0.89898,0.923133,0.892808,0.920879,0.932369,0.564103
6,0.90093,0.946615,0.916848,0.93419,0.918998,0.589744
7,0.917501,0.951002,0.935191,0.944056,0.943966,0.617716
8,0.934862,0.946631,0.902322,0.936316,0.943019,0.599068
9,0.918606,0.958042,0.91374,0.942514,0.954733,0.58042


In [52]:
print(np.mean(precisao_pd))

SGC                   0.910629
Naive Bayes           0.941868
DecisionTree          0.912425
LogisticRegression    0.937263
MLP                   0.936056
SVC                   0.590054
dtype: float64


In [53]:
recall_pd = pd.DataFrame(recall_tot, columns = col)
recall_pd

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.916084,0.944056,0.920746,0.941725,0.939394,1.0
1,0.892774,0.93007,0.899767,0.925408,0.923077,1.0
2,0.90676,0.944056,0.909091,0.941725,0.937063,1.0
3,0.876457,0.927739,0.911422,0.920746,0.920746,1.0
4,0.916084,0.939394,0.93007,0.939394,0.927739,1.0
5,0.892774,0.923077,0.892774,0.920746,0.932401,1.0
6,0.899767,0.946387,0.916084,0.932401,0.918415,1.0
7,0.916084,0.951049,0.934732,0.944056,0.944056,1.0
8,0.934732,0.946387,0.902098,0.934732,0.941725,1.0
9,0.916084,0.958042,0.913753,0.941725,0.95338,1.0


In [54]:
print(np.mean(recall_pd))

SGC                   0.908469
Naive Bayes           0.941414
DecisionTree          0.912044
LogisticRegression    0.936131
MLP                   0.934732
SVC                   1.000000
dtype: float64


# Feature Selection

Testando selecionar as features previamentes para verificar se houve melhora no classificador


In [58]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
cv = CountVectorizer(max_df=0.9, max_features=1000)
X_vec = cv.fit_transform(X_train)
res = dict(zip(mutual_info_classif(X_vec, y_train, discrete_features=True),cv.get_feature_names()
               ))
res

{3.9550022923745543e-06: 'tudogostoso',
 1.2646656471562712e-05: 'página',
 4.0134194189411776e-05: 'distribua',
 6.240460745374557e-05: 'ajuda',
 8.7678387259478713e-05: 'cortar',
 0.00010967803419872912: 'uma',
 0.00015356631328914544: 'prepare',
 0.00016905624962617813: 'mole',
 0.00019303229337309935: 'tampe',
 0.00021852467732124022: 'através',
 0.00023211834305566365: 'sugestão',
 0.00036136120800692867: 'fiquei',
 0.00047810201287182968: 'irá',
 0.00052779384336877708: 'as',
 0.00053761857947659964: 'misturar',
 0.00056293975387581262: 'formar',
 0.00056317167835059564: 'resultado',
 0.00060883992114056304: 'mão',
 0.00063304742840881913: 'basta',
 0.00066524264263034816: 'quase',
 0.00080025414071126233: 'tanto',
 0.0008071883221838624: 'polpa',
 0.00082691607247838139: 'post',
 0.00090740784915170625: 'rápido',
 0.00098199736103248003: 'nascem',
 0.0010128031729214988: 'palito',
 0.0010879631043163585: 'fáceis',
 0.0011016678589490445: 'frite',
 0.001121470092354887: 'comparti

In [57]:
text_clf = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf2 = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge',penalty='l1', max_iter=1000, tol=None))])
text_clf3 = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', DecisionTreeClassifier())])
text_clf4 = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])
text_clf5 = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', MLPClassifier())])
text_clf6 = Pipeline([('vect', cv), ('tfidf', TfidfTransformer()), ('clf', SVC())])

In [24]:
acuracia_tot2 = []
tempo_tot2 = []
precisao_tot2 = []
recall_tot2 = []

for i in range(1,16):
    print ('Epoch: ', i)
    X_train, X_test, y_train, y_test = train_test_split(parsed, y, test_size=0.3, random_state=None)
    classifiers = [text_clf, text_clf2, text_clf3, text_clf4, text_clf5, text_clf6]
    acuracia = []
    tempo = []
    precisao = []
    recall = []
    for classifier in classifiers:
        time1 = time.time()
        classifier.fit(X_train,y_train)
        time2 = time.time()
        y_pred = classifier.predict(X_test)
        score = classifier.score(X_test, y_test)
        precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
        rec = recall_score(y_test, y_pred,  average='weighted', labels=np.unique(y_pred))
        acuracia.append(score)
        tempo.append(time2-time1)
        precisao.append(precision)
        recall.append(rec)
    acuracia_tot2.append(acuracia)
    tempo_tot2.append(tempo)
    precisao_tot2.append(precisao)
    recall_tot2.append(recall)

Epoch:  1
Epoch:  2
Epoch:  3
Epoch:  4
Epoch:  5
Epoch:  6
Epoch:  7
Epoch:  8
Epoch:  9
Epoch:  10
Epoch:  11
Epoch:  12
Epoch:  13
Epoch:  14
Epoch:  15


In [52]:
col = ['SGC', 'Naive Bayes', 'DecisionTree', 'LogisticRegression', 'MLP', 'SVC']
acuracia_pd2 = pd.DataFrame(acuracia_tot2, columns = col)
acuracia_pd2

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.677966,0.677966,0.669492,0.694915,0.711864,0.635593
1,0.635593,0.720339,0.601695,0.677966,0.694915,0.661017
2,0.661017,0.745763,0.661017,0.669492,0.728814,0.652542
3,0.686441,0.728814,0.618644,0.70339,0.754237,0.644068
4,0.737288,0.779661,0.720339,0.745763,0.805085,0.70339
5,0.669492,0.711864,0.686441,0.669492,0.720339,0.652542
6,0.70339,0.711864,0.644068,0.711864,0.737288,0.669492
7,0.677966,0.70339,0.652542,0.720339,0.694915,0.669492
8,0.677966,0.745763,0.70339,0.694915,0.805085,0.661017
9,0.644068,0.79661,0.627119,0.652542,0.805085,0.59322


In [53]:
print(np.mean(acuracia_pd2))

SGC                   0.689831
Naive Bayes           0.740678
DecisionTree          0.658757
LogisticRegression    0.706780
MLP                   0.749718
SVC                   0.665537
dtype: float64


In [54]:
tempo_pd2 = pd.DataFrame(tempo_tot2, columns = col)
tempo_pd2

Unnamed: 0,SGC,Naive Bayes,DecisionTree,LogisticRegression,MLP,SVC
0,0.085007,0.279718,0.118544,0.064304,6.070609,0.153494
1,0.060738,0.295795,0.101674,0.061634,6.061091,0.149521
2,0.059413,0.276343,0.090178,0.061493,5.534863,0.149536
3,0.060333,0.281294,0.09206,0.061951,6.006109,0.149856
4,0.061603,0.288482,0.128693,0.063151,6.015458,0.157222
5,0.059606,0.274176,0.103018,0.061838,5.930658,0.151713
6,0.059107,0.280988,0.095521,0.061366,5.885871,0.148424
7,0.060083,0.288757,0.110696,0.062036,6.113147,0.152964
8,0.059129,0.271849,0.083115,0.060814,5.717072,0.146601
9,0.059438,0.287933,0.099498,0.061567,5.750058,0.145534


In [55]:
print(np.mean(tempo_pd2))

SGC                   0.061559
Naive Bayes           0.283941
DecisionTree          0.104093
LogisticRegression    0.062127
MLP                   5.916104
SVC                   0.151598
dtype: float64


In [27]:
precisao_pd2 = pd.DataFrame(precisao_tot2, columns = col)
print(precisao_pd2)
print(np.mean(precisao_pd2))

         SGC  Naive Bayes  DecisionTree  LogisticRegression       MLP  \
0   0.736077     0.677966      0.678622            0.731780  0.702982   
1   0.571056     0.734721      0.619063            0.651851  0.688117   
2   0.630056     0.739356      0.665001            0.649545  0.719198   
3   0.714093     0.726152      0.601269            0.704007  0.748458   
4   0.728657     0.783563      0.713743            0.730404  0.798023   
5   0.656144     0.699571      0.669115            0.649545  0.710632   
6   0.702480     0.705096      0.635059            0.707022  0.725747   
7   0.643290     0.689407      0.629533            0.718884  0.674605   
8   0.700714     0.738712      0.690692            0.736935  0.806475   
9   0.777542     0.801984      0.615115            0.709820  0.824262   
10  0.711963     0.806867      0.649153            0.746610  0.827909   
11  0.741615     0.724499      0.702547            0.752584  0.759026   
12  0.741615     0.771075      0.676152            

In [28]:
recall_pd2 = pd.DataFrame(recall_tot2, columns = col)
print(recall_pd2)
print(np.mean(recall_pd2))

         SGC  Naive Bayes  DecisionTree  LogisticRegression       MLP  SVC
0   0.677966     0.677966      0.669492            0.694915  0.711864  1.0
1   0.635593     0.720339      0.601695            0.677966  0.694915  1.0
2   0.661017     0.745763      0.661017            0.669492  0.728814  1.0
3   0.686441     0.728814      0.618644            0.703390  0.754237  1.0
4   0.737288     0.779661      0.720339            0.745763  0.805085  1.0
5   0.669492     0.711864      0.686441            0.669492  0.720339  1.0
6   0.703390     0.711864      0.644068            0.711864  0.737288  1.0
7   0.677966     0.703390      0.652542            0.720339  0.694915  1.0
8   0.677966     0.745763      0.703390            0.694915  0.805085  1.0
9   0.644068     0.796610      0.627119            0.652542  0.805085  1.0
10  0.737288     0.805085      0.661017            0.762712  0.830508  1.0
11  0.745763     0.728814      0.720339            0.762712  0.762712  1.0
12  0.745763     0.754237

In [29]:
print(np.mean(acuracia_pd2) - np.mean(acuracia_pd))

SGC                  -0.004520
Naive Bayes          -0.014124
DecisionTree         -0.015254
LogisticRegression   -0.001695
MLP                  -0.010169
SVC                  -0.010169
dtype: float64


In [21]:
from collections import Counter

new_list = []
for i in parsed:
    new_list += i.split()
Counter = Counter(new_list)
most_occur = Counter.most_common(100)
print(most_occur)

[('receit', 7881), ('é', 4042), ('minut', 3827), ('xíc', 3067), ('prepar', 3025), ('chá', 3016), ('gost', 2838), ('sal', 2717), ('fic', 2597), ('ingredi', 2527), ('coloqu', 2470), ('águ', 2470), ('colh', 2448), ('bem', 2143), ('deix', 2138), ('sobr', 2113), ('pic', 2081), ('leit', 2061), ('faz', 2024), ('vegan', 2019), ('cozinh', 1945), ('bol', 1935), ('pod', 1934), ('molh', 1909), ('sop', 1889), ('temp', 1811), ('mistur', 1705), ('cebol', 1661), ('blog', 1612), ('azeit', 1545), ('lev', 1543), ('mass', 1537), ('adicion', 1532), ('vegetari', 1490), ('carn', 1488), ('em', 1483), ('test', 1457), ('frang', 1448), ('panel', 1445), ('cerc', 1443), ('outr', 1442), ('est', 1384), ('princip', 1359), ('crem', 1338), ('médi', 1319), ('acrescent', 1291), ('aprend', 1282), ('acompanh', 1281), ('doc', 1274), ('alh', 1267), ('arroz', 1258), ('far', 1213), ('forn', 1204), ('óle', 1193), ('prat', 1170), ('cas', 1143), ('mod', 1138), ('refog', 1131), ('queij', 1123), ('aqu', 1122), ('tod', 1107), ('se',

In [25]:
from collections import Counter

new_list = []
for i in parsed:
    new_list += i.split()
Counter = Counter(new_list)
most_occur = Counter.most_common(100)
print(most_occur)

[('receita', 4759), ('é', 4042), ('receitas', 3057), ('chá', 3014), ('xícara', 2503), ('coloque', 2470), ('água', 2470), ('ingredientes', 2451), ('minutos', 2446), ('sal', 2145), ('bem', 2143), ('preparo', 2088), ('sobre', 2029), ('leite', 1994), ('sopa', 1832), ('fazer', 1750), ('molho', 1688), ('gosto', 1608), ('deixe', 1607), ('pode', 1506), ('em', 1483), ('cerca', 1442), ('massa', 1430), ('frango', 1429), ('panela', 1428), ('carne', 1383), ('azeite', 1380), ('misture', 1365), ('cozinhar', 1364), ('adicione', 1360), ('minuto', 1343), ('colher', 1325), ('principalmente', 1314), ('cebola', 1268), ('arroz', 1258), ('médio', 1251), ('alho', 1229), ('forno', 1198), ('óleo', 1193), ('farinha', 1168), ('acrescente', 1150), ('colheres', 1120), ('aqui', 1119), ('modo', 1095), ('se', 1090), ('fogo', 1019), ('queijo', 1014), ('email', 1011), ('creme', 1005), ('bolo', 1004), ('tempo', 983), ('açúcar', 977), ('blog', 922), ('ficar', 917), ('humm', 913), ('fácil', 897), ('glúten', 883), ('acompan

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
vectorizer  = CountVectorizer()
docindex = vectorizer.fit_transform(parsed)
termindex = docindex.transpose()
print(termindex)

  (5130, 0)	1
  (10064, 0)	1
  (2368, 0)	1
  (4163, 0)	1
  (4965, 0)	1
  (10220, 0)	1
  (2975, 0)	1
  (4499, 0)	1
  (7972, 0)	1
  (780, 0)	1
  (8387, 0)	1
  (7731, 0)	1
  (772, 0)	1
  (5421, 0)	1
  (1163, 0)	1
  (2680, 0)	1
  (1926, 0)	1
  (10274, 0)	1
  (5379, 0)	1
  (753, 0)	1
  (1475, 0)	1
  (5977, 0)	1
  (5553, 0)	1
  (11149, 0)	1
  (5387, 0)	1
  :	:
  (4842, 1427)	1
  (2822, 1427)	1
  (9179, 1427)	4
  (906, 1427)	5
  (10033, 1427)	1
  (7006, 1427)	3
  (8507, 1427)	1
  (10141, 1427)	1
  (2278, 1427)	1
  (226, 1427)	6
  (3232, 1427)	2
  (4758, 1427)	1
  (3147, 1427)	1
  (8378, 1427)	1
  (7875, 1427)	3
  (1710, 1427)	2
  (5657, 1427)	1
  (11246, 1427)	1
  (7008, 1427)	2
  (7083, 1427)	3
  (7112, 1427)	2
  (1946, 1427)	1
  (4652, 1427)	2
  (9850, 1427)	1
  (10695, 1427)	2


In [43]:
from collections import defaultdict
from collections import Counter

#Recebe o texto ja limpo, o i funciona como o index do documento (caso já tenha, é só ler o nome do arquivo
inv_indx = defaultdict(list)
i = 0;
for idx in parsed:
    text = idx.split()
    freq = Counter(text)
    for word in freq:
        inv_indx[word].append((freq[word], i))
    i += 1
    
for word in inv_indx:
    inv_indx[word].sort(reverse=True)
    

In [40]:
#método com compressão, frequencia vem depois do index
inv_indx = defaultdict(list)
data= {}
i = 0;
for idx in parsed:
    text = idx.split()
    freq = Counter(text)
    for word in freq:
        inv_indx[word].append((i, freq[word]))
    i += 1
    
for word in inv_indx:
    inv_indx[word].sort(reverse=False)
    interval = []
    prev = 0
    acc = 0
    for x in inv_indx[word]:
        acc = x[0] - prev
        prev = x[0]
        interval.append((acc, x[1]))
    inv_indx[word] = interval

In [41]:
inv_indx

defaultdict(list,
            {'usado': [(33, 1),
              (3, 1),
              (8, 1),
              (2, 1),
              (9, 1),
              (27, 1),
              (22, 1),
              (32, 1),
              (17, 1),
              (17, 1),
              (96, 1),
              (12, 1),
              (26, 1),
              (14, 1),
              (6, 1),
              (4, 1),
              (47, 1),
              (4, 1),
              (35, 1),
              (4, 1),
              (17, 1),
              (73, 1),
              (6, 1),
              (66, 1),
              (27, 1),
              (16, 1),
              (64, 1),
              (9, 1),
              (3, 1),
              (14, 1),
              (28, 1),
              (7, 1),
              (24, 1),
              (50, 1),
              (18, 1),
              (12, 1),
              (89, 1),
              (123, 1),
              (76, 1),
              (6, 1),
              (82, 4)],
             'cumaru': [(1107, 2)],
     

In [28]:
import json

In [42]:
with open('inverted_index_with_compression_and_without_steamming.txt', 'w') as outfile:  
    json.dump(inv_indx, outfile)