**Nesse notebook, vou testar com BoW. Também irei tester alguns novos aproaches.**

## *Import Libraries*

In [1]:
import gc
import re
import operator 
import random
import time

import numpy as np
import pandas as pd

from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import matplotlib.pyplot as plt
from subprocess import check_output



Como o arquivo de treino é muito grando, vamos ter que limitar a quantidade de linhas utilizadas, pois só tenho 4gb ram. Também será necessário utilizar o encoder UTF-8, pois os textos são em português e espanhol

In [2]:
start_time = time.time()
p = 0.1
train = pd.read_csv('train.csv', header = 0, encoding = 'utf-8', skiprows = lambda i: i>0 and random.random() >p)
test = pd.read_csv('test.csv', header = 0, encoding = 'utf-8')

train.drop(['language','label_quality'], axis = 'columns', inplace = True) 
test.drop(['language'], axis = 'columns', inplace = True) 

print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

elapsed_time = int(time.time() - start_time)
print('{:02d}:{:02d}:{:02d}'.format(elapsed_time // 3600, (elapsed_time % 3600 // 60), elapsed_time % 60))


Train shape :  (1999645, 2)
Test shape :  (246955, 2)
00:00:13


In [3]:
test.head()

Unnamed: 0,id,title
0,0,Kit Maternidade Bolsa-mala Baby/bebe Vinho Men...
1,1,Trocador De Fraldas Fisher Price Feminino Rosa...
2,2,Motor Ventoinha - Fiat Idea / Palio 1.8 - A 04...
3,3,Amortecedor Mola Batente D Dir New Civic 14 - ...
4,4,Cadeirinha De Carro Bebê Princesa Princess 9 A...


In [4]:
categorias = train.filter(['category'], axis = 1)
num = test.filter(['id'], axis = 1)
num.head()

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4


In [5]:
# Como nas competições é ideal juntar os datasets, iremos fazer isso agora:
train.drop(['category'], axis = 'columns', inplace = True) 
test.drop(['id'], axis = 'columns', inplace = True) 

df = pd.concat([train, test], axis=0)
del(train, test)
gc.collect()

7

In [6]:
df.shape

(2246600, 1)

## Embeddings and Preprocessing Text

**Aqui iremos criar as embbedings, palavras chaves para o nosso modelo treinar. É uma espécie de dicionário que deve conter pelo menos 95% do texto.**

In [7]:
# Algumas bibliotecas que temos que importar para poder realizar essa parte
# esses que estão desmarcados já foram importados.
#import pandas as pd
from tqdm import tqdm
tqdm.pandas()
#import operator
#import re
#from gensim.models import KeyedVectors

Nesse próximo passo iremos criar nosso vocabulário de teste. Essa função irá percorrer todo o texto e contar quantas ocorrências de cada palavra temos.

In [8]:
def build_vocab(sentences, verbose =  True):
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

Vamos montar o dicionário. A função *pregress_apply* permite que acompanhemos o tempo decorrido sobre um rotina

In [9]:
sentences = df["title"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:04<00:00, 479579.28it/s]
100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:04<00:00, 484492.48it/s]


{'Hidrolavadora': 1517, 'Lavor': 51, 'One': 5525, '120': 4692, 'Bar': 4316}


In [10]:
#aqui é a hora de carregar o modelo já treinado de dados, no caso do FastText
#import fastText
#pd.read_csv("../data_folder/data.csv")
embeddings_index = KeyedVectors.load_word2vec_format('cbow_s1000.txt')

In [10]:
#https://github.com/dccuchile/spanish-word-embeddings
#trained_words = 'crawl-300d-2M.vec'
embeddings_index2 = KeyedVectors.load_word2vec_format('SBW-vectors-300-min5.txt')

Agora, com essa função, vamos checar a intersecção entre nosso vocabulário e as embeddings. Ela vai gerar uma lista oov(output  out of vocabulary).

In [11]:
def check_coverage(vocab,embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in tqdm(vocab):
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:

            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    sorted_x = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [12]:
oov = check_coverage(vocab,embeddings_index)
gc.collect()

100%|██████████████████████████████████████████████████████████████████████| 909210/909210 [00:04<00:00, 198879.89it/s]


Found embeddings for 0.220% of vocab
Found embeddings for  3.302% of all text


0

In [13]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

In [14]:
df["title"] = df["title"].progress_apply(lambda x: clean_text(x))
sentences = df["title"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:15<00:00, 146197.83it/s]
100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:04<00:00, 454390.36it/s]


In [15]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 650637/650637 [00:04<00:00, 154942.12it/s]


Found embeddings for 3.339% of vocab
Found embeddings for  1.635% of all text


In [16]:
df['title'] = df['title'].apply(lambda x: x.lower())
gc.collect()

0

In [17]:
# remove 1 character terms (length == 1)
df['title'] = df['title'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))

In [18]:
vocab = build_vocab(df['title'])
oov = check_coverage(vocab,embeddings_index)

100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:09<00:00, 230988.01it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 106/106 [00:00<00:00, 51386.53it/s]


Found embeddings for 83.962% of vocab
Found embeddings for  81.145% of all text


In [19]:
replace_1 = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\*)|(\d+)|(\+)|(\%)|(\&)|(\/)|(\-)")
                       
def clean_reviews(reviews):
    reviews = [replace_1.sub("", line.lower()) for line in reviews]
    return reviews
                       
df['title'] = clean_reviews(df['title'])

In [20]:
vocab = build_vocab(df['title'])
oov = check_coverage(vocab, embeddings_index)

100%|████████████████████████████████████████████████████████████████████| 2246600/2246600 [00:08<00:00, 256929.52it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 96/96 [00:00<00:00, 98762.13it/s]


Found embeddings for 91.667% of vocab
Found embeddings for  85.238% of all text


In [21]:
sentences = df["title"].progress_apply(lambda x: x.split())
to_remove = ['à',' ','\xa0','\x9d','\x81','\x7f','\x8d','\x90','°','\x9d','´','¡','®','¿','¨','×','»','·','¦','«','±','§','¢','£','\xad','\x81']
sentences = [[word for word in sentence if not word in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|████████████████████████████████████████████████████████████████████| 1247331/1247331 [00:02<00:00, 578600.30it/s]
100%|████████████████████████████████████████████████████████████████████| 1247331/1247331 [00:03<00:00, 319786.83it/s]
100%|████████████████████████████████████████████████████████████████████| 1247331/1247331 [00:02<00:00, 591471.42it/s]


In [None]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'pçs':'peças',
                'soporte':'suporte',
                'delantero':'dianteiro',
                'cargador':'carregador',
                'lampara':'lâmpada',
                'talle':'tamanho',
                'cuotas':'dívidas',
                'embrague':'embreagem',
                'plegable':'dobrável',
                'inoxidable':'inoxidável',
                'impecable':'impecável',
                'accesorios':'acessórios',
                'inflable':'inflável',
                'estuche':'kit',
                'griferia':'torneira',
                'heladera': 'refrigerador',
                'compresor': 'compressor',
                'cubre': 'cobre',
                'silicona': 'silicone',
                'impresora': 'impressora'

                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [None]:
df["title"] = df["title"].apply(lambda x: replace_typical_misspell(x))

In [None]:
oov = check_coverage(vocab, embeddings_index2)
oov[:20]

Podemos fazer uma limpeza maior e também processar um embedding do dicionário em espanhol. Mas por enquanto vamos ver como estamos fazendo.

In [21]:
#Primeiro vamos fazer a divisão do treino e teste de volta
train = df.iloc[:1999645,:]
test = df.iloc[1999645:,:]

print(train.shape)
print(test.shape)

(1999645, 1)
(246955, 1)


In [22]:
del(df)
gc.collect()

9

In [23]:
#Agora vamos juntar novamente no treino a coluna de categoria e o id para teste
treino = pd.concat([train, categorias], axis=1)
treino.head()

Unnamed: 0,title,category
0,hidrolavadora lavor one bar w bomba aluminio ...,ELECTRIC_PRESSURE_WASHERS
1,painel para tv polegadas quirino branco canela,TV_STORAGE_UNITS
2,ers oficial nacional baloncesto asociación inc...,LED_STAGE_LIGHTS
3,carenagem tampa lateral nxr bros vermelho,MOTORCYCLE_CLUTCH_COVERS
4,carregador bateria original câmera sony bc trv...,CAMERA_CHARGERS


In [24]:
teste = pd.concat([num, test], axis=1)
teste.head()

Unnamed: 0,id,title
0,0,kit maternidade bolsa mala baby bebe vinho men...
1,1,trocador de fraldas fisher price feminino rosa...
2,2,motor ventoinha fiat idea palio
3,3,amortecedor mola batente dir new civic
4,4,cadeirinha de carro bebê princesa princess kgs


In [25]:
del(train)
del(test)
gc.collect()

7

Temos que convertar as strings para float, para poder alimentar o modelo. Iremos utilizar primeiro o BoW

In [26]:
stop_words = open('stopwords_pt_es.txt', encoding = 'utf-8').read().splitlines()

bow = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2), stop_words = stop_words)
bow_train = bow.fit_transform(treino['title'])

  'stop_words.' % sorted(inconsistent))


In [27]:
tf = TfidfTransformer(use_idf=False).fit(bow_train)
tf_train = tf.transform(bow_train)

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
lr = LogisticRegression(C = 10, random_state = 42)

In [30]:
# train model on bag-of-words features
start_time = time.time()
lr.fit(tf_train, treino['category'])
elapsed_time = int(time.time() - start_time)
print('{:02d}:{:02d}:{:02d}'.format(elapsed_time // 3600, (elapsed_time % 3600 // 60), elapsed_time % 60))



21:22:05


In [31]:
bow_test = bow.transform(teste['title'])
tft = TfidfTransformer(use_idf=False).fit(bow_test)
tf_test = tft.transform(bow_test)

In [32]:
#make predictions on validation set
bow_test_preds = lr.predict(tf_test)

In [33]:
submission_df = pd.read_csv('sample_submission.csv', header = 0, usecols = ['id'])

In [34]:
submission_df['category'] = bow_test_preds

In [35]:
submission_df.to_csv("submission_lrfinal.csv", index=False)

Se quiser saber se o computador roda tensorflow
from tensorflow.python.client import device_lib
def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]
print(get_available_devices()) 

import tensorflow as tf
print(tf.__version__)