In [6]:
import pickle
import re
import warnings
from collections import Counter
from string import punctuation

import contractions
import nltk
import numpy as np
import pandas as pd
from googletrans import Translator
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from unidecode import unidecode
# Silence every warning category for the rest of the session; library
# deprecation notices and pandas warnings will therefore not be shown below.
warnings.filterwarnings('ignore')

In [8]:
# Load the training CSV (path is relative to the working directory) and
# preview the first rows.
df = pd.read_csv('dataset_es_train.csv')
df.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0491108,product_es_0296024,reviewer_es_0999081,1,Nada bueno se me fue ka pantalla en menos de 8...,television Nevir,es,electronics
1,es_0869872,product_es_0922286,reviewer_es_0216771,1,"Horrible, nos tuvimos que comprar otro porque ...",Dinero tirado a la basura con esta compra,es,electronics
2,es_0811721,product_es_0474543,reviewer_es_0929213,1,Te obligan a comprar dos unidades y te llega s...,solo llega una unidad cuando te obligan a comp...,es,drugstore
3,es_0359921,product_es_0656090,reviewer_es_0224702,1,"No entro en descalificar al vendedor, solo pue...",PRODUCTO NO RECIBIDO.,es,wireless
4,es_0068940,product_es_0662544,reviewer_es_0224827,1,Llega tarde y co la talla equivocada,Devuelto,es,shoes


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(200000, 8)

In [10]:
# Work on a 1,000-row random subset (presumably to keep the per-row
# translation step affordable). The seed is fixed so that Restart & Run All
# selects the same rows every time, matching the seeded train/test split
# further down. reset_index is chained instead of using inplace=True so the
# cell is idempotent.
sample = df.sample(1000, random_state=42).reset_index(drop=True)

In [11]:
# Keep only the review text and its star rating. The explicit copy makes
# `data` an independent frame, so the column added to it in a later cell does
# not write into a slice of `sample` (SettingWithCopy).
data = sample[['review_body','stars']].copy()

In [14]:
def lang_trans(data, dest='en', translator=None):
    """Translate one piece of text and return the translated string.

    Parameters
    ----------
    data : str
        Text to translate. Anything that is not a non-blank string (for
        example NaN from a missing CSV cell) is not sent to the translator.
    dest : str, default 'en'
        Target language code passed to ``translate``.
    translator : object, optional
        Object exposing ``translate(text, dest=...)`` whose result has a
        ``.text`` attribute. When omitted, a single ``Translator`` is created
        on first use and reused for every later call instead of building a
        new client per row.

    Returns
    -------
    str
        The translated text, or ``''`` when there is nothing to translate.
    """
    # Missing or blank reviews: return an empty string so the downstream
    # string-cleaning functions still receive a str.
    if not isinstance(data, str) or not data.strip():
        return ''

    if translator is None:
        # Cache the client on the function object; Series.apply calls this
        # once per row and does not need a fresh client each time.
        translator = getattr(lang_trans, '_translator', None)
        if translator is None:
            translator = Translator()
            lang_trans._translator = translator

    translated_text = translator.translate(data, dest=dest)
    return translated_text.text

In [15]:
# Translate every review body row by row with lang_trans and store the result
# in a new column. NOTE(review): this presumably issues one translation
# request per row, so it is likely the slowest cell — confirm and consider
# caching the result.
data['translated_review'] = data['review_body'].apply(lang_trans)

In [17]:
# From here on only the translated text and the rating are needed. Copy the
# subset so that the `target` column added in a later cell is written to an
# independent frame rather than to a slice of the previous one.
data = data[['translated_review','stars']].copy()
data.head(3)

Unnamed: 0,translated_review,stars
0,The glasses are not of very good quality. When...,2
1,"Simple, but cheap",5
2,"As for the frame, the day you said arrived, du...",2


In [18]:
# remove newlines
def remove_lines(data):
    """Return `data` with line breaks and tabs replaced by single spaces.

    Three markers are replaced, in this order: a real newline character, the
    two-character literal backslash + ``n``, and a tab character.
    """
    for marker in ('\n', '\\n', '\t'):
        data = data.replace(marker, ' ')
    return data

# Contraction mapping
def expand_text(data):
    """Return `data` after passing it through ``contractions.fix``."""
    return contractions.fix(data)

# Handle accented char
def Handle_accented_chars(data):
    """Return the result of running `data` through ``unidecode``."""
    return unidecode(data)

#Tokenize
def tokenizer(data):
    """Split the string `data` into tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(data)

# Remove stopwords
# Start from NLTK's English stop-word list, then take the negation words out
# of it so that remove_stopwords keeps them in the text.
stopwords_list = stopwords.words('english')
for negation in ('not', 'no', 'nor'):
    stopwords_list.remove(negation)
def remove_stopwords(data):
    """Lower-case each token in `data` and drop those found in `stopwords_list`.

    Returns a new list containing the lower-cased tokens that were kept, in
    their original order.
    """
    kept = []
    for token in data:
        lowered = token.lower()
        if lowered not in stopwords_list:
            kept.append(lowered)
    return kept

# Clean data
def clean_data(data):
    """Filter a list of tokens, keeping only purely alphabetic words longer than two characters.

    A token is dropped when it occurs inside ``string.punctuation``, when it
    has two characters or fewer, or when ``str.isalpha`` is false for it.
    Order of the surviving tokens is preserved.
    """
    kept = []
    for token in data:
        if token in punctuation:
            continue
        if len(token) <= 2:
            continue
        if not token.isalpha():
            continue
        kept.append(token)
    return kept

# Lemmatization
def lemmmatize_data(data):
    """Return a list with every token of `data` run through ``WordNetLemmatizer.lemmatize``.

    A new lemmatizer instance is created on each call and each token is
    lemmatized with the method's default arguments.
    """
    wnl = WordNetLemmatizer()
    return list(map(wnl.lemmatize, data))

def joining_words(data):
    """Join the tokens in `data` back into one string separated by single spaces."""
    separator = ' '
    return separator.join(data)

In [19]:
def target_column(data):
    """Map a star rating to a class label.

    1 or 2 stars -> 0, 3 stars -> 1, 4 or 5 stars -> 2.
    Any other value yields ``None``.
    """
    if data in (1, 2):
        return 0
    if data == 3:
        return 1
    if data in (4, 5):
        return 2
    return None

In [20]:
# Derive the class label from the star rating via target_column
# (1-2 stars -> 0, 3 stars -> 1, 4-5 stars -> 2) and preview the result.
data['target'] = data['stars'].apply(target_column)
data.head(2)

Unnamed: 0,translated_review,stars,target
0,The glasses are not of very good quality. When...,2,0
1,"Simple, but cheap",5,2


In [21]:
# Hold out 25% of the rows for testing; the split is seeded and stratified on
# the class label so both parts keep the same label proportions.
x_train, x_test, y_train, y_test = train_test_split(
    data['translated_review'],
    data['target'],
    test_size=0.25,
    random_state=42,
    stratify=data['target'],
)

In [22]:
# Run the training text through the cleaning steps in order: whitespace
# normalisation, contraction expansion, accent handling, tokenisation,
# stop-word removal, token filtering, lemmatisation, and re-joining the tokens
# into one string per review.
clean_x_train = (
    x_train
    .apply(remove_lines)
    .apply(expand_text)
    .apply(Handle_accented_chars)
    .apply(tokenizer)
    .apply(remove_stopwords)
    .apply(clean_data)
    .apply(lemmmatize_data)
    .apply(joining_words)
)

In [23]:
# Inspect the cleaned training text.
clean_x_train

369                                    small child woman
732    sticker beautiful instruction say stuck smooth...
248    switch promise reasonable price control blind ...
58     product arrived original packaging not bad cop...
394    backpack fit well pretty interior compartment ...
                             ...                        
639           although not bad not expected case not bad
909    pretty bought gift child delighted bracelet sa...
283    not responsible product arrived poor condition...
548                              good everything problem
30                                 take gym fine problem
Name: translated_review, Length: 750, dtype: object

In [24]:
# Bag-of-words features: terms appearing in more than 95% of the documents
# are ignored and the vocabulary is capped at 1000 terms. The vectorizer is
# fitted on the cleaned training text only.
count = CountVectorizer(
    max_df=0.95,
    max_features=1000,
)
count_train_val = count.fit_transform(clean_x_train)

In [25]:
# Fit a multinomial naive Bayes classifier on the count features.
# The sparse matrix from CountVectorizer is passed as-is: MultinomialNB
# accepts sparse input, so converting it to a dense array (previously done
# through the deprecated `.A` attribute) only costs memory.
mnb_count = MultinomialNB()
mnb_count.fit(count_train_val, y_train)

In [26]:
# Persist the fitted vectorizer and classifier so they can be loaded together
# at inference time. Context managers guarantee each file is flushed and
# closed; `pickle` is imported once in the notebook's import cell.
with open('countVec.pkl', "wb") as vectorizer_file:
    pickle.dump(count, vectorizer_file)

with open('model_mnb.pkl', "wb") as model_file:
    pickle.dump(mnb_count, model_file)