In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from string import punctuation
from unidecode import unidecode
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns

### Define global constants
SEED = 42  # shared random_state so every train/test split below is reproducible

### Load the reviews and derive a numeric target column
df = pd.read_csv('data/imdb-reviews-pt-br.csv')

# Map the textual labels to integers explicitly. `replace` on an object column
# relies on an implicit downcast that pandas has deprecated; `map` + `astype`
# always yields an int64 column. The cast also fails loudly if any label other
# than 'neg'/'pos' is present (such a label would map to NaN).
classification_column = df['sentiment'].map({'neg': 0, 'pos': 1}).astype('int64')
df['classification'] = classification_column

### Shared helpers reused by the cells below
# The whole corpus as one space-separated string.
all_words = ' '.join(df['text_pt'])

# Two NLTK tokenizers: a whitespace splitter and a word/punctuation splitter.
whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
punct_tokenizer = nltk.tokenize.WordPunctTokenizer()

# NLTK's RSLP stemmer.
stemmer = nltk.RSLPStemmer()

# Filter vocabularies: the Portuguese stopword list, every character of
# `string.punctuation`, and the two concatenated (punctuation first).
stopwords = nltk.corpus.stopwords.words('portuguese')
punctuations = list(punctuation)
punctuations_and_stopwords = [*punctuations, *stopwords]

In [None]:
def predict_text(text_df, x_column, y_column):
  """Fit a bag-of-words logistic regression and report its test accuracy.

  The texts in ``text_df[x_column]`` are vectorized with a case-sensitive
  ``CountVectorizer`` capped at 50 features, split into train/test partitions
  with ``random_state=SEED``, and used to fit a ``LogisticRegression`` with
  default settings.

  Args:
    text_df: object indexable by column name (a DataFrame in this notebook).
    x_column: name of the column holding the texts.
    y_column: name of the column holding the target labels.

  Returns:
    The score of the fitted model on the held-out partition. The value is
    still printed, so existing calls keep their output; returning it lets
    callers compare runs programmatically.
  """
  # NOTE(review): the vocabulary is learned from every row before the split,
  # so feature selection also sees the test texts — confirm this is acceptable.
  vectorizer = CountVectorizer(lowercase=False, max_features=50)
  bag_of_words = vectorizer.fit_transform(text_df[x_column])

  x_train, x_test, y_train, y_test = train_test_split(
    bag_of_words,
    text_df[y_column],
    random_state=SEED
  )

  ### Predict with LogisticRegression
  model = LogisticRegression()
  model.fit(x_train, y_train)
  accuracy = model.score(x_test, y_test)

  print(accuracy)
  return accuracy

In [None]:
### Create a wordcloud
def get_words_wordcloud(text_df, x_column, y_column, y_value):
  """Plot a word cloud built from the texts whose ``y_column`` equals ``y_value``.

  Args:
    text_df: DataFrame holding the texts and the label column.
    x_column: name of the column holding the texts.
    y_column: name of the column to filter on.
    y_value: label value selecting the rows to include.

  Raises:
    ValueError: if no row of ``text_df`` matches ``y_value``.
  """
  # Select rows with a boolean mask instead of an f-string `query`. The query
  # always quoted the value, so numeric label columns matched nothing, and it
  # broke on column names that are not identifiers or values containing quotes.
  # Comparing string forms keeps every call that worked before working.
  matching_rows = text_df[text_df[y_column].astype(str) == str(y_value)]
  if matching_rows.empty:
    raise ValueError(f'no rows where {y_column} == {y_value!r}')

  joined_text = ' '.join(matching_rows[x_column])

  # 800x500 canvas, font size capped at 110, and no collocations.
  wordcloud = WordCloud(
    width=800,
    height=500,
    max_font_size=110,
    collocations=False
  ).generate(joined_text)

  plt.figure(figsize=(10, 7))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()

In [None]:
### Whitespace-tokenize a text column and bar-plot its most frequent tokens
def tokenize_and_plot_most_frequent_data(text_df, x_column, quantity):
  """Plot the ``quantity`` most frequent whitespace-separated tokens of a column.

  Args:
    text_df: DataFrame holding the texts.
    x_column: name of the column whose texts are tokenized.
    quantity: how many of the most frequent tokens to plot.
  """
  # Treat the whole column as one corpus and count every token.
  corpus = ' '.join(text_df[x_column])
  token_counts = nltk.FreqDist(whitespace_tokenizer.tokenize(corpus))

  # One row per distinct token, then keep only the top `quantity` rows.
  frequency_table = pd.DataFrame(
    list(token_counts.items()),
    columns=['Word', 'Frequency']
  )
  top_words = frequency_table.nlargest(n=quantity, columns='Frequency')

  plt.figure(figsize=(12, 8))
  axes = sns.barplot(data=top_words, x='Word', y='Frequency', color='gray')
  axes.set(ylabel='Count')
  plt.show()

In [None]:
### Remove stopwords from every review in the DataFrame
# Tokens come from the whitespace tokenizer and are compared case-sensitively.
# Membership is tested against a set so each lookup is constant-time instead
# of a scan of the whole stopword list; the `stopwords` list is left untouched.
stopword_lookup = set(stopwords)

all_phrases_without_stopwords = [
  ' '.join(
    word
    for word in whitespace_tokenizer.tokenize(opinion)
    if word not in stopword_lookup
  )
  for opinion in df['text_pt']
]

df['filter_01'] = all_phrases_without_stopwords

In [None]:
### Remove punctuation from every review in the DataFrame
# The word/punctuation tokenizer emits punctuation as separate tokens, which
# are then dropped together with any remaining stopword. A set makes each
# membership check constant-time; the source list is left untouched.
punctuation_and_stopword_lookup = set(punctuations_and_stopwords)

all_phrases_without_punctuation = [
  ' '.join(
    word
    for word in punct_tokenizer.tokenize(opinion)
    if word not in punctuation_and_stopword_lookup
  )
  for opinion in df['filter_01']
]

df['filter_02'] = all_phrases_without_punctuation

In [None]:
### Remove accents from every review in the DataFrame
# The filter vocabulary has to be transliterated as well: once `unidecode` has
# run, an accented stopword no longer matches its accented spelling. The
# previous version filtered against the accented list, so the accent-free
# entries built here were never used in this cell.
stopwords_without_accents = [unidecode(stopword) for stopword in punctuations_and_stopwords]
accentless_stopword_lookup = set(stopwords_without_accents)  # constant-time lookups

# Transliterate each phrase, re-tokenize it, and drop the accent-free
# stopwords and punctuation in a single pass.
all_phrases_without_accents = [
  ' '.join(
    word
    for word in punct_tokenizer.tokenize(unidecode(opinion))
    if word not in accentless_stopword_lookup
  )
  for opinion in df['filter_02']
]

df['filter_03'] = all_phrases_without_accents

In [None]:
### Transform all phrases from DataFrame into lowercase
# Lowercasing can turn a capitalised token into a stopword, so the phrase is
# filtered again after the case change. Membership is tested against a set so
# each lookup is constant-time; `stopwords_without_accents` itself is unchanged.
lowercase_stopword_lookup = set(stopwords_without_accents)

all_phrases_with_lowercase = [
  ' '.join(
    word
    for word in punct_tokenizer.tokenize(opinion.lower())
    if word not in lowercase_stopword_lookup
  )
  for opinion in df['filter_03']
]

df['filter_04'] = all_phrases_with_lowercase

In [None]:
### Stem every token of every review, dropping tokens of one or two characters
def punct_tokenize(opinion):
  """Tokenize a phrase with the shared word/punctuation tokenizer."""
  return punct_tokenizer.tokenize(opinion)


def stem_words(opinion):
  """Stem each item longer than two characters and join the stems with spaces.

  As used below, ``opinion`` is the token list produced by ``punct_tokenize``.
  """
  return ' '.join(stemmer.stem(word) for word in opinion if len(word) > 2)


# Tokenize first, then stem and re-join, one review at a time.
tokenized_phrases = df['filter_04'].apply(punct_tokenize)
all_phrases_without_suffixes = tokenized_phrases.apply(stem_words)

df['filter_05'] = all_phrases_without_suffixes

In [None]:
### TF-IDF weighting (capped at 50 features) over the stemmed reviews
tfidf_vectorizer = TfidfVectorizer(max_features=50, lowercase=False)
sparse_words_with_weight = tfidf_vectorizer.fit_transform(df['filter_05'])

# Split with the shared seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  sparse_words_with_weight, df['classification'], random_state=SEED
)

### Fit a default LogisticRegression and score it on the held-out partition
# `fit` returns the estimator itself, so construction and training are chained.
logistic_regression_model = LogisticRegression().fit(x_train, y_train)
logistic_regression_accuracy = logistic_regression_model.score(x_test, y_test)

# Last expression of the cell, so the score is rendered as the cell output.
logistic_regression_accuracy

In [None]:
### Apply n-grams (unigrams and bigrams) with TF-IDF weighting, no feature cap
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False)
tfidf_vector = tfidf_vectorizer.fit_transform(df['filter_05'])

# Split with the shared seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
  tfidf_vector, df['classification'], random_state=SEED
)

### Fit a default LogisticRegression and score it on the held-out partition
# `fit` returns the estimator itself, so construction and training are chained.
logistic_regression_model = LogisticRegression().fit(x_train, y_train)
logistic_regression_accuracy = logistic_regression_model.score(x_test, y_test)

In [91]:
# One weight per vocabulary term: the first row of the fitted coefficients,
# indexed by the term each coefficient belongs to.
# `get_feature_names` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out`; fall back to the old name only when the new one
# does not exist, so the cell runs on both old and new installs.
feature_names_getter = getattr(tfidf_vectorizer, 'get_feature_names_out', None)
if feature_names_getter is None:
  feature_names_getter = tfidf_vectorizer.get_feature_names

weights = pd.DataFrame(
  logistic_regression_model.coef_[0],
  index=feature_names_getter()
)

# Only the last expression of a cell is rendered, so the first table was
# silently discarded. `display` is provided by the IPython/Jupyter kernel.
display(weights.nlargest(10, 0))
weights.nsmallest(10, 0)

Unnamed: 0,0
ruim,-11.787961
pi,-11.350174
horri,-9.719815
terri,-8.419293
nad,-7.58536
chat,-7.522077
parec,-6.656959
nenhum,-6.35156
tent,-5.755468
mal,-5.745732
