In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns

### Define global constants
### SEED is passed as `random_state` to train_test_split so the split is reproducible.
SEED = 42

### Getting data, modeling & separating into train/test data
df = pd.read_csv('data/imdb-reviews-pt-br.csv')

### Encode the sentiment label as a numeric target: 'neg' -> 0, 'pos' -> 1.
### `map` is used instead of `replace(['neg', 'pos'], [0, 1])` because replacing
### strings with ints depends on pandas silently downcasting the object column,
### which is deprecated (FutureWarning since pandas 2.2).
### NOTE: any label other than 'neg'/'pos' becomes NaN here.
classification_column = df['sentiment'].map({'neg': 0, 'pos': 1})
df['classification'] = classification_column

### Define util global vars
### All reviews concatenated into one space-separated string.
all_words = ' '.join(df['text_pt'])
whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
### Needs the NLTK 'stopwords' corpus to be available locally
### (nltk.download('stopwords') on a fresh environment).
stopwords = nltk.corpus.stopwords.words('portuguese')

In [None]:
def predict_text(text_df, x_column, y_column):
  """Train a bag-of-words logistic regression and print its test accuracy.

  Args:
    text_df: DataFrame holding the texts and their labels.
    x_column: name of the column with the raw text documents.
    y_column: name of the column with the target labels.

  Returns:
    None. The mean accuracy on the held-out split is printed.
  """
  ### Split the raw texts BEFORE vectorizing so the vocabulary is learned from
  ### the training rows only; fitting the vectorizer on the full corpus would
  ### let held-out documents influence which 50 words become features.
  text_train, text_test, y_train, y_test = train_test_split(
    text_df[x_column],
    text_df[y_column],
    random_state=SEED
  )

  ### Case-sensitive counts of the 50 most frequent training terms.
  vectorize = CountVectorizer(lowercase=False, max_features=50)
  x_train = vectorize.fit_transform(text_train)
  ### Test documents are only transformed with the vocabulary fitted above.
  x_test = vectorize.transform(text_test)

  ### Predict with LogisticRegression
  logistic_regression_model = LogisticRegression()
  logistic_regression_model.fit(x_train, y_train)
  logistic_regression_accuracy = logistic_regression_model.score(x_test, y_test)

  print(logistic_regression_accuracy)

In [None]:
### Score the classifier on the unprocessed review text (`text_pt`).
predict_text(df, 'text_pt', 'classification')

In [None]:
### Create a wordcloud
def get_words_wordcloud(text_df, x_column, y_column, y_value):
  """Show a word cloud built from the texts whose `y_column` equals `y_value`.

  Args:
    text_df: DataFrame holding the texts and their labels.
    x_column: name of the column with the text documents.
    y_column: name of the column used to filter rows.
    y_value: value that `y_column` must equal for a row to be included.

  Returns:
    None. The figure is displayed with matplotlib.
  """
  ### Boolean mask instead of a string-built `query`: the f-string version always
  ### wrapped `y_value` in quotes, so numeric labels never matched, and it broke
  ### on values containing '"' or on column names that are not valid identifiers.
  matching_texts = text_df.loc[text_df[y_column] == y_value, x_column]
  all_words_based_on_y = ' '.join(matching_texts)

  wordcloud = WordCloud(
    width=800,
    height=500,
    max_font_size=110,
    collocations=False
  ).generate(all_words_based_on_y)

  plt.figure(figsize=(10, 7))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()

In [None]:
### Draw one word cloud per sentiment label: first 'pos', then 'neg'.
get_words_wordcloud(df, 'text_pt', 'sentiment', 'pos')
get_words_wordcloud(df, 'text_pt', 'sentiment', 'neg')

In [None]:
### Tokenize white-spaces, getting words frequency & plot most frequent data
def tokenize_and_plot_most_frequent_data(text_df, x_column, quantity):
  """Bar-plot the `quantity` most frequent whitespace-separated tokens.

  Args:
    text_df: DataFrame holding the texts.
    x_column: name of the column with the text documents.
    quantity: how many of the most frequent tokens to plot.

  Returns:
    None. The figure is displayed with matplotlib.
  """
  ### Join every document into one string and count its tokens.
  corpus = ' '.join(text_df[x_column])
  token_counts = nltk.FreqDist(whitespace_tokenizer.tokenize(corpus))

  frequency_table = pd.DataFrame({
    'Word': list(token_counts.keys()),
    'Frequency': list(token_counts.values()),
  })
  top_tokens = frequency_table.nlargest(quantity, 'Frequency')

  plt.figure(figsize=(12, 8))
  axes = sns.barplot(data=top_tokens, x='Word', y='Frequency', color='gray')
  axes.set_ylabel('Count')
  plt.show()

In [None]:
### Plot the 10 most frequent tokens of the unprocessed review text.
tokenize_and_plot_most_frequent_data(df, 'text_pt', 10)

In [None]:
### Removes stopwords in all phrases from DataFrame
### The result is stored in the new `filter_01` column, one filtered string per review.
all_phrases_without_stopwords = []

### Build the lookup set once: membership is tested for every token of every
### review, and a set avoids scanning the whole stopword list each time.
stopword_set = set(stopwords)

for opinion in df['text_pt']:
  phrase_words = whitespace_tokenizer.tokenize(opinion)
  ### Matching is exact, so a token that differs from a stopword in case or
  ### carries attached punctuation is kept.
  phrase_without_stopwords = [word for word in phrase_words if word not in stopword_set]

  all_phrases_without_stopwords.append(' '.join(phrase_without_stopwords))

df['filter_01'] = all_phrases_without_stopwords

In [None]:
### Score the classifier again, this time on the stopword-filtered text (`filter_01`).
predict_text(df, 'filter_01', 'classification')

In [None]:
### Plot the 10 most frequent tokens of the stopword-filtered text (`filter_01`).
tokenize_and_plot_most_frequent_data(df, 'filter_01', 10)