In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

In [9]:
# Link dos dados: https://www.kaggle.com/datasets/ymanojkumar023/kumarmanoj-bag-of-words-meets-bags-of-popcorn?select=labeledTrainData.tsv
# Importando os dados

dados = pd.read_csv("labeledTrainData.tsv", delimiter="\t", quoting=3)
dados.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [10]:
# Verificando as dimensões do Dataset

dados.shape

(25000, 3)

In [11]:
# Baixando as stopwords

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rodolfo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Limpeza dos dados

def limpeza(review):
    review = BeautifulSoup(review).get_text()
    review = re.sub("[^a-zA-Z]", " ", review)
    review = review.lower()
    review = review.split()
    swords = set(stopwords.words("english"))
    review = [w for w in review if w not in swords]
    return(" ".join(review))

In [13]:
dados_limpos = []
for t in range(len(dados["review"])):
    if (t+1)%1000 == 0:
        print("Número de Reviews Processados: ", t+1)
    dados_limpos.append(limpeza(dados["review"][t]))


  review = BeautifulSoup(review).get_text()


Número de Reviews Processados:  1000
Número de Reviews Processados:  2000
Número de Reviews Processados:  3000
Número de Reviews Processados:  4000
Número de Reviews Processados:  5000
Número de Reviews Processados:  6000
Número de Reviews Processados:  7000
Número de Reviews Processados:  8000
Número de Reviews Processados:  9000
Número de Reviews Processados:  10000
Número de Reviews Processados:  11000
Número de Reviews Processados:  12000
Número de Reviews Processados:  13000
Número de Reviews Processados:  14000
Número de Reviews Processados:  15000
Número de Reviews Processados:  16000
Número de Reviews Processados:  17000
Número de Reviews Processados:  18000
Número de Reviews Processados:  19000
Número de Reviews Processados:  20000
Número de Reviews Processados:  21000
Número de Reviews Processados:  22000
Número de Reviews Processados:  23000
Número de Reviews Processados:  24000
Número de Reviews Processados:  25000


In [14]:
# Train and Test

x = dados_limpos
y = np.array(dados["sentiment"])

train_x, test_x, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [15]:
# Criando a bag of Words

vectorizer = CountVectorizer()
train_x = vectorizer.fit_transform(train_x)

In [16]:
train_x

<20000x67629 sparse matrix of type '<class 'numpy.int64'>'
	with 1951230 stored elements in Compressed Sparse Row format>

In [17]:
train_x = train_x.toarray()

In [18]:
train_x.shape, y_train.shape

((20000, 67629), (20000,))

In [19]:
# Construindo a Random Forest Model

RDM = RandomForestClassifier(n_estimators=100, random_state=84)
RDM.fit(train_x, y_train)

In [20]:
# Utilizando nos dados de teste

teste_X = vectorizer.transform(test_x)
teste_X

<5000x67629 sparse matrix of type '<class 'numpy.int64'>'
	with 485747 stored elements in Compressed Sparse Row format>

In [21]:
teste_X = teste_X.toarray()

In [22]:
# Predizendo

teste_predicao = RDM.predict(teste_X)
acc = roc_auc_score(y_test, teste_predicao)

In [23]:
print("Acurácia: % ", acc*100)

Acurácia: %  86.10254739574424
