# ANALISE EXPLORATORIA E TRATAMENTO DE DADOS
## Dataset: Product Sentiment Classification
### Autor: Pedro Morato Lahoz

In [None]:
# Instalacao e Imports
!pip install -q kagglehub pandas numpy matplotlib seaborn scikit-learn wordcloud

import re
import unicodedata
import warnings
from collections import Counter

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("Bibliotecas importadas!")

In [None]:
# Download the dataset
# The handle is named once so the source of the data is easy to spot and change.
dataset_handle = "akash14/product-sentiment-classification"

print("Baixando dataset...")
path = kagglehub.dataset_download(dataset_handle)
print(f"Dataset em: {path}")

In [None]:
# Load the data
import os

# os.listdir returns entries in arbitrary order; sort so the listing (and the
# file chosen below) is the same on every run.
files = sorted(os.listdir(path))
print(f"Arquivos: {files}")

# Search sub-folders as well: if the download nests its data files, a
# top-level listing alone finds no CSV at all.
csv_candidates = sorted(
    os.path.relpath(os.path.join(root, name), path)
    for root, _dirs, names in os.walk(path)
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_candidates:
    raise FileNotFoundError(f"Nenhum arquivo CSV encontrado em: {path}")

# NOTE(review): when several CSVs exist, one whose file name contains "train"
# is preferred on the assumption that it is the labelled split - confirm
# against the actual dataset layout. Otherwise the first candidate is used.
csv_file = next(
    (c for c in csv_candidates if 'train' in os.path.basename(c).lower()),
    csv_candidates[0],
)
print(f"Arquivo selecionado: {csv_file}")
df = pd.read_csv(os.path.join(path, csv_file))

print("Dataset carregado!")
print(f"Shape: {df.shape}")
df.head()

In [None]:
# Basic information
# Prints a banner, the frame dimensions, the column names and dtypes, and
# finishes with pandas' own info() summary.
n_rows, n_cols = df.shape
banner = "=" * 60

print(banner, "INFORMACOES GERAIS DO DATASET", banner, sep="\n")

print("\nDimensoes:")
print(f"   Linhas: {n_rows:,}")
print(f"   Colunas: {n_cols}")

column_names = df.columns.tolist()
print(f"\nColunas: {column_names}")

print("\nTipos de dados:")
print(df.dtypes)

df.info()

In [None]:
# Descriptive statistics
# include='all' asks describe() to summarise every column, not only numeric ones.
descriptive_stats = df.describe(include='all')
descriptive_stats

In [None]:
# Missing values
null_counts = df.isnull().sum()
null_pct = (null_counts / len(df)) * 100

if null_counts.sum() > 0:
    # Show the count next to its share of all rows; the percentage was
    # previously computed but never displayed.
    null_report = pd.DataFrame({
        'nulos': null_counts,
        'percentual': null_pct.round(2),
    })
    print(null_report[null_report['nulos'] > 0].sort_values('nulos', ascending=False))

    # Bar chart restricted to the columns that actually have missing values.
    plt.figure(figsize=(10, 6))
    null_counts[null_counts > 0].plot(kind='bar', color='coral')
    plt.title('Valores Nulos por Coluna')
    plt.xlabel('Colunas')
    plt.ylabel('Quantidade')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
else:
    print("Sem valores nulos!")

In [None]:
# Sentiment distribution
# Use the first column whose name mentions "sentiment" or "label"; stop with a
# descriptive error instead of a bare IndexError when no column qualifies.
sentiment_col = next(
    (c for c in df.columns
     if 'sentiment' in str(c).lower() or 'label' in str(c).lower()),
    None,
)
if sentiment_col is None:
    raise KeyError(
        f"Nenhuma coluna de sentimento encontrada em: {df.columns.tolist()}"
    )
print(f"Coluna sentimento: {sentiment_col}")

counts = df[sentiment_col].value_counts()
print(counts)
# Share of all rows (rows with a missing label are part of the denominator).
print((counts / len(df) * 100).round(2))

# One distinct colour per class. The hand-picked trio is kept for up to three
# classes; with more classes a generated palette is used so that no two
# classes end up sharing a recycled colour.
base_colors = ['#2ecc71', '#e74c3c', '#f39c12']
if len(counts) <= len(base_colors):
    class_colors = base_colors[:len(counts)]
else:
    class_colors = list(sns.color_palette('husl', len(counts)).as_hex())

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: absolute counts per class.
counts.plot(kind='bar', ax=axes[0], color=class_colors)
axes[0].set_title('Distribuicao de Sentimentos')
axes[0].set_xlabel('Sentimento')
axes[0].set_ylabel('Quantidade')

# Right panel: the same counts as proportions.
axes[1].pie(counts, labels=counts.index, autopct='%1.1f%%',
            colors=class_colors)
axes[1].set_title('Proporcao de Sentimentos')

plt.tight_layout()
plt.show()

In [None]:
# Text analysis
TEXT_HINTS = ('text', 'review', 'comment')

# Only non-numeric columns can hold free text. Filtering on dtype first keeps a
# numeric column whose name merely contains a hint (e.g. an id column called
# "text_id") from being mistaken for the text itself.
text_candidates = [
    c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])
]
hinted = [
    c for c in text_candidates
    if any(hint in str(c).lower() for hint in TEXT_HINTS)
]
if hinted:
    text_col = hinted[0]
elif text_candidates:
    # No name matched: fall back to the non-numeric column with the longest
    # average content, which is the most text-like one available.
    text_col = max(
        text_candidates,
        key=lambda c: df[c].astype(str).where(df[c].notna(), '').str.len().mean(),
    )
else:
    raise KeyError(
        f"Nenhuma coluna de texto encontrada em: {df.columns.tolist()}"
    )
print(f"Coluna texto: {text_col}")

# Missing entries are measured as empty strings (0 characters, 0 words) rather
# than as the literal "nan" that a plain astype(str) would produce.
text_values = df[text_col].astype(str).where(df[text_col].notna(), '')
df['text_length'] = text_values.str.len()
df['word_count'] = text_values.str.split().str.len()

print(df[['text_length', 'word_count']].describe())

In [None]:
# Text size by sentiment
# Two side-by-side box plots, one per size metric, each grouped by the
# sentiment column.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    ('text_length', 'Tamanho (caracteres) por Sentimento'),
    ('word_count', 'Numero de Palavras por Sentimento'),
)
for panel_ax, (metric, panel_title) in zip(axes, panels):
    df.boxplot(column=metric, by=sentiment_col, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Blank the figure-level title so only the per-panel titles remain.
plt.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
# Most common words
def get_top_words(texts, n=20):
    """Return the ``n`` most frequent lowercase ASCII words in ``texts``.

    Args:
        texts: Iterable of text entries (a pandas Series, list, ...). Missing
            entries (None / NaN / pd.NA) are skipped so they are not counted
            as the words "none" or "nan".
        n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent.
    """
    word_pattern = re.compile(r'\b[a-z]+\b')
    tally = Counter()
    # Counting entry by entry avoids materialising one huge joined string; the
    # result is the same because a word can never span two entries.
    for entry in texts:
        if entry is None or pd.isna(entry):
            continue
        tally.update(word_pattern.findall(str(entry).lower()))
    return tally.most_common(n)

top_words = get_top_words(df[text_col], 30)
print("Top 15 palavras:")
for word, freq in top_words[:15]:
    print(f"   {word:15s}: {freq:5d}")

if top_words:
    # Dedicated names are used here so the sentiment `counts` computed in an
    # earlier cell is not overwritten by this chart's data.
    bar_labels, bar_values = zip(*top_words[:20])
    plt.figure(figsize=(12, 6))
    plt.barh(bar_labels, bar_values, color='skyblue')
    plt.xlabel('Frequencia')
    plt.title('Top 20 Palavras')
    # Most frequent word at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()
else:
    # Unpacking zip(*[]) would raise; report the empty vocabulary instead.
    print("Nenhuma palavra encontrada para plotar.")

In [None]:
# Word cloud
# Drop missing entries first: astype(str) would otherwise turn each of them
# into the token "nan" and feed it to the cloud.
all_text = ' '.join(df[text_col].dropna().astype(str))

if all_text.strip():
    wc = WordCloud(width=1600, height=800, background_color='white',
                   colormap='viridis', max_words=100).generate(all_text)

    plt.figure(figsize=(16, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud Geral')
    plt.tight_layout()
    plt.show()
else:
    # NOTE(review): WordCloud.generate is expected to raise when it receives
    # no words at all - skip the plot rather than crash the cell.
    print("Sem texto para gerar a word cloud.")

In [None]:
# Data cleaning
# Work on a copy so the exploratory frame `df` stays untouched.
df_clean = df.copy()
initial_size = len(df_clean)

# Drop rows missing either the text or the label
df_clean = df_clean.dropna(subset=[text_col, sentiment_col])
print(f"Removidos {initial_size - len(df_clean)} nulos")

# Drop repeated texts (the first occurrence is kept)
before = len(df_clean)
df_clean = df_clean.drop_duplicates(subset=[text_col])
print(f"Removidas {before - len(df_clean)} duplicatas")

# Drop texts with fewer than three words
before = len(df_clean)
# Count the words here instead of relying on a 'word_count' column created by
# another cell, which may be missing or stale when cells run out of order.
word_totals = df_clean[text_col].astype(str).str.split().str.len()
# .copy() makes df_clean an independent frame, so adding columns to it later
# is not an assignment on a filtered slice.
df_clean = df_clean[word_totals >= 3].copy()
print(f"Removidos {before - len(df_clean)} textos curtos")

# Clean the text
# Patterns are compiled once here because clean_text runs once per row.
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')
_APOSTROPHES = re.compile(r"['\u2019]")
_NON_LETTERS = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Normalise one text entry to lowercase ASCII words separated by spaces.

    Steps: lowercase, drop URLs (``http...`` and ``www....``), fold accented
    letters to their base letter, drop apostrophes, turn every remaining
    non-letter into a space, and collapse runs of whitespace.

    Args:
        text: Value to clean. Non-strings are converted with ``str``; missing
            values (None / NaN / pd.NA) yield an empty string.

    Returns:
        The cleaned string, possibly empty.
    """
    if text is None or pd.isna(text):
        return ''
    text = str(text).lower()
    text = _URL_PATTERN.sub(' ', text)
    # Decompose accented letters and drop the combining marks so "café"
    # becomes "cafe" instead of losing the letter altogether.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Apostrophes are deleted so contractions stay one token ("don't" -> "dont").
    text = _APOSTROPHES.sub('', text)
    # Other non-letters become spaces, not nothing: deleting them would fuse
    # neighbouring words ("great!!!amazing" -> "greatamazing").
    text = _NON_LETTERS.sub(' ', text)
    return ' '.join(text.split())

# assign() returns a new frame, so this works even if df_clean is currently a
# filtered slice of another frame.
df_clean = df_clean.assign(text_clean=df_clean[text_col].apply(clean_text))

# Cleaning can strip a text down to nothing (for example one made only of
# digits or links). Such rows carry no usable input and would be read back
# from CSV as missing values, so they are removed and reported.
before = len(df_clean)
df_clean = df_clean[df_clean['text_clean'] != ''].copy()
print(f"Removidos {before - len(df_clean)} textos vazios apos limpeza")

print(f"\nTamanho final: {len(df_clean):,} linhas")
# Guard the ratio: an empty input frame would otherwise divide by zero.
retention_pct = len(df_clean) / initial_size * 100 if initial_size else 0.0
print(f"Taxa retencao: {retention_pct:.2f}%")

In [None]:
# Save the cleaned dataset
# Keep the raw text, the cleaned text and the label, in that order, and give
# them fixed export names.
export_source_columns = [text_col, 'text_clean', sentiment_col]
export_names = ['text_original', 'text_clean', 'sentiment']
df_final = df_clean[export_source_columns].set_axis(export_names, axis=1)

df_final.to_csv('dataset_tratado.csv', index=False)

print("Dataset salvo: dataset_tratado.csv")
n_saved = len(df_final)
print(f"Linhas: {n_saved:,}")

df_final.head(10)