# Senti-Pred: Pipeline Completo de Análise de Sentimentos
## Autor: Pedro Morato Lahoz

---

### Índice:
1. [Configuração Inicial](#config)
2. [Análise Exploratória (EDA)](#eda)
3. [Pré-processamento](#preprocessing)
4. [Modelagem](#modeling)
5. [Avaliação](#evaluation)
6. [Deploy com Docker](#deploy)

---
## 1. Configuração Inicial <a id='config'></a>

In [7]:
# Setup cell for running on Google Colab
# - Clones the repository (if needed)
# - Installs dependencies from requirements.txt
# - Creates the expected folders and downloads the NLTK resources
import os
import sys
from pathlib import Path
IN_COLAB = 'google.colab' in sys.modules
REPO = 'PedroM2626/Senti-Pred'
CLONE_DIR = Path('/content/Senti-Pred')

if IN_COLAB:
    if not CLONE_DIR.exists():
        print('Clonando repositório do GitHub...')
        get_ipython().system(f'git clone https://github.com/{REPO}.git {CLONE_DIR}')
    # os.chdir replaces the `%cd` magic so the cell is plain Python; the guard
    # keeps the step best-effort when the clone above did not produce the folder.
    if CLONE_DIR.is_dir():
        os.chdir(CLONE_DIR)
    else:
        print(f'[WARN] Diretório {CLONE_DIR} não encontrado — permanecendo em {os.getcwd()}')
    print('Instalando dependências (requirements.txt)...')
    get_ipython().system('pip install -q -r requirements.txt || true')
else:
    print('Não detectado Colab — assumindo execução local. Verifique dependências manualmente.')

# Make sure the directories used by the notebook exist (relative to the cwd)
for required_dir in ('data/raw', 'reports/visualizacoes', 'reports/metrics', 'src/models'):
    os.makedirs(required_dir, exist_ok=True)

# NLTK resources (English)
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the ids that newer NLTK
# releases load for word_tokenize / pos_tag; the older ids stay for older releases.
import nltk
nltk_resources = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4',
                  'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng']
failed_resources = []
for r in nltk_resources:
    try:
        # nltk.download reports failure through its return value
        if not nltk.download(r, quiet=True):
            failed_resources.append(r)
    except Exception:
        # Still best-effort (e.g. no network), but no longer silent
        failed_resources.append(r)
if failed_resources:
    print('[WARN] Recursos NLTK não baixados:', ', '.join(failed_resources))

print('Setup concluído. IN_COLAB=', IN_COLAB)


Não detectado Colab — assumindo execução local. Verifique dependências manualmente.
Setup concluído. IN_COLAB= False


In [8]:
import os
import re
import json
import time
import warnings
warnings.filterwarnings('ignore')
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize
from pathlib import Path
sns.set(style='whitegrid')


def _find_project_root(start):
    """Return the closest directory at or above *start* that holds the raw training CSV.

    Falls back to *start* itself (made absolute) when no ancestor contains
    ``data/raw/twitter_training.csv``, which matches the previous cwd-based behavior.
    """
    start = os.path.abspath(start)
    current = start
    while True:
        if os.path.isfile(os.path.join(current, 'data', 'raw', 'twitter_training.csv')):
            return current
        parent = os.path.dirname(current)
        if parent == current:
            # Reached the filesystem root without finding the dataset
            return start
        current = parent


# Resolve BASE_DIR robustly for scripts, Colab and local notebooks:
# - scripts: the parent of the folder containing this file (__file__ is defined)
# - Colab: the cloned repository in /content/Senti-Pred, when it exists
# - otherwise: the cwd, or the nearest ancestor of the cwd that holds the dataset
#   (a notebook may be started from a subfolder of the project)
try:
    BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
except NameError:
    # __file__ is undefined in interactive sessions
    colab_clone = Path('/content/Senti-Pred')
    # IN_COLAB only exists when the setup cell has been executed
    if globals().get('IN_COLAB') and colab_clone.exists():
        BASE_DIR = str(colab_clone.resolve())
    else:
        BASE_DIR = _find_project_root(os.getcwd())

# Make sure BASE_DIR exists; otherwise fall back to the cwd
if not os.path.exists(BASE_DIR):
    BASE_DIR = os.getcwd()

TRAIN_RAW = os.path.join(BASE_DIR, 'data', 'raw', 'twitter_training.csv')
VAL_RAW = os.path.join(BASE_DIR, 'data', 'raw', 'twitter_validation.csv')
VIS_DIR = os.path.join(BASE_DIR, 'reports', 'visualizacoes')
METRICS_DIR = os.path.join(BASE_DIR, 'reports', 'metrics')
MODEL_PATH = os.path.join(BASE_DIR, 'src', 'models', 'sentiment_model.pkl')
os.makedirs(VIS_DIR, exist_ok=True)
os.makedirs(METRICS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# NLTK resources (English)
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the ids that newer NLTK
# releases load for word_tokenize / pos_tag; the older ids stay for older releases.
nltk_resources = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4',
                  'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng']
failed_resources = []
for r in nltk_resources:
    try:
        # nltk.download reports failure through its return value
        if not nltk.download(r, quiet=True):
            failed_resources.append(r)
    except Exception:
        # Still best-effort (e.g. no network), but no longer silent
        failed_resources.append(r)
if failed_resources:
    print('[WARN] Recursos NLTK não baixados:', ', '.join(failed_resources))
print('[OK] Configuracao inicial pronta — BASE_DIR:', BASE_DIR)


[OK] Configuracao inicial pronta — BASE_DIR: /workspaces/Senti-Pred/notebooks


---
## 2. Análise Exploratória (EDA) <a id='eda'></a>

In [3]:
# Load both raw CSVs (header-less files with four columns)
if not (os.path.exists(TRAIN_RAW) and os.path.exists(VAL_RAW)):
    raise FileNotFoundError(f"Esperado arquivos em '{TRAIN_RAW}' e '{VAL_RAW}' — coloque os CSVs em data/raw/")
cols = ['tweet_id', 'entity', 'sentiment', 'text']
read_opts = dict(names=cols, header=None, engine='python', encoding='utf-8')
df_train = pd.read_csv(TRAIN_RAW, **read_opts)
df_val = pd.read_csv(VAL_RAW, **read_opts)
# Tag each row with its origin before stacking the two splits
df_train['split'] = 'train'
df_val['split'] = 'validation'
df = pd.concat([df_train, df_val], ignore_index=True)
print(f"Dados carregados: train={len(df_train)} | validation={len(df_val)} | total={len(df)}")

# EDA 1/2: distribution of the number of whitespace-separated words per text
text_col = 'text'


def _word_count(raw_text):
    """Count whitespace-separated tokens in one text."""
    return len(raw_text.split())


df['text_length'] = df[text_col].astype(str).map(_word_count)
length_fig, length_ax = plt.subplots(figsize=(10,5))
sns.histplot(df['text_length'], bins=40, kde=True, ax=length_ax)
length_ax.set_title('Distribuição de comprimento de texto')
length_ax.set_xlabel('Número de palavras')
length_fig.tight_layout()
length_fig.savefig(os.path.join(VIS_DIR, 'text_length.png'))
plt.close(length_fig)

# EDA 2/2: 20 most frequent lower-cased tokens, before any cleaning
joined_corpus = ' '.join(df[text_col].astype(str))
all_words = joined_corpus.lower().split()
top_raw = pd.Series(all_words).value_counts().head(20)
words_fig, words_ax = plt.subplots(figsize=(12,5))
top_raw.plot(kind='bar', ax=words_ax)
words_ax.set_title('Top words (raw)')
words_fig.tight_layout()
words_fig.savefig(os.path.join(VIS_DIR, 'top_words_raw.png'))
plt.close(words_fig)
print('[OK] EDA concluída — imagens em reports/visualizacoes')

Dados carregados: train=74682 | validation=1000 | total=75682
[OK] EDA concluída — imagens em reports/visualizacoes


---
## 3. Pré-processamento <a id='preprocessing'></a>

In [4]:
# Single WordNet lemmatizer instance created once for the cell; lemmatize_text_en uses it for every token.
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    """Normalize one raw text for the later tokenization steps.

    The text is lower-cased, then URLs, @mentions, #hashtags, punctuation
    and digits are removed (in that order), and runs of whitespace are
    collapsed to a single space.

    Args:
        text: The raw text. Anything that is not a ``str`` is treated as missing.

    Returns:
        The cleaned text with no leading/trailing whitespace, or ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''
    # (pattern, replacement) pairs, applied sequentially; order matters because
    # URLs and mentions must go before their punctuation is stripped.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'@\w+|#\w+', ''),
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
_EN_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stop-word set, reading it from the NLTK corpus only once."""
    global _EN_STOPWORDS
    if _EN_STOPWORDS is None:
        _EN_STOPWORDS = frozenset(stopwords.words('english'))
    return _EN_STOPWORDS


def remove_stopwords_en(text):
    """Drop English stop words from a text.

    The stop-word set is cached across calls: this function is applied once
    per row, and rebuilding the set on every call repeats the same corpus
    read for no benefit.

    Args:
        text: The text to filter. Anything that is not a ``str`` is treated as missing.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''
    stop_words = _english_stopwords()
    tokens = word_tokenize(text, language='english')
    # Membership is tested on the lower-cased token; the kept token keeps its case
    return ' '.join(w for w in tokens if w.lower() not in stop_words)
def lemmatize_text_en(text):
    """Lemmatize every token of an English text with WordNet.

    Tokens are POS-tagged with ``nltk.pos_tag`` so the lemmatizer receives
    the matching WordNet category. If tagging raises, every token is
    lemmatized as a noun instead.

    Args:
        text: The text to lemmatize. Anything that is not a ``str`` is treated as missing.

    Returns:
        The lemmas joined by single spaces, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    def _wordnet_pos_for(tag):
        # Only the first letter of the tag decides the category;
        # an empty or unrecognized tag falls back to noun.
        initial = tag[:1]
        if initial == 'J':
            return wordnet.ADJ
        if initial == 'V':
            return wordnet.VERB
        if initial == 'R':
            return wordnet.ADV
        return wordnet.NOUN

    words = word_tokenize(text, language='english')
    try:
        tagged = nltk.pos_tag(words)
    except Exception:
        # Tagger unavailable: keep going with empty tags (treated as nouns)
        tagged = [(word, '') for word in words]
    return ' '.join(
        lemmatizer.lemmatize(word, _wordnet_pos_for(tag)) for word, tag in tagged
    )
print('[OK] Funções de pré-processamento definidas')
# Run the preprocessing chain on copies so the raw frames stay untouched.
# Each step reads the column written by the previous one.
df_train_proc = df_train.copy()
df_val_proc = df_val.copy()
preprocessing_steps = (
    ('text', 'text_clean', clean_text),
    ('text_clean', 'text_no_stop', remove_stopwords_en),
    ('text_no_stop', 'text_lemmatized', lemmatize_text_en),
)
for frame in (df_train_proc, df_val_proc):
    for source_col, target_col, step in preprocessing_steps:
        frame[target_col] = frame[source_col].apply(step)
print('[OK] Pré-processamento aplicado (dados em memória)')

[OK] Funções de pré-processamento definidas
[OK] Pré-processamento aplicado (dados em memória)


---
## 4. Modelagem <a id='modeling'></a>

In [5]:
# Build the X/y series for each split and drop rows whose lemmatized text is empty
X_train = df_train_proc['text_lemmatized'].astype(str)
y_train = df_train_proc['sentiment']
X_val = df_val_proc['text_lemmatized'].astype(str)
y_val = df_val_proc['sentiment']
mask_train = X_train.str.strip().replace('', np.nan).notna()
mask_val = X_val.str.strip().replace('', np.nan).notna()
X_train, y_train = X_train[mask_train], y_train[mask_train]
X_val, y_val = X_val[mask_val], y_val[mask_val]
print(f'Treino: {len(X_train)} | Validation: {len(X_val)}')


def _validation_scores(fitted_pipe, X):
    """Return class scores for X: probabilities if available, else decision values, else None."""
    try:
        return fitted_pipe.predict_proba(X)
    except Exception:
        pass
    try:
        margins = fitted_pipe.decision_function(X)
        if margins.ndim == 1:
            # One-dimensional output: expand to two columns (negated, original)
            margins = np.vstack([-margins, margins]).T
        return margins
    except Exception:
        return None


def _metric_or_none(metric_fn, *args, **kwargs):
    """Evaluate a metric, returning None instead of raising when it cannot be computed."""
    try:
        return metric_fn(*args, **kwargs)
    except Exception:
        return None


# Candidate classifiers; each one is trained on the same TF-IDF features
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000, random_state=42),
    'MultinomialNB': MultinomialNB(),
    'LinearSVC': LinearSVC(max_iter=20000, random_state=42)
}
results = {}
for name, clf in models.items():
    print(f"[MODEL] Treinando {name}...")
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=15000, ngram_range=(1,2))),
        ('clf', clf),
    ])

    # Wall-clock time of fitting and of predicting the validation split
    fit_start = time.time()
    pipe.fit(X_train, y_train)
    train_time = time.time() - fit_start
    predict_start = time.time()
    preds = pipe.predict(X_val)
    predict_time = time.time() - predict_start

    acc = accuracy_score(y_val, preds)
    f1 = f1_score(y_val, preds, average='macro')
    report = classification_report(y_val, preds, output_dict=True)
    cm = confusion_matrix(y_val, preds)

    # One-vs-rest indicator matrix of the validation labels for the ranking metrics
    classes = np.unique(y_val)
    y_val_b = label_binarize(y_val, classes=classes)
    y_score = _validation_scores(pipe, X_val)

    # Ranking metrics are only computed when scores and indicators have the same number of columns
    roc_auc_macro = None
    avg_precision_macro = None
    if y_score is not None and y_score.shape[1] == y_val_b.shape[1]:
        roc_auc_macro = _metric_or_none(
            roc_auc_score, y_val_b, y_score, average='macro', multi_class='ovr'
        )
        avg_precision_macro = _metric_or_none(
            average_precision_score, y_val_b, y_score, average='macro'
        )

    results[name] = {
        'pipeline': pipe,
        'accuracy': acc,
        'f1_macro': f1,
        'roc_auc_macro': roc_auc_macro,
        'average_precision_macro': avg_precision_macro,
        'train_time_seconds': train_time,
        'predict_time_seconds': predict_time,
        'report': report,
        'confusion_matrix': cm.tolist(),
        'y_score': y_score,
    }
    print(f'[RESULT] {name} — Accuracy: {acc:.4f} | F1-macro: {f1:.4f} | ROC-AUC(macro): {str(roc_auc_macro)} | AP(macro): {str(avg_precision_macro)}')
    print(classification_report(y_val, preds))

Treino: 72306 | Validation: 999
[MODEL] Treinando LogisticRegression...
[RESULT] LogisticRegression — Accuracy: 0.8949 | F1-macro: 0.8917 | ROC-AUC(macro): 0.9804640563461514 | AP(macro): 0.9486615472017691
              precision    recall  f1-score   support

  Irrelevant       0.87      0.85      0.86       172
    Negative       0.87      0.94      0.90       265
     Neutral       0.94      0.85      0.89       285
    Positive       0.90      0.92      0.91       277

    accuracy                           0.89       999
   macro avg       0.89      0.89      0.89       999
weighted avg       0.90      0.89      0.89       999

[MODEL] Treinando MultinomialNB...
[RESULT] LogisticRegression — Accuracy: 0.8949 | F1-macro: 0.8917 | ROC-AUC(macro): 0.9804640563461514 | AP(macro): 0.9486615472017691
              precision    recall  f1-score   support

  Irrelevant       0.87      0.85      0.86       172
    Negative       0.87      0.94      0.90       265
     Neutral       0.94  

---
## 5. Avaliação <a id='evaluation'></a>

In [6]:
# Pick the model with the best macro-F1 and persist its pipeline
best = max(results, key=lambda k: results[k]['f1_macro'])
best_pipeline = results[best]['pipeline']
joblib.dump(best_pipeline, MODEL_PATH)
print(f'[OK] Melhor modelo: {best} salvo em: {MODEL_PATH}')

# Save the metrics as JSON (the fitted pipelines and raw score arrays are not exported)
metrics_out = {'best_model': best, 'results': {}}
for k in results:
    metrics_out['results'][k] = {
        'accuracy': results[k]['accuracy'],
        'f1_macro': results[k]['f1_macro'],
        'roc_auc_macro': results[k].get('roc_auc_macro'),
        'average_precision_macro': results[k].get('average_precision_macro'),
        'train_time_seconds': results[k].get('train_time_seconds'),
        'predict_time_seconds': results[k].get('predict_time_seconds'),
        'classification_report': results[k]['report'],
        'confusion_matrix': results[k]['confusion_matrix']
    }
metrics_path = os.path.join(METRICS_DIR, 'model_metrics.json')
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics_out, f, indent=2)
print(f"[OK] Métricas salvas em: {metrics_path}")

# Comparative plots share the one-vs-rest indicator matrix of the validation labels
classes_all = np.unique(y_val)
y_val_b_all = label_binarize(y_val, classes=classes_all)


def _plot_comparative_curves(model_results, y_true_bin, curve_xy, summary_metric, metric_tag):
    """Draw one curve per model on the current figure.

    Args:
        model_results: Mapping of model name to its result dict (reads 'y_score').
        y_true_bin: Indicator matrix of the true labels.
        curve_xy: Callable(flat_true, flat_score) -> (x_values, y_values).
        summary_metric: Callable(y_true_bin, y_score) -> float shown in the legend.
        metric_tag: Short legend prefix for the summary value (e.g. 'AUC').

    Returns:
        True if at least one curve was drawn.

    NOTE: the curve is computed from the flattened labels/scores, whereas the
    legend value comes from ``summary_metric`` (macro-averaged by the callers).
    """
    drawn = False
    for name in model_results:
        y_score = model_results[name].get('y_score')
        if y_score is None:
            continue
        try:
            xs, ys = curve_xy(y_true_bin.ravel(), y_score.ravel())
            try:
                metric = summary_metric(y_true_bin, y_score)
            except Exception:
                metric = None
            label = f"{name}"
            if metric is not None:
                label += f" ({metric_tag}={metric:.3f})"
            plt.plot(xs, ys, lw=2, label=label)
            drawn = True
        except Exception:
            # Scores that cannot be turned into a curve are skipped, not fatal
            continue
    return drawn


def _roc_xy(flat_true, flat_score):
    """Return (false positive rate, true positive rate) points."""
    fpr, tpr, _ = roc_curve(flat_true, flat_score)
    return fpr, tpr


def _pr_xy(flat_true, flat_score):
    """Return (recall, precision) points, i.e. in x/y plotting order."""
    precision, recall, _ = precision_recall_curve(flat_true, flat_score)
    return recall, precision


# Comparative ROC
roc_fig = plt.figure(figsize=(8,6))
plotted_any = _plot_comparative_curves(
    results, y_val_b_all, _roc_xy,
    lambda yt, ys: roc_auc_score(yt, ys, average='macro', multi_class='ovr'),
    'AUC',
)
if plotted_any:
    plt.plot([0,1],[0,1],'k--', linewidth=0.5)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Comparative ROC Curves (all models)')
    plt.legend(loc='lower right')
    plt.tight_layout()
    roc_path = os.path.join(VIS_DIR, 'comparison_roc.png')
    plt.savefig(roc_path)
    plt.close(roc_fig)
    print(f'[OK] ROC comparativo salvo em: {roc_path}')
else:
    # Close the empty figure too, otherwise it stays open
    plt.close(roc_fig)
    print('[WARN] Nenhum score disponível para plotting ROC comparativo')

# Comparative Precision-Recall
pr_fig = plt.figure(figsize=(8,6))
plotted_any = _plot_comparative_curves(
    results, y_val_b_all, _pr_xy,
    lambda yt, ys: average_precision_score(yt, ys, average='macro'),
    'AP',
)
if plotted_any:
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Comparative Precision-Recall Curves (all models)')
    plt.legend(loc='lower left')
    plt.tight_layout()
    pr_path = os.path.join(VIS_DIR, 'comparison_pr.png')
    plt.savefig(pr_path)
    plt.close(pr_fig)
    print(f'[OK] Precision-Recall comparativo salvo em: {pr_path}')
else:
    # Close the empty figure too, otherwise it stays open
    plt.close(pr_fig)
    print('[WARN] Nenhum score disponível para plotting Precision-Recall comparativo')

# Confusion matrices side by side, on a shared color scale so they are comparable
model_names = list(results.keys())
cms = [np.array(results[nm]['confusion_matrix']) for nm in model_names]
if cms:
    vmax = max(cm.max() for cm in cms)
    fig, axes = plt.subplots(1, len(model_names), figsize=(6 * len(model_names), 5))
    if len(model_names) == 1:
        # plt.subplots returns a bare Axes (not an array) for a single panel
        axes = [axes]
    for ax, nm, cm in zip(axes, model_names, cms):
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes_all, yticklabels=classes_all, vmin=0, vmax=vmax, ax=ax)
        ax.set_title(f'Confusion — {nm}')
        ax.set_xlabel('Predito')
        ax.set_ylabel('Real')
    plt.tight_layout()
    cm_path = os.path.join(VIS_DIR, 'comparison_confusion_matrices.png')
    plt.savefig(cm_path)
    plt.close(fig)
    print(f'[OK] Matrizes de confusão comparativas salvas em: {cm_path}')
else:
    print('[WARN] Nenhuma matriz de confusão disponível para plotagem')

[OK] Melhor modelo: LinearSVC salvo em: /workspaces/Senti-Pred/src/models/sentiment_model.pkl
[OK] Métricas salvas em: /workspaces/Senti-Pred/reports/metrics/model_metrics.json
[OK] ROC comparativo salvo em: /workspaces/Senti-Pred/reports/visualizacoes/comparison_roc.png
[OK] Precision-Recall comparativo salvo em: /workspaces/Senti-Pred/reports/visualizacoes/comparison_pr.png
[OK] Matrizes de confusão comparativas salvas em: /workspaces/Senti-Pred/reports/visualizacoes/comparison_confusion_matrices.png


---
## 6. Deploy com Docker <a id='deploy'></a>
(Se desejar, execute as células abaixo para gerar artefatos Docker/README para deploy)

In [None]:
# Optional: write a simplified Dockerfile for the API into src/api/
dockerfile_content = '''
FROM python:3.9-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
CMD ["python", "manage.py", "runserver", "0.0.0.0:8000"]
'''
dockerfile_path = os.path.join(BASE_DIR, 'src', 'api', 'Dockerfile')
# src/api is not created by the earlier cells; without this the open() below
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(dockerfile_path), exist_ok=True)
with open(dockerfile_path, 'w', encoding='utf-8') as f:
    f.write(dockerfile_content)
print('[OK] Dockerfile criado (src/api/Dockerfile)')