In [25]:
# Importando as principais bibliotecas
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set(style='whitegrid')

In [26]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): the data-loading cell reads from PATH = '/content/', which is
# not a folder under this mount — confirm the CSVs are really expected there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
PATH = '/content/'
REQUIRED_FILES = ['train.csv', 'test.csv', 'sample_submission.csv']

# Fail fast with a single message that lists every missing file, instead of
# printing a warning and then crashing inside the first pd.read_csv call.
missing = [fname for fname in REQUIRED_FILES
           if not os.path.exists(os.path.join(PATH, fname))]
if missing:
    raise FileNotFoundError(
        f'AVISO: arquivo(s) nao encontrado(s) no diretorio {PATH}: {missing}'
    )

# Load the data.
# NOTE(review): the recorded run of this cell stopped with
# "ParserError: Expected 9 fields in line 972, saw 10" — presumably one of the
# CSVs in PATH is truncated or badly quoted; confirm the files are intact
# before relying on any of the cells below.
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))
sample_sub = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))

print('Shapes:')
print('train', train.shape)
print('test', test.shape)
print('sample_submission', sample_sub.shape)

# Show the first rows for inspection
train.head()

ParserError: Error tokenizing data. C error: Expected 9 fields in line 972, saw 10


In [None]:
def make_label(row):
    """Turn a row's winner flags into a single class label.

    The flags are read with ``row.get`` (a missing key counts as 0) in the
    order 'winner_model_a', 'winner_model_b'. The first flag equal to 1
    decides the label ('a' or 'b'); if neither is 1 the label is 'tie'.
    """
    for column, label in (('winner_model_a', 'a'), ('winner_model_b', 'b')):
        if row.get(column, 0) == 1:
            return label
    return 'tie'

# Derive the target label for every training row, then show the class balance
labels = train.apply(make_label, axis=1)
train['label'] = labels
train['label'].value_counts()

In [None]:
def add_basic_features(df):
    """Return a copy of ``df`` extended with simple length features.

    Reads the 'prompt', 'response_a' and 'response_b' columns (each cast to
    str) and adds, in this order: 'prompt_len', 'prompt_words', 'resp_a_len',
    'resp_b_len', 'resp_a_words', 'resp_b_words' (character counts and
    whitespace-separated word counts), then 'len_diff_ab' (A minus B character
    count) and 'abs_len_diff_ab' (its absolute value). The input frame is left
    untouched.
    """
    out = df.copy()

    def n_words(text):
        return len(text.split())

    prompt = out['prompt'].astype(str)
    resp_a = out['response_a'].astype(str)
    resp_b = out['response_b'].astype(str)

    out['prompt_len'] = prompt.map(len)
    out['prompt_words'] = prompt.map(n_words)
    out['resp_a_len'] = resp_a.map(len)
    out['resp_b_len'] = resp_b.map(len)
    out['resp_a_words'] = resp_a.map(n_words)
    out['resp_b_words'] = resp_b.map(n_words)

    length_gap = out['resp_a_len'] - out['resp_b_len']
    out['len_diff_ab'] = length_gap
    out['abs_len_diff_ab'] = out['len_diff_ab'].abs()
    return out

# Build the feature-augmented copies of both splits and preview the length columns
train_f, test_f = add_basic_features(train), add_basic_features(test)
preview_cols = ['prompt_len', 'resp_a_len', 'resp_b_len', 'len_diff_ab']
train_f[preview_cols].head()

In [None]:
# TF-IDF + TruncatedSVD (one model for the prompt, one for both responses concatenated)
n_svd = 50
tfidf_options = dict(max_features=1000, ngram_range=(1, 2), stop_words='english')
tfidf_prompt = TfidfVectorizer(**tfidf_options)
tfidf_resp = TfidfVectorizer(**tfidf_options)

# Text inputs: the prompt alone, and response A + ' ' + response B as one document
prompt_text_train = train_f['prompt'].astype(str)
prompt_text_test = test_f['prompt'].astype(str)
resp_text_train = train_f['response_a'].astype(str) + ' ' + train_f['response_b'].astype(str)
resp_text_test = test_f['response_a'].astype(str) + ' ' + test_f['response_b'].astype(str)

# Vocabulary and IDF weights are learned on the training split only
prompt_tfidf_train = tfidf_prompt.fit(prompt_text_train.tolist()).transform(prompt_text_train)
resp_tfidf_train = tfidf_resp.fit(resp_text_train.tolist()).transform(resp_text_train)
prompt_tfidf_test = tfidf_prompt.transform(prompt_text_test)
resp_tfidf_test = tfidf_resp.transform(resp_text_test)

# Reduce each TF-IDF matrix to n_svd dense components (fit on train, applied to both splits)
svd_prompt = TruncatedSVD(n_components=n_svd, random_state=42)
svd_resp = TruncatedSVD(n_components=n_svd, random_state=42)
prompt_svd_train = svd_prompt.fit(prompt_tfidf_train).transform(prompt_tfidf_train)
resp_svd_train = svd_resp.fit(resp_tfidf_train).transform(resp_tfidf_train)
prompt_svd_test = svd_prompt.transform(prompt_tfidf_test)
resp_svd_test = svd_resp.transform(resp_tfidf_test)

print('SVD shapes:', prompt_svd_train.shape, resp_svd_train.shape)

In [None]:
# Stack the numeric features side by side with the prompt/response SVD vectors
num_cols = ['prompt_len','prompt_words','resp_a_len','resp_b_len','resp_a_words','resp_b_words','len_diff_ab','abs_len_diff_ab']
X_num_train, X_num_test = (frame[num_cols].values for frame in (train_f, test_f))
X_train = np.concatenate([X_num_train, prompt_svd_train, resp_svd_train], axis=1)
X_test = np.concatenate([X_num_test, prompt_svd_test, resp_svd_test], axis=1)
print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

<h1>Regressão Logística</h1>

In [None]:
# Prepare the labels and the train/validation split
le = LabelEncoder()
y = le.fit_transform(train_f['label'].values)
print('Classes:', le.classes_)

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42, stratify=y)

# Train a multinomial LogisticRegression (simple, fast baseline).
# `multi_class` is intentionally not passed: scikit-learn deprecated the
# argument in 1.5, and with the lbfgs solver the default already fits a
# multinomial model when there are more than two classes.
lr = LogisticRegression(max_iter=2000, solver='lbfgs')
lr.fit(X_tr, y_tr)

# Predict probabilities and compute the log loss on the validation split
proba_val_lr = lr.predict_proba(X_val)
ll_lr = log_loss(y_val, proba_val_lr)
y_pred_lr = lr.predict(X_val)
print('LogisticRegression - Validation log loss:', ll_lr)
print('Accuracy (val) (LR):', accuracy_score(y_val, y_pred_lr))
print(classification_report(y_val, y_pred_lr, target_names=le.classes_))

In [None]:
# Refit on the full training set and score the test set.
# `multi_class` is not passed (deprecated since scikit-learn 1.5); lbfgs
# already fits a multinomial model for more than two classes.
lr_full = LogisticRegression(max_iter=2000, solver='lbfgs')
lr_full.fit(X_train, y)

proba_test_lr = lr_full.predict_proba(X_test)

df_proba = pd.DataFrame(proba_test_lr, columns=[f'prob_{c}' for c in le.classes_])
# NOTE(review): sample_sub is loaded earlier but never consulted — confirm
# that these output column names match sample_sub.columns before submitting.
map_cols = {'a':'winner_model_a','b':'winner_model_b','tie':'winner_model_tie'}
prob_cols = ['winner_model_a','winner_model_b','winner_model_tie']

sub = pd.DataFrame()
sub['id'] = test['id']
for cls in le.classes_:
    sub[map_cols[cls]] = df_proba[f'prob_{cls}'].values

# Renormalise so each row's three probabilities sum to exactly 1
sums = sub[prob_cols].sum(axis=1)
sub[prob_cols] = sub[prob_cols].div(sums, axis=0)

# Actually write the file: the message below reported it as saved, but
# nothing was ever written to disk.
sub.to_csv('submission_logistic.csv', index=False)
print('submission_logistic.csv salvo com shape', sub.shape)
sub.head()

In [None]:
# Visualizações para slides: matriz de confusão, histogramas de probabilidade, distribuição de rótulos no teste
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the baseline LR on the validation split (true vs predicted label)
conf = pd.crosstab(le.inverse_transform(y_val), le.inverse_transform(y_pred_lr))
plt.figure(figsize=(6,5))
sns.heatmap(conf, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de confusão (Logistic Regression - validação)')
plt.ylabel('Verdadeiro')
plt.xlabel('Previsto')
plt.tight_layout()
# The output folder was created but the figure was never written into it;
# save before plt.show(), which releases the figure in inline backends.
os.makedirs('figs_for_slides', exist_ok=True)
plt.savefig(os.path.join('figs_for_slides', 'confusion_matrix_lr.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Per-class probability histograms (validation split)
prob_columns = [f'prob_{c}' for c in le.classes_]
proba_df = pd.DataFrame(proba_val_lr, columns=prob_columns)
proba_df.plot.hist(bins=30, alpha=0.6, figsize=(10, 4))
plt.title('Distribuição de probabilidades por classe (validação - LR)')
plt.xlabel('Probabilidade')
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the predicted labels on the test set (argmax over class probabilities)
prob_columns = [f'prob_{c}' for c in le.classes_]
test_proba_df = pd.DataFrame(proba_test_lr, columns=prob_columns)
best_column = test_proba_df[prob_columns].idxmax(axis=1)
# Strip the 'prob_' prefix to get back the plain class name
test_proba_df['pred_label'] = best_column.map(lambda name: name.replace('prob_', ''))
counts = test_proba_df['pred_label'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=counts.index, y=counts.values, palette='muted', ax=ax)
ax.set_title('Distribuição de rótulos previstos (teste - LR)')
ax.set_xlabel('Rótulo previsto')
ax.set_ylabel('Contagem')
fig.tight_layout()
plt.show()

Calcular features de similaridade

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def _tfidf_svd_for(column):
    """Fit TF-IDF and TruncatedSVD on one training text column.

    Returns the fitted vectorizer, the fitted SVD, the train/test TF-IDF
    matrices and the train/test SVD projections, in that order. Both models
    are fitted on train_f only and then applied to test_f.
    """
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2), stop_words='english')
    vectorizer.fit(train_f[column].astype(str).tolist())
    tfidf_train = vectorizer.transform(train_f[column].astype(str))
    tfidf_test = vectorizer.transform(test_f[column].astype(str))

    reducer = TruncatedSVD(n_components=n_svd, random_state=42)
    reducer.fit(tfidf_train)
    return (vectorizer, reducer, tfidf_train, tfidf_test,
            reducer.transform(tfidf_train), reducer.transform(tfidf_test))


# Response A: its own vocabulary and its own SVD basis
(tfidf_resp_a, svd_resp_a,
 resp_a_tfidf_train, resp_a_tfidf_test,
 resp_a_svd_train, resp_a_svd_test) = _tfidf_svd_for('response_a')

# Response B: same recipe, fitted independently
(tfidf_resp_b, svd_resp_b,
 resp_b_tfidf_train, resp_b_tfidf_test,
 resp_b_svd_train, resp_b_svd_test) = _tfidf_svd_for('response_b')

# 8. Row-wise cosine similarity helper
def calculate_cosine_similarity(vec1, vec2):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Norms and dot products are taken along axis 1, so the result is a 1-D
    array with one score per row. A row in which either vector has zero
    norm gets a similarity of 0.
    """
    norm1 = np.linalg.norm(vec1, axis=1)
    norm2 = np.linalg.norm(vec2, axis=1)
    norm_product = norm1 * norm2
    # Swap zero products for 1 so the division below never divides by zero
    safe_product = np.where(norm_product == 0, 1, norm_product)

    scores = (vec1 * vec2).sum(axis=1) / safe_product
    # A zero-norm vector has no direction: report 0 for those rows
    has_zero_norm = (norm1 == 0) | (norm2 == 0)
    return np.where(has_zero_norm, 0, scores)

# Cosine-similarity features between prompt, response A and response B.
#
# A cosine only means something when both vectors live in the SAME space.
# prompt_svd_*, resp_a_svd_* and resp_b_svd_* each come from their own TF-IDF
# vocabulary and their own SVD basis, so component i of one is unrelated to
# component i of another. Every text is therefore projected here with the one
# already-fitted pair (tfidf_resp, svd_resp) before being compared.
def _embed_shared(texts):
    """Project a text Series into the shared response TF-IDF/SVD space."""
    return svd_resp.transform(tfidf_resp.transform(texts.astype(str)))

prompt_shared_train = _embed_shared(train_f['prompt'])
resp_a_shared_train = _embed_shared(train_f['response_a'])
resp_b_shared_train = _embed_shared(train_f['response_b'])

sim_p_a_train = calculate_cosine_similarity(prompt_shared_train, resp_a_shared_train)
sim_p_b_train = calculate_cosine_similarity(prompt_shared_train, resp_b_shared_train)
sim_a_b_train = calculate_cosine_similarity(resp_a_shared_train, resp_b_shared_train)

# 9. Same similarity features for test_f (models fitted on train only)
prompt_shared_test = _embed_shared(test_f['prompt'])
resp_a_shared_test = _embed_shared(test_f['response_a'])
resp_b_shared_test = _embed_shared(test_f['response_b'])

sim_p_a_test = calculate_cosine_similarity(prompt_shared_test, resp_a_shared_test)
sim_p_b_test = calculate_cosine_similarity(prompt_shared_test, resp_b_shared_test)
sim_a_b_test = calculate_cosine_similarity(resp_a_shared_test, resp_b_shared_test)

print("Shape of sim_p_a_train:", sim_p_a_train.shape)
print("Shape of sim_p_b_train:", sim_p_b_train.shape)
print("Shape of sim_a_b_train:", sim_a_b_train.shape)
print("Shape of sim_p_a_test:", sim_p_a_test.shape)
print("Shape of sim_p_b_test:", sim_p_b_test.shape)
print("Shape of sim_a_b_test:", sim_a_b_test.shape)

In [None]:
# Rebuild the base feature matrices: numeric columns followed by the prompt
# and concatenated-response SVD vectors
num_cols = ['prompt_len','prompt_words','resp_a_len','resp_b_len','resp_a_words','resp_b_words','len_diff_ab','abs_len_diff_ab']
X_num_train = train_f.loc[:, num_cols].values
X_num_test = test_f.loc[:, num_cols].values

train_parts = [X_num_train, prompt_svd_train, resp_svd_train]
test_parts = [X_num_test, prompt_svd_test, resp_svd_test]
X_train = np.hstack(train_parts)
X_test = np.hstack(test_parts)

print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)

In [None]:
# Append the three similarity scores as extra columns on the right of the base matrices
sim_cols_train = np.column_stack([sim_p_a_train, sim_p_b_train, sim_a_b_train])
sim_cols_test = np.column_stack([sim_p_a_test, sim_p_b_test, sim_a_b_test])
X_train_new = np.hstack([X_train, sim_cols_train])
X_test_new = np.hstack([X_test, sim_cols_test])

print('Shape of X_train_new:', X_train_new.shape)
print('Shape of X_test_new:', X_test_new.shape)

In [None]:
# Same seeded, stratified 80/20 split as the baseline, on the extended feature matrix
X_tr_new, X_val_new, y_tr, y_val = train_test_split(X_train_new, y, test_size=0.2, random_state=42, stratify=y)

# Train the regression on the extended feature set.
# `multi_class` is intentionally not passed: scikit-learn deprecated the
# argument in 1.5, and with the lbfgs solver the default already fits a
# multinomial model when there are more than two classes.
lr_new = LogisticRegression(max_iter=2000, solver='lbfgs')
lr_new.fit(X_tr_new, y_tr)

# Validation log loss and accuracy with the new features
proba_val_lr_new = lr_new.predict_proba(X_val_new)
ll_lr_new = log_loss(y_val, proba_val_lr_new)
y_pred_lr_new = lr_new.predict(X_val_new)
print('LogisticRegression (New Features) - Validation log loss:', ll_lr_new)
print('Accuracy (val) (LR New Features):', accuracy_score(y_val, y_pred_lr_new))
print(classification_report(y_val, y_pred_lr_new, target_names=le.classes_))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix on the validation split for the LR trained with the extra features
true_labels = le.inverse_transform(y_val)
pred_labels = le.inverse_transform(y_pred_lr_new)
conf_new = pd.crosstab(true_labels, pred_labels)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(conf_new, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Matriz de confusão (LR com Novas Features - validação)')
ax.set_ylabel('Verdadeiro')
ax.set_xlabel('Previsto')
fig.tight_layout()
plt.show()

<h1>Rede Neural</h1>

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

# Encode the string labels as integer class ids
le = LabelEncoder().fit(train_f['label'])
y = le.transform(train_f['label'])

# Standardise the features: statistics come from the training matrix and are
# then applied unchanged to the test matrix
scaler = StandardScaler().fit(X_train_new)
X_train_scaled = scaler.transform(X_train_new)
X_test_scaled = scaler.transform(X_test_new)

# One-hot targets for the softmax output
y_one_hot = to_categorical(y)

print('Shape of X_train_scaled:', X_train_scaled.shape)
print('Shape of X_test_scaled:', X_test_scaled.shape)
print('Shape of y_one_hot:', y_one_hot.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed Python, NumPy and the Keras backend before any weights are created so
# that initialisation (and the shuffling done later by model.fit) is
# repeatable, in line with the random_state=42 used by the other models here.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Small fully connected classifier: two ReLU hidden layers and a softmax
# output with one unit per one-hot class column.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(y_one_hot.shape[1], activation='softmax'),
])

model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam

# Compile the model: Adam optimiser, cross-entropy on one-hot targets, accuracy as metric
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

print("Model compiled successfully with Adam optimizer, categorical_crossentropy loss, and accuracy metric.")

In [None]:
# Train the network and track the validation loss.
#
# The validation rows are held out BEFORE fitting, with the same seeded,
# stratified split that the evaluation cell uses. The previous
# validation_split=0.2 held out the *last* 20% of rows instead, so most rows
# of the random 20% split scored afterwards had been seen in training and the
# reported validation metrics were optimistic.
X_tr_nn, X_val_nn, y_tr_nn_one_hot, y_val_nn_one_hot = train_test_split(
    X_train_scaled, y_one_hot, test_size=0.2, random_state=42, stratify=y_one_hot
)

history = model.fit(
    X_tr_nn,
    y_tr_nn_one_hot,
    epochs=50,
    batch_size=32,
    validation_data=(X_val_nn, y_val_nn_one_hot),
    verbose=1
)

print("Neural network training complete.")

In [None]:
# Re-create the seeded, stratified 80/20 split and score the network on its validation part.
# NOTE(review): these numbers are only an honest hold-out estimate if `model`
# was fitted without the X_val_nn rows — confirm against the training cell.
X_tr_nn, X_val_nn, y_tr_nn_one_hot, y_val_nn_one_hot = train_test_split(X_train_scaled, y_one_hot, test_size=0.2, random_state=42, stratify=y_one_hot)

# Integer class ids recovered from the one-hot targets
y_val_nn = np.argmax(y_val_nn_one_hot, axis=1)

proba_val_nn = model.predict(X_val_nn)

# Predicted class = most probable column
y_pred_nn = np.argmax(proba_val_nn, axis=1)

ll_nn = log_loss(y_val_nn_one_hot, proba_val_nn)

accuracy_nn = accuracy_score(y_val_nn, y_pred_nn)

print('Neural Network - Validation log loss:', ll_nn)
print('Accuracy (val) (NN):', accuracy_nn)

print('\nClassification Report (NN Validation):')
print(classification_report(y_val_nn, y_pred_nn, target_names=le.classes_))

conf_nn = pd.crosstab(le.inverse_transform(y_val_nn), le.inverse_transform(y_pred_nn))

plt.figure(figsize=(6,5))
sns.heatmap(conf_nn, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de confusão (Neural Network - validação)')
plt.ylabel('Verdadeiro')
plt.xlabel('Previsto')
plt.tight_layout()
# The output folder was created but the figure was never written into it;
# save before plt.show(), which releases the figure in inline backends.
os.makedirs('figs_for_slides', exist_ok=True)
plt.savefig(os.path.join('figs_for_slides', 'confusion_matrix_nn.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Use the trained neural network to predict class probabilities for the test set (X_test_scaled).
proba_test_nn = model.predict(X_test_scaled)
print('Shape of predicted probabilities for test set:', proba_test_nn.shape)

In [None]:
# Wrap the predicted probabilities in a frame with one 'prob_<class>' column per class
df_proba_nn = pd.DataFrame(proba_test_nn, columns=[f'prob_{c}' for c in le.classes_])

# Map each class label to its submission column.
# NOTE(review): sample_sub is loaded earlier but never consulted — confirm
# that these output column names match sample_sub.columns before submitting.
map_cols = {'a':'winner_model_a','b':'winner_model_b','tie':'winner_model_tie'}
prob_cols_nn = ['winner_model_a','winner_model_b','winner_model_tie']

# Start the submission frame from the test ids
sub_nn = pd.DataFrame()
sub_nn['id'] = test['id']

# Copy each class probability into its submission column
for cls in le.classes_:
    sub_nn[map_cols[cls]] = df_proba_nn[f'prob_{cls}'].values

# Renormalise so each row's three probabilities sum to exactly 1
sums_nn = sub_nn[prob_cols_nn].sum(axis=1)
sub_nn[prob_cols_nn] = sub_nn[prob_cols_nn].div(sums_nn, axis=0)

# Actually write the file: the message below reported it as saved, but
# nothing was ever written to disk.
sub_nn.to_csv('submission_nn.csv', index=False)
print('submission_nn.csv salvo com shape', sub_nn.shape)
sub_nn.head()