In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from tqdm import tqdm

In [None]:
def special_fill_nan(df: pd.DataFrame, exclude=("id", "Personality", "label")):
    """Impute missing feature values from the row's other observed features.

    For every missing cell, each observed feature of the same row "votes":
    the vote is the mode of the missing column among all rows that share the
    row's value for that observed feature. The imputed value is the rounded
    mean of the votes.

    Parameters
    ----------
    df : pd.DataFrame
        Input data. It is not modified; a copy is processed.
    exclude : iterable of str
        Columns that are neither imputed nor used as predictors. The target
        column ``label`` is excluded by default so that the imputation does
        not leak the target into the features.

    Returns
    -------
    pd.DataFrame
        New frame (fresh RangeIndex) in which "Yes"/"No" columns are encoded
        as 1/0 and every non-excluded column is cast to ``int``.

    Raises
    ------
    ValueError
        If a column that needs imputing has no observed value at all.
    """
    df = df.copy()  # keep the caller's frame untouched
    columns = [column for column in df.columns if column not in exclude]

    for col in columns:
        if df[col].dropna().isin(["Yes", "No"]).all():  # Converts "Yes" and "No" columns to numerical booleans
            df[col] = df[col].map({"Yes": 1, "No": 0})

    # (base_column, base_value, nan_column) -> conditional mode, or None when the
    # matching rows hold no observed value. Cached because many rows share values.
    mode_cache = {}

    def conditional_mode(base_column, base_value, nan_column):
        key = (base_column, base_value, nan_column)
        if key not in mode_cache:
            # Boolean mask instead of df.query(f"..."): no string evaluation,
            # and column names need not be valid identifiers.
            modes = df.loc[df[base_column] == base_value, nan_column].mode()
            mode_cache[key] = modes.iloc[0] if len(modes) else None
        return mode_cache[key]

    records = df.to_dict(orient="records")

    for record in tqdm(records):
        # Both lists are fixed before filling so that values imputed for this
        # row are never reused as predictors for its other missing cells.
        nan_columns = [c for c in columns if pd.isna(record[c])]
        base_columns = [c for c in columns if not pd.isna(record[c])]

        for nan_column in nan_columns:  # For every empty column in a row
            votes = [conditional_mode(b, record[b], nan_column) for b in base_columns]
            votes = [vote for vote in votes if vote is not None]

            if votes:
                estimate = sum(votes) / len(votes)
            else:
                # No usable predictor (e.g. the whole row is empty): fall back
                # to the column's overall mode.
                fallback = df[nan_column].mode()
                if fallback.empty:
                    raise ValueError(
                        f"Column {nan_column!r} has no observed values to impute from"
                    )
                estimate = fallback.iloc[0]

            record[nan_column] = int(round(estimate))

    # Passing columns= keeps the schema even when the input has no rows.
    new_df = pd.DataFrame(records, columns=df.columns)
    new_df[columns] = new_df[columns].astype(int)

    return new_df


def standard_fill_nan(df: pd.DataFrame, strategy="mode"):
    """Fill missing values column by column with the column's mode or mean.

    "Yes"/"No" columns are first encoded as 1/0. Every column except ``id``
    and ``Personality`` is then filled with the rounded statistic and cast to
    ``int``.

    Parameters
    ----------
    df : pd.DataFrame
        Input data. It is not modified; a filled copy is returned.
    strategy : {"mode", "mean"}
        Statistic used as the fill value.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the processed columns filled and cast to ``int``.

    Raises
    ------
    ValueError
        If ``strategy`` is unknown, or a column has no observed value to
        derive a fill value from.
    """
    # Validate up front: previously an unknown strategy left fill_value unbound.
    if strategy not in ("mode", "mean"):
        raise ValueError(f"Unknown strategy {strategy!r}; expected 'mode' or 'mean'")

    df = df.copy()  # keep the caller's frame untouched
    columns = [column for column in df.columns if (column != 'id' and column != 'Personality')]

    for col in columns:
        if df[col].dropna().isin(["Yes", "No"]).all():  # Converts "Yes" and "No" columns to numerical booleans
            df[col] = df[col].map({"Yes": 1, "No": 0})

        if not df[col].notna().any():
            raise ValueError(f"Column {col!r} has no observed values to impute from")

        if strategy == "mode":
            # mode() is sorted, so ties resolve to the smallest value.
            fill_value = round(df[col].mode().iloc[0])
        else:
            fill_value = round(df[col].mean())

        df[col] = df[col].fillna(fill_value).round().astype(int)

    return df

def preprocess(df: pd.DataFrame, strategy="mode"):
    """Dispatch to the imputation routine selected by ``strategy``.

    Parameters
    ----------
    df : pd.DataFrame
        Data to impute.
    strategy : {"mode", "mean", "special"}
        "mode" and "mean" use ``standard_fill_nan``; "special" uses
        ``special_fill_nan``.

    Returns
    -------
    pd.DataFrame
        The frame returned by the selected routine.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names. Previously this
        case returned ``None`` and failed later at the first use.
    """
    if strategy in ("mean", "mode"):
        return standard_fill_nan(df, strategy)
    if strategy == "special":
        return special_fill_nan(df)
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'mode', 'mean' or 'special'"
    )

In [None]:
original_df = pd.read_csv("../data/C1/train.csv")
# Binary target: 1 where Personality is 'Extrovert', 0 for any other value.
original_df['label'] = original_df['Personality'].eq('Extrovert').astype('int64')
original_df.head()

In [None]:
# Print column dtypes and non-null counts to see which columns have missing values.
original_df.info()

In [None]:
def cut_dataframe(data: pd.DataFrame, drop_columns: list[str]) -> pd.DataFrame:
    """Return a new frame equal to ``data`` without the columns in ``drop_columns``."""
    trimmed = data.drop(labels=drop_columns, axis="columns")
    return trimmed

def split_data(data: pd.DataFrame, test_size: float = 0.2, seed: int = 42):
    """Split ``data`` into train and test features/targets.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the feature columns and a ``label`` target column.
    test_size : float
        Fraction of rows assigned to the test split.
    seed : int
        Random state for the shuffle. It used to be read from a global
        ``seed`` that the notebook never defines, so every call raised
        NameError.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    x = data.drop(columns=['label'])
    y = data['label']

    return train_test_split(x, y, test_size=test_size, random_state=seed)

def test_random_forest(model: RandomForestClassifier, x_test, y_test):
    """Return the accuracy of ``model``'s predictions for ``x_test`` against ``y_test``."""
    return accuracy_score(y_test, model.predict(x_test))

def measure_method(
    data: pd.DataFrame,
    strategy: str,
    n_estimators: int = 100,
    smote: bool = False,
    n_splits: int = 5,
    random_state: int = 42,
):
    """Cross-validate a random forest on ``data`` imputed with ``strategy``.

    Parameters
    ----------
    data : pd.DataFrame
        Features plus a binary ``label`` column (0/1).
    strategy : str
        Imputation strategy forwarded to ``preprocess``.
    n_estimators : int
        Number of trees in the forest.
    smote : bool
        If True, SMOTE oversampling runs as the first pipeline step.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed shared by the fold shuffle, SMOTE and the forest. With a fixed
        seed the scores and the confusion matrix come from the same folds and
        are reproducible across runs.

    Returns
    -------
    tuple
        ``(scores, mean_confusion_matrix)``: the ``cross_validate`` result
        dict (with ``test_f1`` and ``test_accuracy``) and the 2x2 confusion
        matrix averaged over the folds.
    """
    processed_data = preprocess(data, strategy)
    x = processed_data.drop(columns=['label'])
    y = processed_data['label']

    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    if smote:
        # The imblearn Pipeline resamples only while fitting, so held-out
        # folds are scored on untouched data.
        model = Pipeline([
            ('smote', SMOTE(random_state=random_state)),
            ('rf', forest)
        ])
    else:
        model = forest

    # An integer random_state makes every kf.split() call yield the same folds.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_validate(model, x, y, cv=kf, scoring=['f1', 'accuracy'], return_train_score=False)

    conf_matrices = np.zeros((2, 2))

    # Refit per fold to accumulate the confusion matrices.
    for train_index, test_index in kf.split(x, y):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # When smote is enabled the pipeline already oversamples inside fit();
        # no separate SMOTE call is needed here.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # labels=[0, 1] pins the 2x2 shape even if a class is absent in a fold.
        conf_matrices += confusion_matrix(y_test, y_pred, labels=[0, 1])

    return scores, conf_matrices / n_splits

## Dataset com Drained_after_socializing

In [None]:
# Keep only the modelling columns: the row identifier and the text target are dropped.
df = cut_dataframe(original_df, drop_columns=['id', 'Personality'])
df.head()

### Mode Fill

In [None]:
# Cross-validated evaluation of mode imputation on the full feature set.
scores, cm = measure_method(df.copy(), strategy='mode')
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

### Mean Fill

In [None]:
# Cross-validated evaluation of mean imputation on the full feature set.
scores, cm = measure_method(df.copy(), strategy='mean')
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

### Special Fill

In [None]:
# Cross-validated evaluation of the "special" imputation on the full feature set.
scores, cm = measure_method(df.copy(), strategy='special')
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

## Dataset sem Drained_after_socializing

In [None]:
# Variant of df without the Drained_after_socializing feature.
df_cut = cut_dataframe(df, drop_columns=['Drained_after_socializing'])

In [None]:
# Preview the reduced frame to confirm the column was removed.
df_cut.head()

### Mode Fill

In [None]:
# Evaluate mode imputation on df_cut (no Drained_after_socializing).
# The cell used to pass df, which silently repeated the full-feature experiment.
scores, cm = measure_method(df_cut.copy(), strategy='mode')
print('accuracy:', np.mean(scores['test_accuracy']))
print('f1:', np.mean(scores['test_f1']))

disp = ConfusionMatrixDisplay(confusion_matrix=cm.astype(int), display_labels=["Introvert", "Extrovert"])
disp.plot()
plt.show()

### Mean Fill

In [None]:
# Evaluate mean imputation on df_cut (no Drained_after_socializing).
# The cell used to pass df, which silently repeated the full-feature experiment.
scores, cm = measure_method(df_cut.copy(), strategy='mean')
print('accuracy:', np.mean(scores['test_accuracy']))
print('f1:', np.mean(scores['test_f1']))

disp = ConfusionMatrixDisplay(confusion_matrix=cm.astype(int), display_labels=["Introvert", "Extrovert"])
disp.plot()
plt.show()

### Special Fill

In [None]:
# Evaluate the "special" imputation on df_cut (no Drained_after_socializing).
# The cell used to pass df, which silently repeated the full-feature experiment.
scores, cm = measure_method(df_cut.copy(), strategy='special')
print('accuracy:', np.mean(scores['test_accuracy']))
print('f1:', np.mean(scores['test_f1']))

disp = ConfusionMatrixDisplay(confusion_matrix=cm.astype(int), display_labels=["Introvert", "Extrovert"])
disp.plot()
plt.show()

## Treino com SMOTE

### Mode Fill

In [None]:
# Cross-validated evaluation of mode imputation with SMOTE oversampling enabled.
scores, cm = measure_method(df.copy(), strategy='mode', smote=True)
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

### Mean Fill

In [None]:
# Cross-validated evaluation of mean imputation with SMOTE oversampling enabled.
scores, cm = measure_method(df.copy(), strategy='mean', smote=True)
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

### Special Fill

In [None]:
# Cross-validated evaluation of the "special" imputation with SMOTE oversampling enabled.
scores, cm = measure_method(df.copy(), strategy='special', smote=True)
print('accuracy:', scores['test_accuracy'].mean())
print('f1:', scores['test_f1'].mean())

# Plot the fold-averaged confusion matrix, truncated to whole counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm.astype(int),
    display_labels=["Introvert", "Extrovert"],
)
disp.plot()
plt.show()

## Treina o modelo e gera o CSV

In [None]:
# Training frame for the final model: identifier and text target removed, label kept.
df_train = cut_dataframe(original_df, drop_columns=['id', 'Personality'])
df_train.head()

In [None]:
# Impute with the column means, then fit the final forest on every training row.
df_train = preprocess(df_train, strategy="mean")
y = df_train['label']
x = df_train.drop(columns='label')
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(x, y)

In [None]:
test_df = pd.read_csv('../data/C1/test.csv')
test_df = cut_dataframe(test_df, ['id'])

columns = [column for column in test_df.columns if (column !='id' and column != 'Personality')]
yes_no_map = {"Yes": 1, "No": 0}

for col in columns:
    # Work on a local view of the training column. Writing the encoded values
    # back into original_df made this cell fail on re-run: mapping an already
    # encoded column yields all-NaN, and round(nan) raises.
    train_col = original_df[col]

    if test_df[col].dropna().isin(["Yes", "No"]).all(): # Converts "Yes" and "No" columns to numerical booleans
        test_df[col] = test_df[col].map(yes_no_map)
        if train_col.dropna().isin(["Yes", "No"]).all():
            train_col = train_col.map(yes_no_map)

    # Fill test gaps with the rounded TRAINING mean, matching the
    # strategy="mean" preprocessing the model was fitted with.
    mean = train_col.mean()
    fill_value = round(mean)

    test_df[col] = test_df[col].fillna(fill_value).round().astype(int)


predictions = model.predict(test_df)

In [None]:
sample_submission = pd.read_csv('../data/C1/sample_submission.csv')

# Decode predictions (1 -> "Extrovert", anything else -> "Introvert") into the
# Personality column of a copy of the sample file.
submission = sample_submission.assign(
    Personality=np.where(predictions == 1, "Extrovert", "Introvert")
)

submission.to_csv("../data/C1/submission.csv", index=False)