In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.colors as pc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [2]:
def special_fill_nan(df:pd.DataFrame):
    columns = [column for column in df.columns if (column !='id' and column != 'Personality')]

    for col in columns:
        if df[col].dropna().isin(["Yes", "No"]).all(): # Converts "Yes" and "No" columns to numerical booleans
            df[col] = df[col].map({"Yes": 1, "No": 0})
            
    
    df_dict = df.to_dict(orient="records")

    for i in tqdm(range(len(df_dict))):
        nan_columns = [k for k, v in df_dict[i].items() if pd.isna(v)]
        base_columns = [k for k, v in df_dict[i].items() if (not pd.isna(v) and k!='id' and k != 'Personality')]

        for nan_column in nan_columns: # For every empty column in a row
            accumulate = 0
            for base_column in base_columns: # Takes the value of every non-empty column, filters by it and takes the mode of the empty column, then takes the mean
                mode = df.query(f"{base_column} == {df_dict[i][base_column]}")[nan_column].mode()[0]

                accumulate += mode

            mean = round(accumulate/len(base_columns))

            df_dict[i][nan_column] = int(mean)

    
    new_df = pd.DataFrame(df_dict)
    new_df[columns] = new_df[columns].astype(int)

    return new_df


def standard_fill_nan(df:pd.DataFrame, strategy="mode"):
    columns = [column for column in df.columns if (column !='id' and column != 'Personality')]

    for col in columns:
        if df[col].dropna().isin(["Yes", "No"]).all(): # Converts "Yes" and "No" columns to numerical booleans
            df[col] = df[col].map({"Yes": 1, "No": 0})

        if strategy == "mode":    
            mode = df[col].mode()[0]
            fill_value = round(mode)
        elif strategy == "mean":
            mean = df[col].mean()
            fill_value = round(mean)

        df[col] = df[col].fillna(fill_value).round().astype(int)

    return df

def preprocess(df:pd.DataFrame, strategy="mode"):
    if strategy == "mean" or strategy=="mode":
        return standard_fill_nan(df, strategy)
    elif strategy == "special":
        return special_fill_nan(df)

In [3]:
original_df = pd.read_csv("data/train.csv")
original_df['label'] = [1 if x == 'Extrovert' else 0 for x in original_df['Personality']]
original_df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality,label
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert,1
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert,1
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert,0
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert,1
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert,1


In [4]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
 9   label                      18524 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 1.4+ MB


In [5]:
def cut_dataframe(data: pd.DataFrame, drop_columns: list[str]) -> pd.DataFrame:
    return data.drop(columns=drop_columns)

def split_data(data: pd.DataFrame, test_size: float = 0.2, seed: int = 42):
    x = data.drop(columns=['label'])
    y = data['label']

    return train_test_split(x, y, test_size=test_size, random_state=seed)

def train_random_forest(x_train, y_train, n_estimators: int = 100, seed: int = 42) -> RandomForestClassifier:
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=seed)
    model.fit(x_train, y_train)

    return model

def test_random_forest(model: RandomForestClassifier, x_test, y_test):
    y_pred = model.predict(x_test)
    return accuracy_score(y_test, y_pred)

def measure_method(data: pd.DataFrame, strategy: str, n_estimators: int = 100, test_size: float = 0.2, seed: int = 42):
    processed_data = preprocess(data, strategy)
    x_train, x_test, y_train, y_test = split_data(processed_data, test_size=test_size, seed=seed)
    model = train_random_forest(x_train, y_train, n_estimators=n_estimators, seed=seed)
    print(test_random_forest(model, x_test, y_test))
    return model

## Dataset com Drained_after_socializing

In [6]:
df = cut_dataframe(original_df, ['id', 'Personality'])
df.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,label
0,0.0,No,6.0,4.0,No,15.0,5.0,1
1,1.0,No,7.0,3.0,No,10.0,8.0,1
2,6.0,Yes,1.0,0.0,,3.0,0.0,0
3,3.0,No,7.0,3.0,No,11.0,5.0,1
4,1.0,No,4.0,4.0,No,13.0,,1


### Mode Fill

In [7]:
model_mode = measure_method(df.copy(), strategy='mode')

0.9635627530364372


### Mean Fill

In [8]:
model_mean = measure_method(df.copy(), strategy='mean')

0.9635627530364372


### Special Fill

In [9]:
model_special = measure_method(df.copy(), strategy='special')

100%|██████████| 18524/18524 [03:15<00:00, 94.91it/s] 


0.9614035087719298


## Dataset sem Drained_after_socializing

In [10]:
df_cut = cut_dataframe(df, ['Drained_after_socializing'])

In [11]:
df_cut.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency,label
0,0.0,No,6.0,4.0,15.0,5.0,1
1,1.0,No,7.0,3.0,10.0,8.0,1
2,6.0,Yes,1.0,0.0,3.0,0.0,0
3,3.0,No,7.0,3.0,11.0,5.0,1
4,1.0,No,4.0,4.0,13.0,,1


### Mode Fill

In [12]:
model_cut_mode = measure_method(df_cut.copy(), strategy='mode')

0.9624831309041836


### Mean Fill

In [13]:
model_cut_mean = measure_method(df_cut.copy(), strategy='mean')

0.962753036437247


### Special Fill

In [14]:
model_cut_special = measure_method(df_cut.copy(), strategy='special')

100%|██████████| 18524/18524 [02:17<00:00, 134.51it/s]


0.9614035087719298
