In [368]:
# Bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Dividir em dados de treino e teste
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Escalonamento
from sklearn.preprocessing import StandardScaler

# Modelos de classificação
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [369]:
# Importando o Dataset

base = pd.read_csv('personality_datasert.csv')

In [370]:
# Obtendo informações do dataset

print(f'Total de linhas: {base.shape[0]}')
print(f'Total de colunas: {base.shape[1]}')
print()
print(f'Total de dados nulos: \n{base.isnull().sum()}')
print('-'*50)
base.info()

Total de linhas: 2900
Total de colunas: 8

Total de dados nulos: 
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2900 non-null   float64
 1   Stage_fear                 2900 non-null   object 
 2   Social_event_attendance    2900 non-null   float64
 3   Going_outside              2900 non-null   float64
 4   Drained_after_socializing  2900 non-null   object 
 5   Friends_circle_size        2900 non-null   float64
 6   Post_frequency             2900 non-null   float64
 7   Persona

In [371]:
# Pegando as Colunas numericas e categoricas
colunas_numericas = base.select_dtypes(include=['int64', 'float64']).columns
colunas_categoricas = base.select_dtypes(include=['object']).columns
colunas_categoricas = colunas_categoricas.drop('Personality')

In [372]:
# Tratando as colunas categoricas e transformando para valores binários
for col in colunas_categoricas:
    base[col] = base[col].map({'Yes': 1, 'No': 0})


# Extrovertido: 1
# Introvertido: 0
base['Personality'] = base['Personality'].map({'Extrovert': 1, 'Introvert': 0})

In [373]:
base.head()

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,1
1,9.0,1,0.0,0.0,1,0.0,3.0,0
2,9.0,1,1.0,2.0,1,5.0,2.0,0
3,0.0,0,6.0,7.0,0,14.0,8.0,1
4,3.0,0,9.0,4.0,0,8.0,5.0,1


In [374]:
# Definindo X e y
X = base.drop('Personality', axis=1)
y = base['Personality']

In [375]:
# Modelos
modelos = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC()
}

In [376]:
# Separando o dataset entre treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Padronizando os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

resultados = {}

for nome, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    resultados[nome] = modelo.score(X_test, y_test)

resultados = pd.Series(resultados)
resultados.sort_values(ascending=False)

Unnamed: 0,0
SVC,0.926437
KNeighborsClassifier,0.922989
LogisticRegression,0.91954
RandomForestClassifier,0.909195
DecisionTreeClassifier,0.856322


In [377]:
# Usando o cross Validation
resultados_cv = {}

for nome, modelo in modelos.items():
    scores = cross_val_score(modelo, X, y, cv=5)
    resultados_cv[nome] = scores.mean()

resultados_cv = pd.Series(resultados_cv)
resultados_cv.sort_values(ascending=False)

Unnamed: 0,0
SVC,0.934483
LogisticRegression,0.928621
KNeighborsClassifier,0.927241
RandomForestClassifier,0.917931
DecisionTreeClassifier,0.878276


In [435]:
# Fazer predição de um novo dado Utilizando o SVC

nova_pessoa = {
    'Time_spent_Alone': 3,
    'Stage_fear': 1,
    'Social_event_attendance': 4,
    'Going_outside': 5,
    'Drained_after_socializing': 0,
    'Friends_circle_size': 8,
    'Post_frequency': 6
}


In [436]:
# Converter novo registro para DataFrame
nova_pessoa_df = pd.DataFrame([nova_pessoa])

# Padronizando os dados do novo registro
nova_pessoa_scaled = scaler.transform(nova_pessoa_df)

# Pegando o modelo SVC (Teve o melhor desempenho)
svc_model = modelos['SVC']

# Fazer o predict da personalidade
predicao = svc_model.predict(nova_pessoa_scaled)

# Output the classification result
if predicao[0] == 1:
    print("Personalidade: Extrovertido")
else:
    print("Personalidade: Introvertido")

Personalidade: Extrovertido
