In [1]:
#Classificadores
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Separação de treino e teste 
from sklearn.model_selection import train_test_split

#Métricas
from sklearn.metrics import accuracy_score, classification_report

#Leitura básica de dados
import pandas as pd

#Validação cruzada
from sklearn.model_selection import cross_val_score

#OneHotEncoding
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the dataset and separate the target column from the predictors.
data = pd.read_csv('fullyTreated2.csv')
y = data['depression']  # column the models try to predict
X = data.drop(columns='depression')  # all remaining columns are used as features

# Hold out 20% of the rows for testing (80% train); the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Map each profession label to a numeric sector code (0-4).
#
# Every key appears exactly once. The previous literal listed 'Entrepreneur',
# 'Consultant' and 'Chef' under both 2 and 4; in a dict literal the last
# occurrence wins, so those three always resolved to 4. That effective value
# is kept here, so the resulting dictionary is unchanged.
# NOTE(review): confirm that sector 4 (and not 2) is the intended group for
# 'Entrepreneur', 'Consultant' and 'Chef'.
sector_mapping = {
    'Unemployed': 0,

    'Teacher': 1, 'HR Manager': 1, 'Pharmacist': 1, 'Judge': 1,
    'Plumber': 1, 'Electrician': 1, 'Doctor': 1, 'Lawyer': 1,

    'Business Analyst': 2, 'Marketing Manager': 2, 'Financial Analyst': 2,
    # Misspelled key kept as-is; presumably it matches a raw label in the data - verify.
    'Finanancial Analyst': 2,
    'Sales Executive': 2, 'Customer Support': 2, 'Travel Consultant': 2,
    'Manager': 2,

    'Software Engineer': 3, 'Data Scientist': 3, 'Mechanical Engineer': 3,
    'Civil Engineer': 3, 'UX/UI Designer': 3, 'Graphic Designer': 3,
    'Architect': 3, 'Digital Marketer': 3, 'Pilot': 3, 'Researcher': 3,

    'Content Writer': 4, 'Chemist': 4, 'Educational Consultant': 4,
    'Investment Banker': 4, 'Research Analyst': 4, 'Entrepreneur': 4,
    'Consultant': 4, 'Chef': 4,
}

# Replace each 'profession' label with its sector code. Labels that are not
# keys of sector_mapping come out of .map() as NaN and are placed in an extra
# category, -1.
#
# assign() returns new frames instead of writing into the objects produced by
# train_test_split, and it avoids `frame[col].fillna(..., inplace=True)`:
# that chained in-place call is deprecated by pandas and does not update the
# parent frame when Copy-on-Write is enabled, which would leave NaNs behind.
X_train = X_train.assign(profession=X_train['profession'].map(sector_mapping).fillna(-1))
X_test = X_test.assign(profession=X_test['profession'].map(sector_mapping).fillna(-1))

# Any column still holding strings (object dtype) after the profession mapping
# has to be converted to numbers before modelling.
categorical_columns = X_train.select_dtypes(include=['object']).columns

# One-hot encoder producing a dense array; the first category of each column is dropped.
encoder = OneHotEncoder(drop='first', sparse_output=False)

if len(categorical_columns) > 0:
    # Learn the categories from the training split only, then reuse them for the test split.
    train_matrix = encoder.fit_transform(X_train[categorical_columns])
    test_matrix = encoder.transform(X_test[categorical_columns])
    encoded_names = encoder.get_feature_names_out(categorical_columns)

    # Wrap the encoded arrays in frames aligned with the original row indices.
    X_train_encoded = pd.DataFrame(train_matrix, columns=encoded_names, index=X_train.index)
    X_test_encoded = pd.DataFrame(test_matrix, columns=encoded_names, index=X_test.index)

    # Swap the original categorical columns for their encoded counterparts.
    X_train = pd.concat([X_train.drop(columns=categorical_columns), X_train_encoded], axis=1)
    X_test = pd.concat([X_test.drop(columns=categorical_columns), X_test_encoded], axis=1)




In [3]:
# Model 1: Random Forest with trees limited to depth 7.
# random_state fixes the forest's internal randomness so the fitted model and
# the metrics computed from it are the same on every run (same seed as the
# train/test split).
rf_model = RandomForestClassifier(max_depth=7, random_state=42)
rf_model.fit(X_train, y_train)

In [4]:
# Model 2: Gradient Boosting with trees limited to depth 3.
# random_state fixes the estimator's internal randomness so the fitted model
# and the metrics computed from it are the same on every run (same seed as the
# train/test split).
gb_model = GradientBoostingClassifier(max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

In [5]:
# Evaluate the models: predict the held-out test labels with each fitted classifier.
rf_predictions, gb_predictions = [clf.predict(X_test) for clf in (rf_model, gb_model)]

In [6]:
# Print accuracy and the per-class precision/recall/F1 report for each model.
# The second heading starts with a newline to leave a blank line between the two reports.
for heading, predictions in (
    ("Random Forest Model", rf_predictions),
    ("\nGradient Boosting Model", gb_predictions),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

Random Forest Model
Accuracy: 0.93359375
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       429
           1       0.90      0.66      0.76        83

    accuracy                           0.93       512
   macro avg       0.92      0.82      0.86       512
weighted avg       0.93      0.93      0.93       512


Gradient Boosting Model
Accuracy: 0.962890625
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       429
           1       0.92      0.84      0.88        83

    accuracy                           0.96       512
   macro avg       0.95      0.91      0.93       512
weighted avg       0.96      0.96      0.96       512



In [7]:
# Prepare the complete feature set for cross-validation using the same
# profession mapping and one-hot encoding applied to the train/test split.
#
# X is rebuilt from `data` rather than edited in place, so this cell gives the
# same result every time it is executed: mapping an already-mapped
# 'profession' column would turn every value into NaN and then into -1.
# assign() also avoids the chained `fillna(..., inplace=True)` call, which
# pandas deprecates and which does not update the frame under Copy-on-Write.
X = data.drop('depression', axis=1)
X = X.assign(profession=X['profession'].map(sector_mapping).fillna(-1))

categorical_columns = X.select_dtypes(include=['object']).columns

if len(categorical_columns) > 0:
    # Reuse the encoder that was fitted on the training split.
    X_encoded = pd.DataFrame(
        encoder.transform(X[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=X.index,
    )
    X = pd.concat([X.drop(categorical_columns, axis=1), X_encoded], axis=1)

# 5-fold cross-validated F1 score for Random Forest
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1')
print(f"Random Forest Cross-Validated F1 Score: {rf_cv_scores.mean()}")

# 5-fold cross-validated F1 score for Gradient Boosting
gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='f1')
print(f"Gradient Boosting Cross-Validated F1 Score: {gb_cv_scores.mean()}")

Random Forest Cross-Validated F1 Score: 0.7722783581740522
Gradient Boosting Cross-Validated F1 Score: 0.8841820459028253
