In [1]:
# Importar las bibliotecas necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import joblib

In [2]:
# Read data
# X: features, y: labels
features = pd.read_csv("../JeuDeDonnees/alt_acsincome_ca_features_85(1).csv")
labels = pd.read_csv("../JeuDeDonnees/alt_acsincome_ca_labels_85.csv")  

In [None]:
# Print content
print("DATABASE 1")
print(features.head())
print(features.columns)

# Delete missing values - NULL, If 'MAR' or 'COW' contain NULL values 
features['AGEP'].hist(bins=30)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Nb de personnes')
plt.show()
# Limpiar los datos eliminando filas con valores nulos en 'MAR' y 'COW'
features_df = features.dropna(subset=['MAR', 'COW'])

# Create relation between 'MAR' and 'COW' 
features_relation = features_df.groupby(['MAR', 'COW']).size().reset_index(name='QUANTITÉ')
print(features_relation)

# Create Graphic
plt.figure(figsize=(10, 6))
sns.barplot(data=features_relation, x='MAR', y='QUANTITÉ', hue='COW', palette='viridis')
plt.title('Relation entre MAR (État Civil) et COW (Occupation)')
plt.xlabel('MAR (État Civil)')
plt.ylabel('QUANTITÉ PERSONNES')
plt.legend(title='COW (Occupation)')
plt.show()

print("")
print("DATABASE 2")
# print(labels.head())  # Show first lines from the dataset
print(labels.columns)   # Show all the columns on the dataset

In [4]:
print("\n-----------------------------")
print("    SHUFFLE AND DIVIDED DATA   ")
print("-----------------------------\n")
# Shuffle and Divide data
X_train, X_test, y_train, y_test = train_test_split(
    features,           # X_train - X_test Características (features)
    labels.values.ravel(),             # y_train - y_test  (labels)
    test_size=0.2,      # Size test set (20%)
    random_state=42,    # Para reproducibilidad
    shuffle=False       # Mix data TRUE
)

print("\nSize TRAINING set:", X_train.shape)
print("TRAINING set:")
print(X_train) # Print Results

print("\nSize TEST set:", X_test.shape, "\n")
print("TEST set:")
print(X_test) # Print Results

# To standaliser the datas
my_scaler = StandardScaler()
X_train_Standed = my_scaler.fit_transform(X_train.select_dtypes(include=['float64','int64']))
X_test_Standed = my_scaler.fit_transform(X_test.select_dtypes(include=['float64','int64']))
joblib.dump (my_scaler, 'my_scaler.joblib')

# from sklearn.utils import shuffle
datos_shuffled = shuffle(features)


-----------------------------
    SHUFFLE AND DIVIDED DATA   
-----------------------------


Size TRAINING set: (133052, 10)
TRAINING set:
        AGEP  COW  SCHL  MAR    OCCP   POBP  RELP  WKHP  SEX  RAC1P
0       41.0  4.0  24.0  1.0  2555.0    6.0   1.0  60.0  2.0    1.0
1       77.0  7.0  22.0  1.0  4920.0   39.0   0.0  35.0  1.0    1.0
2       38.0  1.0  18.0  1.0   440.0    6.0   1.0  50.0  1.0    1.0
3       30.0  1.0  22.0  5.0  1555.0    6.0   2.0  80.0  1.0    6.0
4       36.0  1.0  16.0  1.0  4030.0  314.0   1.0  70.0  2.0    1.0
...      ...  ...   ...  ...     ...    ...   ...   ...  ...    ...
133047  47.0  1.0  18.0  1.0  5240.0   47.0   0.0  40.0  2.0    2.0
133048  23.0  1.0  18.0  5.0  4720.0    6.0  13.0  30.0  2.0    1.0
133049  18.0  1.0  16.0  5.0  4220.0    6.0   2.0  10.0  1.0    8.0
133050  41.0  1.0  21.0  1.0  4850.0   29.0   1.0  30.0  1.0    1.0
133051  51.0  1.0  19.0  1.0  1010.0    6.0  15.0  40.0  1.0    1.0

[133052 rows x 10 columns]

Size TEST set:

In [None]:
#SVM
# Validation Croisée
svm_model = SVC(kernel='linear')
scores = cross_val_score(svm_model,X_train,y_train,cv=5).mean()

# Accuracy & Classification report & Confusion matrix
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_svm)
classification_rep = classification_report(y_test, y_pred_svm)
conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Grid Search for best parameters
param_grid = {'C': [0.1, 1, 10, 20], 'kernel': ['rbf', 'poly']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

print("best_params = ", best_params)

In [None]:
# Models to be tested with params by default
models = {
    "RandomForest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVM": SVC()
}

# Validación cruzada y métricas iniciales
for name, model in models.items():
    print(f"Training {name}...")
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"Accuracy average (Cross Validation): {np.mean(scores):.4f}")
    
    # Entrenar y evaluar en el conjunto de prueba
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"Precisión en el conjunto de prueba: {accuracy_score(y_test, y_pred):.4f}")
    print("Reporte de clasificación:")
    print(classification_report(y_test, y_pred))
    print("Matriz de confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 40)

Training RandomForest...


In [None]:


#Forest 

rf_model = RandomForestClassifier()
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5).mean()
print("validation croisée average score = ", cv_scores_rf)


rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)


param_grid_rf = {'n_estimators': [50, 100, 120],'criterion':['gini','entropy','log_loss'],'max_depth': [None, 10, 20],'min_samples_split':[0.1, 1.0, 2]}
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)
best_params_rf = grid_search_rf.best_params_


