<a href="https://colab.research.google.com/github/RayNCode/code_collab/blob/main/XGB_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

In [None]:
# Base URL of the folder that holds the project's CSV files; file names are appended to it.
data_url = "https://raw.githubusercontent.com/RayNCode/code_collab/main/project-3-files"

In [None]:
def _read_table(name):
    """Read ``<data_url>/<name>.csv`` into a new DataFrame.

    ``pd.read_csv`` already returns a fresh frame, so no extra ``.copy()`` is
    needed to protect the source data.
    """
    return pd.read_csv(f"{data_url}/{name}.csv")


# Training tables.
learn_dataset = _read_table("learn_dataset")
learn_dataset_Emp_contract = _read_table("learn_dataset_Emp_contract")
learn_dataset_sport = _read_table("learn_dataset_sport")
learn_dataset_job = _read_table("learn_dataset_job")

# Lookup tables joined onto both the training and the test data.
code_work_description_map = _read_table("code_work_description_map")
city_adm = _read_table("city_adm")
code_Club = _read_table("code_Club")
departments = _read_table("departments")

# Test tables (same layout as the training ones).
test_dataset_job = _read_table("test_dataset_job")
test_dataset = _read_table("test_dataset")
test_dataset_Emp_contract = _read_table("test_dataset_Emp_contract")
test_dataset_sport = _read_table("test_dataset_sport")

In [None]:
def _text_or_missing(series):
    """Return *series* cast to text while keeping missing entries missing."""
    return series.astype(str).where(series.notna())


def preprocess_data_base(learn_dataset_job, work_desc, learn_dataset, dept_code, emp_contract, learn_dataset_sport, code_club, departments):
    """Merge the individual, job, contract, sport and lookup tables into one frame.

    Args:
        learn_dataset_job: Job table with ``Id`` and ``work_description``.
        work_desc: Work-description lookup with columns ``N3``, ``N2``, ``N1``.
        learn_dataset: Main table, keyed by ``Id``, with ``insee_code`` and
            ``ACTIVITY_TYPE``.
        dept_code: Lookup joined on ``insee_code`` (must provide ``dep``).
        emp_contract: Table joined on ``Id``.
        learn_dataset_sport: Table joined on ``Id`` (must provide ``Club``).
        code_club: Club lookup joined on ``Club`` == ``Code`` (must provide
            ``Categorie``).
        departments: Lookup joined on ``dep`` (must provide ``REG``).

    Returns:
        The merged DataFrame. Object-typed columns have missing values replaced
        by the text ``"None"``; ``EMOLUMENT`` and ``Working_hours`` are set to
        0.0 for every row whose ``ACTIVITY_TYPE`` is not ``"TYPE1|1"``.
    """
    # Attach the work-description hierarchy to each job row.
    merged_df = pd.merge(learn_dataset_job, work_desc, left_on='work_description', right_on='N3', how='left')

    # Back-fill the coarser codes from the finer ones *before* the final cast to
    # text: once NaN has become the string "nan", fillna() has nothing to fill.
    n3 = _text_or_missing(merged_df['N3'])
    n2 = _text_or_missing(merged_df['N2']).fillna(n3.str[:-2])
    n1 = _text_or_missing(merged_df['N1']).fillna(n2.str[:-1])

    # Codes that are still unresolved become the text "nan", as before.
    merged_df['N3'] = n3.astype(str)
    merged_df['N2'] = n2.astype(str)
    merged_df['N1'] = n1.astype(str)

    # Join everything onto the main table. The insee_code join is an inner
    # join, so rows whose insee_code is absent from dept_code are dropped.
    data_2 = pd.merge(learn_dataset, merged_df, on="Id", how="left")
    df_1 = data_2.merge(dept_code, on='insee_code')
    df_2 = df_1.merge(emp_contract, on='Id', how='left')
    df_3 = df_2.merge(learn_dataset_sport, on='Id', how='left')
    # Use the code_club argument (not a notebook-level global) as the lookup.
    df_4 = df_3.merge(code_club, left_on='Club', right_on='Code', how='left')
    final_df = df_4.merge(departments, on='dep', how='left')

    # Treat these codes as categorical text rather than numbers.
    for column in ('Categorie', 'REG'):
        final_df[column] = final_df[column].astype(str).astype('object')

    # Rows outside activity type "TYPE1|1" get zero pay and zero working hours.
    condition = (final_df['ACTIVITY_TYPE'] != "TYPE1|1")
    final_df.loc[condition, ['EMOLUMENT', 'Working_hours']] = 0.0

    # Give missing categorical values an explicit "None" level.
    categorical_columns = final_df.select_dtypes(include=['object']).columns
    final_df[categorical_columns] = final_df[categorical_columns].fillna("None")

    return final_df


In [None]:
def preprocess_learn_test(final_df):  # TODO: carry the Id along somewhere
    """Split a merged frame into model features and a companion vector.

    When ``final_df`` has a ``target`` column the companion is a NumPy array of
    0/1 labels (1 where ``target == 'B'``); otherwise it is a copy of the
    ``Id`` column. In both cases the identifier and label-name columns are
    removed from the returned features.

    Returns:
        ``(X, y)`` for labelled data, ``(X, ids)`` for unlabelled data.
    """
    unused_columns = ["Id", 'Nom de la commune', 'Nom fédération', 'Nom catégorie', 'Nom du département', 'Code']
    has_target = 'target' in final_df.columns

    if has_target:
        # Encode the label as 1 for 'B' and 0 for anything else.
        companion = np.where(final_df['target'].copy() == 'B', 1, 0)
        features = final_df.drop(['target'], axis='columns')
    else:
        features = final_df.copy()
        companion = features['Id'].copy()

    features = features.drop(unused_columns, axis="columns")

    # NOTE: missing 'Working_hours' / 'EMOLUMENT' values are not imputed in
    # this function; an IterativeImputer-based attempt (RandomForest or XGB
    # estimator) used to sit here as commented-out code.
    return features, companion



In [None]:
# Build the merged training and test tables; both share the same lookup tables.
learn_dataset_base = preprocess_data_base(
    learn_dataset_job=learn_dataset_job,
    work_desc=code_work_description_map,
    learn_dataset=learn_dataset,
    dept_code=city_adm,
    emp_contract=learn_dataset_Emp_contract,
    learn_dataset_sport=learn_dataset_sport,
    code_club=code_Club,
    departments=departments,
)
test_dataset_base = preprocess_data_base(
    learn_dataset_job=test_dataset_job,
    work_desc=code_work_description_map,
    learn_dataset=test_dataset,
    dept_code=city_adm,
    emp_contract=test_dataset_Emp_contract,
    learn_dataset_sport=test_dataset_sport,
    code_club=code_Club,
    departments=departments,
)

In [None]:
# Split each merged table into model inputs plus labels (train) or ids (test).
X_train, y_train = preprocess_learn_test(learn_dataset_base)
X_test, X_test_id = preprocess_learn_test(test_dataset_base)
print("Shape de X_train:", X_train.shape)
# The value printed here is the training label vector, so label it as such.
print("Shape de y_train:", y_train.shape)
print("Shape de X_test:", X_test.shape)

In [None]:
# Shuffle X_train and y_train together, with a fixed random_state so the order is reproducible.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [None]:
# Keyword arguments passed to XGBClassifier. The float values are kept exactly
# as they were recorded.
xgb_params = {
    # Boosting schedule
    'n_estimators': 151,
    'learning_rate': 0.76,
    'max_delta_step': 1.54,
    # Tree shape and split constraints
    'max_depth': 5,
    'min_child_weight': 3.279999999999994,
    'gamma': 0.9980000000000002,
    # Row / column sampling
    'subsample': 0.9000000000000001,
    'colsample_bytree': 0.7100000000000001,
    'colsample_bylevel': 0.9400000000000002,
    # Regularisation and class weighting
    'reg_alpha': 0.8,
    'reg_lambda': 0.08,
    'scale_pos_weight': 0.8300000000000001,
}

In [None]:
def process_data(X_train, X_test, y=None, params=None):
    """Fit the preprocessing and an XGBoost classifier, and transform both feature sets.

    Args:
        X_train: Training features; must contain every column named below.
        X_test: Features to transform with the preprocessing fitted on
            ``X_train``.
        y: Training labels aligned with ``X_train``. Defaults to the
            notebook-level ``y_train`` so existing two-argument calls keep
            working.
        params: Keyword arguments for ``XGBClassifier``. Defaults to the
            notebook-level ``xgb_params``.

    Returns:
        ``(classifier, preprocessing, X_train_preprocessed, X_test_preprocessed)``
        where the classifier has been fitted on the transformed training data.
    """
    labels = y_train if y is None else y
    model_params = xgb_params if params is None else params

    numeric_to_impute = ['Working_hours', 'EMOLUMENT']
    one_hot_columns = ['insee_code', "is_student", "OCCUPATION_42", "ACTIVITY_TYPE", "household", "sex", "employer_category", "job_category", "Terms_of_emp", "Eco_sect", "Job_dep", "WORK_CONDITION", "work_description", "N3", "N2", "N1", "town_type", "dep", "Emp_contract", "Club", "Categorie", 'REG']
    ordinal_columns = ["Highest_degree", "EMPLOYEE_COUNT"]

    # Column-wise preprocessing; columns not listed above are passed through.
    preprocessing = ColumnTransformer(
        transformers=[
            ('imputer', IterativeImputer(estimator=RandomForestRegressor(), max_iter=100, random_state=0), numeric_to_impute),
            # Categories unseen during fit are encoded as all zeros.
            ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_columns),
            # NOTE(review): OrdinalEncoder is left at its defaults, which
            # presumably reject categories unseen during fit - confirm this is
            # acceptable for the test data.
            ('ordinal', OrdinalEncoder(), ordinal_columns),
        ],
        remainder='passthrough'
    )

    # Fit on the training data only, then reuse the fitted transform for the test data.
    X_train_preprocessed = preprocessing.fit_transform(X_train)
    X_test_preprocessed = preprocessing.transform(X_test)

    xgb_classifier = XGBClassifier(random_state=42, **model_params)
    xgb_classifier.fit(X_train_preprocessed, labels)

    return xgb_classifier, preprocessing, X_train_preprocessed, X_test_preprocessed

# Run the function: fit the preprocessing and the classifier, and keep the transformed matrices.
xgb_classifier, preprocessing, X_train_preprocessed, X_test_preprocessed = process_data(X_train, X_test)




In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict

# Predictions for the unlabelled test set (kept under the original name).
y_pred = xgb_classifier.predict(X_test_preprocessed)

# The test set has no labels, so there is no `y_test` to score against.
# Estimate performance from out-of-fold predictions on the training data
# instead: every training row is predicted by a model that did not see it.
# NOTE: the preprocessing was fitted on the full training set, so these
# figures may be slightly optimistic.
y_train_oof_pred = cross_val_predict(
    XGBClassifier(random_state=42, **xgb_params),
    X_train_preprocessed,
    y_train,
    cv=5,
)

# Accuracy
accuracy = accuracy_score(y_train, y_train_oof_pred)
print(f"Exactitude (Accuracy): {accuracy * 100:.2f}%")

# Precision
precision = precision_score(y_train, y_train_oof_pred)
print(f"Précision (Precision): {precision * 100:.2f}%")

# Recall
recall = recall_score(y_train, y_train_oof_pred)
print(f"Rappel (Recall): {recall * 100:.2f}%")

# F1 score
f1 = f1_score(y_train, y_train_oof_pred)
print(f"F1-score: {f1:.2f}")

# Confusion matrix
confusion = confusion_matrix(y_train, y_train_oof_pred)
print("Matrice de confusion:\n", confusion)

In [None]:
# This reports the transformed *training* matrix, so label it accordingly.
print("Shape de X_train:", X_train_preprocessed.shape)

Shape de X_test: (49993, 14740)


In [None]:
# Report the dimensions of the transformed test matrix.
print(f"Shape de X_test: {X_test_preprocessed.shape}")

Shape de X_test: (49992, 14740)


In [None]:
# At this point X_train_preprocessed is ready for training and X_test_preprocessed is ready for prediction.
predictions = xgb_classifier.predict(X_test_preprocessed)

In [None]:
from pathlib import Path

# Map the numeric predictions back to the original labels (1 -> 'B', otherwise 'G').
predictions_labels = np.where(predictions == 1, 'B', 'G')

# One row per test individual: its Id and the predicted label.
results_df = pd.DataFrame({
    'Id': X_test_id,
    'target': predictions_labels
})

# Write to the Kaggle working directory when it exists; otherwise (e.g. on
# Colab or a local machine) fall back to the current directory instead of
# failing on a missing folder.
output_dir = Path("/kaggle/working")
if not output_dir.is_dir():
    output_dir = Path.cwd()
results_df.to_csv(output_dir / "predictions.csv", index=False)

# Show how many rows were assigned to each predicted label.
value_counts = results_df['target'].value_counts()
print(value_counts)

