<a href="https://colab.research.google.com/github/RayNCode/code_collab/blob/main/FinalNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
def _to_str_keep_missing(series):
    """Cast the non-missing entries of *series* to ``str``; missing entries stay NaN."""
    return series.astype(str).where(series.notna())


def preprocess_data_base(learn_dataset_job_path, work_desc_path, learn_dataset_path, dept_code_path, emp_contract_path, learn_dataset_sport_path, code_club_path, departments_path):
    """Load the eight CSV files and merge them into a single base table.

    Parameters
    ----------
    learn_dataset_job_path : path of the job table (joined on 'Id').
    work_desc_path : path of the work-description map (columns 'N3', 'N2', 'N1').
    learn_dataset_path : path of the main table (one row per 'Id').
    dept_code_path : path of the table keyed by 'insee_code'.
    emp_contract_path : path of the employment-contract table (joined on 'Id').
    learn_dataset_sport_path : path of the sport table (joined on 'Id').
    code_club_path : path of the club table (its 'Code' is matched to 'Club').
    departments_path : path of the departments table (joined on 'dep').

    Returns
    -------
    pandas.DataFrame
        The merged table, with 'Categorie' and 'REG' cast to strings,
        'EMOLUMENT' / 'Working_hours' forced to 0.0 for rows whose
        'ACTIVITY_TYPE' is not "TYPE1|1", and remaining missing values of
        object columns replaced by the string "None".
    """
    # Job table enriched with the work-description codes.
    learn_dataset_job = pd.read_csv(learn_dataset_job_path)
    work_desc = pd.read_csv(work_desc_path)
    merged_df = pd.merge(learn_dataset_job, work_desc, left_on='work_description', right_on='N3', how='left')

    # Fill a missing N2 with N3 minus its last two characters, and a missing
    # N1 with N2 minus its last character. This must happen BEFORE the final
    # string cast: casting first turns NaN into the literal 'nan', after which
    # fillna has nothing left to fill.
    n3 = _to_str_keep_missing(merged_df['N3'])
    n2 = _to_str_keep_missing(merged_df['N2']).fillna(n3.str[:-2])
    n1 = _to_str_keep_missing(merged_df['N1']).fillna(n2.str[:-1])

    # Final cast to strings; codes that could not be derived keep the same
    # string representation the plain cast has always produced for them.
    merged_df['N3'] = n3.astype(str)
    merged_df['N2'] = n2.astype(str)
    merged_df['N1'] = n1.astype(str)

    # Attach the remaining tables one after the other.
    learn_dataset = pd.read_csv(learn_dataset_path)
    data_2 = pd.merge(learn_dataset, merged_df, on="Id", how="left")
    dept_code = pd.read_csv(dept_code_path)
    # NOTE(review): this is an inner join (pandas default), so rows whose
    # 'insee_code' is absent from dept_code are dropped - confirm this is
    # intended, in particular for the test set.
    df_1 = data_2.merge(dept_code, on='insee_code')
    emp_contract = pd.read_csv(emp_contract_path)
    df_2 = df_1.merge(emp_contract, on='Id', how='left')
    learn_dataset_sport = pd.read_csv(learn_dataset_sport_path)
    df_3 = df_2.merge(learn_dataset_sport, on='Id', how='left')
    code_Club = pd.read_csv(code_club_path)
    df_4 = df_3.merge(code_Club, left_on='Club', right_on='Code', how='left')
    dep = pd.read_csv(departments_path)
    final_df = df_4.merge(dep, on='dep', how='left')

    # 'Categorie' and 'REG' are treated as categorical: cast them to strings
    # stored in object columns.
    final_df['Categorie'] = final_df['Categorie'].astype(str)
    final_df['Categorie'] = final_df['Categorie'].astype('object')
    final_df['REG'] = final_df['REG'].astype(str)
    final_df['REG'] = final_df['REG'].astype('object')

    # Rows whose ACTIVITY_TYPE is not "TYPE1|1" get a zero emolument and zero
    # working hours instead of a missing value.
    condition = (final_df['ACTIVITY_TYPE'] != "TYPE1|1")
    final_df.loc[condition, ['EMOLUMENT', 'Working_hours']] = 0.0

    # Remaining missing values of object columns become the category "None".
    categorical_columns = final_df.select_dtypes(include=['object']).columns
    final_df[categorical_columns] = final_df[categorical_columns].fillna("None")

    return final_df


In [3]:
def preprocess_learn_test(final_df):
    """Split a base table into model features and their companion vector.

    When *final_df* has a 'target' column the companion is the encoded label
    array (1 where the target equals 'B', 0 otherwise). Without it, the
    companion is a copy of the 'Id' column so predictions can be matched back
    to their rows.

    Returns
    -------
    tuple
        ``(X, y)`` when 'target' is present, ``(X, ids)`` otherwise. In both
        cases ``X`` has the identifier and descriptive name columns removed.
    """
    # Columns that are dropped from the features in every case.
    unused_columns = ["Id", 'Nom de la commune', 'Nom fédération', 'Nom catégorie', 'Nom du département', 'Code']

    if 'target' in final_df.columns:
        companion = np.where(final_df['target'].copy() == 'B', 1, 0)
        features = final_df.drop(columns=['target'])
    else:
        features = final_df.copy()
        companion = features['Id'].copy()

    # NOTE: missing values of 'Working_hours' / 'EMOLUMENT' are deliberately
    # not imputed here; an IterativeImputer experiment (RandomForestRegressor,
    # with XGBRegressor as a candidate to compare) used to be sketched at this
    # spot and was left disabled.
    features = features.drop(columns=unused_columns)

    return features, companion


In [4]:
# Build the merged base table for the learning set ...
learn_dataset_base = preprocess_data_base(
    learn_dataset_job_path="/content/learn_dataset_job.csv",
    work_desc_path="/content/code_work_description_map.csv",
    learn_dataset_path="/content/learn_dataset.csv",
    dept_code_path="/content/city_adm.csv",
    emp_contract_path="/content/learn_dataset_Emp_contract.csv",
    learn_dataset_sport_path="/content/learn_dataset_sport.csv",
    code_club_path="/content/code_Club.csv",
    departments_path="/content/departments.csv",
)
# ... and for the test set; the four reference tables are the same files.
test_dataset_base = preprocess_data_base(
    learn_dataset_job_path="/content/test_dataset_job.csv",
    work_desc_path="/content/code_work_description_map.csv",
    learn_dataset_path="/content/test_dataset.csv",
    dept_code_path="/content/city_adm.csv",
    emp_contract_path="/content/test_dataset_Emp_contract.csv",
    learn_dataset_sport_path="/content/test_dataset_sport.csv",
    code_club_path="/content/code_Club.csv",
    departments_path="/content/departments.csv",
)

In [None]:
# Split both base tables: preprocess_learn_test returns (features, labels)
# when a 'target' column is present and (features, ids) otherwise.
X_train, y_train = preprocess_learn_test(learn_dataset_base)
X_test, X_test_id = preprocess_learn_test(test_dataset_base)
print("Shape de X_train:", X_train.shape)
# The label previously read "y_test" although the value printed is y_train.
print("Shape de y_train:", y_train.shape)
print("Shape de X_test:", X_test.shape)

# Per-column count of missing values left in each feature table.
print("Valeurs nulles dans X_train:")
print(X_train.isnull().sum())

print("\nValeurs nulles dans X_test:")
print(X_test.isnull().sum())

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# Shuffle X_train and y_train together (same permutation for both), with a
# fixed seed so the row order is reproducible.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [7]:
# Hyper-parameters passed as keyword arguments to XGBClassifier.
xgb_params = dict(
    subsample=0.9000000000000001,
    scale_pos_weight=0.8300000000000001,
    reg_lambda=0.08,
    reg_alpha=0.8,
    n_estimators=151,
    min_child_weight=3.279999999999994,
    max_depth=5,
    max_delta_step=1.54,
    learning_rate=0.76,
    gamma=0.9980000000000002,
    colsample_bytree=0.7100000000000001,
    colsample_bylevel=0.9400000000000002,
)

In [8]:
from sklearn.compose import ColumnTransformer

def process_data(X_train, X_test, y=None, params=None):
    """Fit the preprocessing and an XGBClassifier, and transform both feature sets.

    Parameters
    ----------
    X_train : feature table used to fit the preprocessing and the classifier.
    X_test : feature table transformed with the preprocessing fitted on X_train.
    y : training labels aligned with X_train. When omitted, the notebook-level
        ``y_train`` is used, which keeps existing two-argument calls working.
    params : keyword arguments for XGBClassifier. When omitted, the
        notebook-level ``xgb_params`` is used.

    Returns
    -------
    tuple
        ``(xgb_classifier, preprocessing, X_train_preprocessed, X_test_preprocessed)``:
        the fitted classifier, the fitted ColumnTransformer and the two
        transformed feature matrices.
    """
    # Backward compatibility: fall back to the notebook-level objects the
    # function used to read implicitly.
    if y is None:
        y = y_train
    if params is None:
        params = xgb_params

    # Column-wise preprocessing; columns not listed are passed through as-is.
    preprocessing = ColumnTransformer(
        transformers=[
            ('imputer', IterativeImputer(estimator=RandomForestRegressor(), max_iter=50, random_state=0), ['Working_hours', 'EMOLUMENT']),
            ('onehot', OneHotEncoder(handle_unknown='ignore'), ['insee_code', "is_student", "OCCUPATION_42", "ACTIVITY_TYPE", "household", "sex", "employer_category", "job_category", "Terms_of_emp", "Eco_sect", "Job_dep", "WORK_CONDITION", "work_description", "N3", "N2", "N1", "town_type", "dep", "Emp_contract", "Club", "Categorie", 'REG']),
            # Categories that appear only in X_test are encoded as -1 instead
            # of raising, mirroring handle_unknown='ignore' on the one-hot step.
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ["Highest_degree", "EMPLOYEE_COUNT"])
        ],
        remainder='passthrough'
    )

    # Fit on the training features only, then apply the same transform to the test features.
    X_train_preprocessed = preprocessing.fit_transform(X_train)
    X_test_preprocessed = preprocessing.transform(X_test)

    # Train the classifier on the preprocessed training data.
    xgb_classifier = XGBClassifier(random_state=42, **params)
    xgb_classifier.fit(X_train_preprocessed, y)

    return xgb_classifier, preprocessing, X_train_preprocessed, X_test_preprocessed

# Run the function on the notebook's train/test features and keep the fitted
# classifier, the fitted preprocessing and both transformed matrices.
xgb_classifier, preprocessing, X_train_preprocessed, X_test_preprocessed = process_data(X_train, X_test)



In [9]:
# X_test_preprocessed is ready for prediction: ask the fitted classifier for
# the predicted class of every test row.
predictions = xgb_classifier.predict(X_test_preprocessed)

In [10]:
# Decode the numeric predictions: 1 becomes 'B', anything else becomes 'G'.
predictions_labels = np.where(predictions == 1, 'B', 'G')

In [11]:
# Turn the numeric predictions back into categorical labels
# (1 becomes 'B', anything else becomes 'G').
predictions_labels = np.where(predictions == 1, 'B', 'G')

# Pair every test Id with its predicted label in one table.
results_df = pd.DataFrame(dict(Id=X_test_id, Prediction=predictions_labels))


In [19]:
# Write the Id / Prediction table to a CSV file, without the DataFrame index.
results_df.to_csv("/content/predictions.csv", index=False)

In [17]:
# Count how many times each label was predicted
value_counts = results_df['Prediction'].value_counts()

# Display the counts
print(value_counts)


B    30388
G    19604
Name: Prediction, dtype: int64


In [20]:
# # Création du transformateur de colonnes pour le prétraitement
# preprocessing = make_column_transformer(
#     (OneHotEncoder(handle_unknown='ignore'), ['insee_code', "is_student", "OCCUPATION_42", "ACTIVITY_TYPE", "household", "sex", "employer_category", "job_category", "Terms_of_emp", "Eco_sect", "Job_dep", "WORK_CONDITION", "work_description", "N3", "N2", "N1", "town_type", "dep", "Emp_contract", "Club", "Categorie", 'REG']),
#     (OrdinalEncoder(), ["Highest_degree", "EMPLOYEE_COUNT"]),
#     (IterativeImputer(estimator=RandomForestRegressor(), max_iter=50, random_state=0), ['Working_hours', 'EMOLUMENT']),
#     remainder='passthrough'
# )

# # Création de l'instance XGBClassifier avec les paramètres
# xgb_classifier = XGBClassifier(random_state=42, **xgb_params)


# # Création du pipeline
# pipeline = Pipeline(steps=[
#     # ('imputer', imputer),
#     ('preprocessor', preprocessing),
#     ('classifier', xgb_classifier)
# ])
# pipeline

In [21]:
# def process_data(data, for_training=True):
#     # Transformer de colonnes pour le prétraitement
#     preprocessing = make_column_transformer(
#         (IterativeImputer(estimator=RandomForestRegressor(), max_iter=50, random_state=0), ['Working_hours', 'EMOLUMENT']),
#         (OneHotEncoder(handle_unknown='ignore'), ['insee_code', "is_student", "OCCUPATION_42", "ACTIVITY_TYPE", "household", "sex", "employer_category", "job_category", "Terms_of_emp", "Eco_sect", "Job_dep", "WORK_CONDITION", "work_description", "N3", "N2", "N1", "town_type", "dep", "Emp_contract", "Club", "Categorie", 'REG']),
#         (OrdinalEncoder(), ["Highest_degree", "EMPLOYEE_COUNT"]),
#         remainder='passthrough'
#     )

#     if for_training:
#         # Création de l'instance XGBClassifier avec les paramètres
#         xgb_classifier = XGBClassifier(random_state=42, **xgb_params)

#         # Pipeline pour l'entraînement
#         pipeline = Pipeline([
#             ('preprocessor', preprocessing),
#             ('classifier', xgb_classifier)
#         ])

#         return pipeline.fit(data[0], data[1])

#     else:
#         # Pipeline pour le traitement des données de test
#         pipeline = Pipeline([
#             ('preprocessor', preprocessing)
#         ])

#         return pipeline.fit_transform(data)

# pipeline = process_data((X_train, y_train), for_training=True)
# X_test_transformed = process_data(X_test, for_training=False)
