Goal: Build a predictive model to estimate a socio-economic status characteristic.
Key Deliverables:
- Predictive model results.
- A comparison of at least two machine learning algorithms.
- Predictions on the test set in the specified format.
- A reproducible, high-quality report.

In [48]:
# IMPORTS 
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn import impute, experimental
import os
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

ModuleNotFoundError: No module named 'seaborn'

In [19]:
# LOADING DATA
    # Learning data
learn_df = pd.read_csv(r'learn_dataset.csv')
learn_df_job = pd.read_csv(r'learn_dataset_job.csv')
learn_df_emp = pd.read_csv(r'learn_dataset_Emp.csv')
learn_df_sport = pd.read_csv(r'learn_dataset_sport.csv')
learn_df_ret_former = pd.read_csv(r'learn_dataset_retired_former.csv')
learn_df_sport_ret_job = pd.read_csv(r'learn_dataset_retired_jobs.csv')
learn_df_sport_ret_pension = pd.read_csv(r'learn_dataset_retired_pension.csv')

    # Test data
test_df = pd.read_csv(r'test_dataset.csv')
test_df_job = pd.read_csv(r'test_dataset_job.csv')
test_df_emp = pd.read_csv(r'test_dataset_Emp.csv')
test_df_sport = pd.read_csv(r'test_dataset_sport.csv')
test_df_ret_former = pd.read_csv(r'test_dataset_retired_former.csv')
test_df_sport_ret_job = pd.read_csv(r'test_dataset_retired_jobs.csv')
test_df_sport_ret_pension = pd.read_csv(r'test_dataset_retired_pension.csv')

    # Mapping data
# To load au fur et à mesure des besoins

# With PRIMARY_KEY used to link datasets - the Identifier 
# Merging all
list_learn = [
    'learn_df', 'learn_df_job', 'learn_df_emp', 'learn_df_sport',
    'learn_df_ret_former', 'learn_df_sport_ret_job', 'learn_df_sport_ret_pension'
]

list_test = [
    'test_df', 'test_df_job', 'test_df_emp', 'test_df_sport',
    'test_df_ret_former', 'test_df_sport_ret_job', 'test_df_sport_ret_pension'
]

# building the dictionary for the function (to merge all)
learn_dic = {name: globals()[name] for name in list_learn}
test_dic = {name: globals()[name] for name in list_test}

def merge_dfs(datasets_dic, key_column, merge_type="left"):
    # Load the first dataset to initialize main_df
    main_df = list(datasets_dic.values())[0]

    # Iterate over the remaining datasets and merge them
    for dataset in list(datasets_dic.values())[1:]:
        main_df = main_df.merge(dataset, on=key_column, how=merge_type)
    
    return main_df

### Exploring the data

In [23]:
learn_merged_data = merge_dfs(learn_dic, key_column="PRIMARY_KEY") # no missings on the target column
test_merged_data = merge_dfs(test_dic, key_column="PRIMARY_KEY")

In [11]:
learn_merged_data.columns
test_merged_data.columns

Index(['PRIMARY_KEY', 'insee', 'Occupation_42', 'Age_2018', 'IS_STUDENT',
       'highest_diploma', 'Act', 'FAMILY_TYPE', 'SEX', 'WORKING_HOURS_x',
       'EMPLOYER_CATEGORY_x', 'WAGES', 'JOB_DESCRIPTION_x', 'JOB_CATEGORY_x',
       'CONTRACT_TYPE_x', 'JOB_DEP_x', 'Job_condition_x', 'activity_sector_x',
       'Employee_count_x', 'Emp', 'club', 'Former_occupation_42', 'Former_emp',
       'retirement_age', 'activity_sector_y', 'Former_dep', 'JOB_DEP_y',
       'JOB_DESCRIPTION_y', 'CONTRACT_TYPE_y', 'JOB_CATEGORY_y',
       'Job_condition_y', 'EMPLOYER_CATEGORY_y', 'WORKING_HOURS_y',
       'Employee_count_y', 'pension_plan_payments'],
      dtype='object')

In [None]:
summary = learn_merged_data.describe() # for numerical values
summary

Unnamed: 0,PRIMARY_KEY,Age_2018,WORKING_HOURS,WAGES,retirement_age,ret_WORKING_HOURS,pension_plan_payments
count,50043.0,50043.0,19339.0,19352.0,13019.0,10994.0,11003.0
mean,50081.387087,49.420578,1480.866074,22688.625775,60.323681,1345.436329,18582.447423
std,28898.102717,20.624439,540.123814,14133.362614,2.873665,598.269351,7828.204575
min,2.0,15.0,7.0,314.0,34.0,34.0,8881.0
25%,25108.5,32.0,1211.0,13198.5,60.0,892.0,13163.0
50%,50071.0,49.0,1676.0,20484.0,60.0,1517.0,16987.0
75%,75024.0,65.0,1820.0,28970.5,62.0,1820.0,21834.0
max,100084.0,119.0,3000.0,146523.0,70.0,3000.0,149194.0


In [27]:
missing_rate = learn_merged_data.isnull().mean() * 100
missing_rate # wesh that's a lot of missings RIP

PRIMARY_KEY               0.000000
insee                     0.000000
Occupation_42             0.000000
Age_2018                  0.000000
IS_STUDENT                0.000000
highest_diploma           0.000000
Act                       0.000000
FAMILY_TYPE               0.000000
SEX                       0.000000
target                    0.000000
WORKING_HOURS            61.355234
EMPLOYER_CATEGORY        62.596167
WAGES                    61.329257
CONTRACT_TYPE            61.329257
JOB_DEP                  61.377216
activity_sector          61.329257
Emp                      51.641588
club                     87.069121
Former_occupation_42     73.984373
Former_emp               73.984373
retirement_age           73.984373
ret_activity_sector      78.012909
Former_dep               78.700318
ret_JOB_DEP              78.662350
ret_CONTRACT_TYPE        78.012909
ret_EMPLOYER_CATEGORY    79.515617
ret_WORKING_HOURS        78.030893
pension_plan_payments    78.012909
dtype: float64

### Data preprocessing - CLEANING + handling the missings and unsure variables are formatted corectly 

In [None]:
# renaming columns for retirement 
learn_merged_data.rename(columns={'JOB_DESCRIPTION_x': 'JOB_DESCRIPTION', 
                                'WORKING_HOURS_x': 'WORKING_HOURS',
                                'EMPLOYER_CATEGORY_x': 'EMPLOYER_CATEGORY',
                                'JOB_CATEGORY_x': 'JOB_CATEGORY',
                                'CONTRACT_TYPE_x': 'CONTRACT_TYPE',
                                'JOB_DEP_x': 'JOB_DEP',
                                'activity_sector_x': 'activity_sector',
                                'Employee_count_x': 'Employee_count',
                                'activity_sector_y': 'ret_activity_sector',
                                'JOB_DEP_y': 'ret_JOB_DEP',
                                'JOB_DESCRIPTION_y': 'ret_JOB_DESCRIPTION',
                                'CONTRACT_TYPE_y': 'ret_CONTRACT_TYPE',
                                'JOB_CATEGORY_y': 'ret_JOB_CATEGORY',
                                'Job_condition_y': 'ret_Job_condition',
                                'EMPLOYER_CATEGORY_y': 'ret_EMPLOYER_CATEGORY',
                                'WORKING_HOURS_y': 'ret_WORKING_HOURS',
                                'Employee_count_y': 'ret_Employee_count'}, inplace=True)


# clean a bit and dropping coluumns - If a variable has too much missing data (e.g., >50%)
# # we drop it, especially if it isn't critical to the target variable.
learn_merged_data.drop(columns=['Job_condition_x', 
                                'ret_Job_condition',
                                'JOB_DESCRIPTION', 
                                'ret_JOB_DESCRIPTION', 
                                'JOB_CATEGORY', 
                                'ret_JOB_CATEGORY',
                                'Employee_count',
                                'ret_Employee_count'], inplace=True)

In [29]:
    # Missings - sachant qu'ils ne faut pas les drop du test set: "observations with missing data cannot be removed from the test set"
# c'est important car: 
# Handling cases where some information is entirely missing for a given person 
# permettra de ensure that our model can still make meaningful predictions 
# without introducing bias or inaccuracies.
   
    # isolons les rows with missings 

print(learn_merged_data.shape[0])

df_na_only = learn_merged_data[learn_merged_data.isna().any(axis=1)]
print(df_na_only.shape[0]) # all individuals have at least one missing value 

df_na_only['NaN_count'] = df_na_only.isna().sum(axis=1) # max sum NA = 18 
df_max_na = df_na_only[df_na_only['NaN_count'] == 18]
print(df_max_na.shape[0]) # 10 595 individuals where : information entirely missing (outside the initial info coming from the learn_df)


50043
50043
10595


DONE BUT TO CHECK AGAIN : on pourrait fill les missings selon le type de la variable 
Numerical variables - on fill en utilisant the patterns in other variables to estimate missing values.
    # Example: Iterative Imputation with multiple models

    # DOCUMENTATION: 
The IterativeImputer in sklearn is a great tool for this, as it models each feature with missing values 
as a function of other features and iteratively imputes missing values.

    # Initialize IterativeImputer with different estimators (models)
You can specify different models for each feature using the 'estimator' parameter

Using LinearRegression and RandomForestRegressor as estimators
LinearRegression for imputation of continuous features
RandomForestRegressor can be used to model non-linear relationships

Note: You can use any estimator that works for your data (regression models, decision trees, etc.)

In [36]:
iterative_imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    max_iter=10,  # Maximum number of iterations
    random_state=42
)

# this method only handles numerical values
numerical_cols = learn_merged_data.select_dtypes(include=['float64', 'int64']).columns

# Apply the iterative imputer to fill missing values
# This will fill missing values using patterns in other variables
# df_imputed = pd.DataFrame(iterative_imputer.fit_transform(learn_merged_data), columns=learn_merged_data.columns)
learn_merged_data[numerical_cols] = iterative_imputer.fit_transform(learn_merged_data[numerical_cols])




In [None]:
# Categorical variables - replace missings with "unkonwn" 
# This allows the model to treat missing values as a separate category 
# rather than just ignoring them or replacing them with the most common value.
categorical_cols = learn_merged_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    learn_merged_data[col].fillna('unknown', inplace=True)

In [None]:
missing_rate = learn_merged_data.isnull().mean() * 100
missing_rate # OK à vérifier sur la méthode et la cohérence des values qui ont été fill

TODO : Should we reduce the dataset to keep a predictive power ? Genre filtrer que sur certaines villes/sur une tranche d'âges ...

In [None]:
# Correlation for numerical variables
correlation_matrix = learn_merged_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

### Build predictive models - generate predictions

### Evaluate and optimize