# Importation des librairies

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import boto3
import sagemaker


# Placeholder for the S3 bucket used by the upload cells further down;
# replace it with a real bucket name before running.
bucket_name = "LE NOM DE TON BUCKET S3"

# SageMaker SDK session, the region it resolves to, and a low-level
# SageMaker client (used later to describe the training job).
sess = sagemaker.Session()
region = sess.boto_session.region_name
sm_boto3 = boto3.client('sagemaker')

banner = "Utilisation de l'instance SageMaker dans la région {} et le bucket {}"
print(banner.format(region, bucket_name))

# Charger les données du DataSet

Dataset : https://www.kaggle.com/datasets/krishnaraj30/finance-loan-approval-prediction-data

In [80]:
# Locations of the two CSV files; only the training file is read here,
# the test path is kept for the inference cells at the end.
train_file_path, test_file_path = './datasets/train.csv', './datasets/test.csv'

df_train = pd.read_csv(filepath_or_buffer=train_file_path)

# Analyse du Dataset

In [None]:
import io

# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so interpolating its return value printed "None".
# Capture the report in a string buffer so it appears under the heading.
info_buffer = io.StringIO()
df_train.info(buf=info_buffer)
info = info_buffer.getvalue()

tete = df_train.head()       # preview rows, displayed by a later cell
data_shape = df_train.shape  # (rows, columns)

no_of_rows, no_of_columns = data_shape
no_of_features = no_of_columns - 1  # every column except one (the target)
tot_num_data = no_of_rows * no_of_columns


print (f'Informations générales sur les données :\n{info}')
print ('-----------------------------------------------------------------------\n')

print (f'Dimensions des données :\n{data_shape}\n')

print (f'Nombre de lignes : {no_of_rows}')
print (f'Nombre de colonnes : {no_of_columns}')
print (f'Nombre de features : {no_of_features}')
print (f'Nombre total de données : {tot_num_data}')


# Données sous forme de Tableau

In [None]:
# Display the preview rows stored in `tete` (the result of df_train.head()
# from the overview cell); the bare name makes it the cell's output.
print("Données vue tableaux")

tete

## Gestion des valeurs nulles/NaN dans le Dataset

In [None]:
# Count the missing entries of every column and print the summary.
missing_mask = df_train.isna()
valeurs_null = missing_mask.sum()

print ('Les données manquantes pour chaque colonne: \n', valeurs_null)

## Compléter les valeurs nulles dans le dataset



2 types de données : 

- Catégorie (Gender, Married, Dependents, Self-employed, Credit history) -> Compléter par le mode de la colonne

- Numérique (Loan amount, Loan amount term) -> Compléter par la moyenne

# Pourquoi ? 

* Au lieu de supprimer des lignes et de perdre beaucoup de données -> on complète par des valeurs non impactantes

* En régression logistique on ne peut avoir de valeur NaN

Mode = valeur la plus fréquente

On remplace par la valeur déjà majoritaire, ce qui n'influence pas le jeu de données

Moyenne 

On complète par la moyenne, ce qui n'influence que très peu le jeu de données

In [84]:
# Fill the gaps of each categorical column with that column's most
# frequent value (its mode).
for column_name in ('Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History'):
    most_frequent = df_train[column_name].mode()[0]
    df_train[column_name] = df_train[column_name].fillna(most_frequent)

In [85]:
# Fill the gaps of the numeric loan columns with the column mean.
for column_name in ('LoanAmount', 'Loan_Amount_Term'):
    column_mean = df_train[column_name].mean()
    df_train[column_name] = df_train[column_name].fillna(column_mean)


In [None]:
# Re-count the missing values per column after the fillna steps; the
# resulting Series is the cell's displayed output.
df_train.isnull().sum()


In [None]:
# Heatmap of the boolean "is missing" mask of df_train (no colour bar).
sns.heatmap(df_train.isnull(), cbar=False, cmap='viridis')

## Exploration des données

## Visualisation des données

### Données numériques

In [None]:
# Pairwise scatter plots of the numeric columns, used to eyeball outliers.
from pandas.plotting import scatter_matrix

num_columns = ['LoanAmount', 'Loan_Amount_Term', 'ApplicantIncome', 'CoapplicantIncome']
numeric_view = df_train[num_columns]
scatter_matrix(numeric_view, figsize=(12, 8))

In [None]:
# Seaborn pair plot over the whole training frame.
sns.pairplot(df_train)

In [None]:
# Box plots drawn from the full training frame on a 15x10 figure, as a
# second look at outliers.
plt.figure(figsize = (15,10))
sns.boxplot(data=df_train)


In [None]:
# Strip plot restricted to the numeric columns selected in `num_columns`.
Outlier_check = df_train[num_columns]

plt.figure(figsize=(15, 10))
sns.stripplot(data=Outlier_check, palette='dark:red', jitter=0.3, size=5)
plt.title('Outlier Check')
plt.show()

### Données catégories

In [None]:
# Categorical columns to plot; the same list is reused later for one-hot encoding.
cat_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Loan_Status']

# One stacked subplot per categorical column.
num_cat = len(cat_columns)
fig, axes = plt.subplots(nrows=num_cat, ncols=1, figsize=(10, 5*num_cat))

# Count plot of every column, split by loan status.
for ax, col in zip(axes, cat_columns):
    sns.countplot(data=df_train, x=col, hue='Loan_Status', palette='Set2', ax=ax)
    ax.set_title(f'plot of {col}')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Share (in %) of applicants without dependents whose loan was refused.
dependents_0 = df_train[df_train['Dependents'] == '0']
total_dependents_0 = len(dependents_0)

# Loan_Status is encoded as 'Y'/'N' in this dataset (the next cell filters
# on 'N'); the previous 'No' never matched, so the result was always 0.
loan_status_no = dependents_0[dependents_0['Loan_Status'] == 'N']

# Guard against an empty subset to avoid a ZeroDivisionError.
if total_dependents_0:
    percent_loan_status_no = (len(loan_status_no) / total_dependents_0) * 100
else:
    percent_loan_status_no = 0.0

percent_loan_status_no

In [None]:
# Rows whose Dependents value is the string '0', and how many there are
# compared with all the other rows.
is_zero_dependents = df_train['Dependents'] == '0'
dependents_zero = df_train[is_zero_dependents]
total_dependents_zero = len(dependents_zero)
total_dependents_notzero = len(df_train[~is_zero_dependents])

# Refused loans ('N') inside that subset, as a percentage of the subset.
refused_mask = dependents_zero['Loan_Status'] == 'N'
loan_status_no = dependents_zero[refused_mask]
percentage_no_loan = (len(loan_status_no) / total_dependents_zero) * 100

print(f"Total number of Dependents '0': {total_dependents_zero}")
print(f"Total number of Dependents not '0': {total_dependents_notzero}")
print(f"Percentage of Dependents '0' with Loan_Status 'No': {percentage_no_loan:.2f}%")
print (len(df_train['Dependents']))
print (len(loan_status_no))

## Gestions des données improbables/aberrantes

Nous utilisons l'amplitude interquartile (IQR) pour identifier les valeurs aberrantes.

Conventionnellement, les valeurs situées en dehors de 1,5 × IQR sont généralement considérées comme des valeurs aberrantes.

In [95]:
# IQR fences for ApplicantIncome: anything outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an outlier.
applicant_income = df_train['ApplicantIncome']
Q1_app, Q3_app = applicant_income.quantile(0.25), applicant_income.quantile(0.75)
IQR = Q3_app - Q1_app
lowerbound_app, upperbound_app = Q1_app - 1.5*IQR, Q3_app + 1.5*IQR

outliers_app = df_train[(applicant_income < lowerbound_app) | (applicant_income > upperbound_app)]

# Same fences for CoapplicantIncome, computed before any row is removed.
coapplicant_income = df_train['CoapplicantIncome']
Q1_co, Q3_co = coapplicant_income.quantile(0.25), coapplicant_income.quantile(0.75)
IQR_co = Q3_co - Q1_co
lowerbound_co, upperbound_co = Q1_co - 1.5*IQR_co, Q3_co + 1.5*IQR_co

outliers_co = df_train[(coapplicant_income < lowerbound_co) | (coapplicant_income > upperbound_co)]

# Drop the outlying rows, one column after the other. The keep-mask is the
# De Morgan form of "not (below or above)" and is rebuilt on the filtered
# frame for the second column so it stays aligned with the frame's index.
keep_app = ~(df_train['ApplicantIncome'] < lowerbound_app) & ~(df_train['ApplicantIncome'] > upperbound_app)
df_train = df_train[keep_app]
keep_co = ~(df_train['CoapplicantIncome'] < lowerbound_co) & ~(df_train['CoapplicantIncome'] > upperbound_co)
df_train = df_train[keep_co]

In [None]:
# Same strip plot as before, now on the frame with the outliers removed.
outlier_check2 = df_train[num_columns]

plt.figure(figsize=(15, 9))
sns.stripplot(data=outlier_check2, palette='dark:red', jitter=0.3, size=5)
plt.show()

print ('Dimension du dataset après avoir traité les données improbables : ', df_train.shape)

### Observations de corrélations

In [None]:
#Selection des colonnes numériques uniquement
numeric_df = df_train.select_dtypes(include=[np.number]) 

plt.figure(figsize=(12, 9))

sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.show()

* On peut observer que Applicant Income est corrélé à Loan_Status
* CoApplicant Income l'est aussi, mais légèrement moins

#### Rappel 

Plus la valeur est proche de 1 plus la variable est corrélée et inversement

# Feature Engineering (ou l'art de combiner des variables)

## OneHotEncoding des valeurs de Catégories

Les valeurs de catégories vont être converties en valeurs numériques pour faciliter l'entraînement du modèle (et permettre la Régression Logistique)

In [98]:
# One-hot encode every column listed in `cat_columns` (including the
# Loan_Status target). drop_first=True drops the first level of each
# column, so a binary column becomes a single indicator such as
# `Loan_Status_Y`, which is renamed further down.
df_train = pd.get_dummies(df_train, columns = cat_columns, drop_first=True)

## Normalisation des features (valeurs X)

- Simplifie les calculs (+ rapide)


### !!! Veiller à "dé-normaliser" aussi les valeurs tests pour obtenir les bonnes valeurs !!!


In [None]:
# Preview the three columns that the scaler cell below standardizes.
df_train.head()[["ApplicantIncome","CoapplicantIncome","LoanAmount"]]

In [None]:
# Range (min / max) of the three columns about to be standardized.
applicant_income = df_train['ApplicantIncome']
coapplicant_income = df_train['CoapplicantIncome']
loan_amount = df_train['LoanAmount']

max_income, min_income = applicant_income.max(), applicant_income.min()
max_co_income, min_co_income = coapplicant_income.max(), coapplicant_income.min()
max_loan_amount, min_loan_amount = loan_amount.max(), loan_amount.min()

separator = "-------------------------------------------------"

print(separator)
print(f"Maximum ApplicantIncome: {max_income}")
print(f"Minimum ApplicantIncome: {min_income}")
print(separator)
print(f"Maximum CoapplicantIncome: {max_co_income}")
print(f"Minimum CoapplicantIncome: {min_co_income}")
print(separator)
print(f"Maximum LoanAmount: {max_loan_amount}")
print(f"Minimum LoanAmount: {min_loan_amount}")

In [101]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the three monetary columns and overwrite them with
# their standardized values. The fitted `standard_scaler` is reused later
# to export its parameters and to transform the test set.
scaled_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
standard_scaler = StandardScaler()
standardized_values = standard_scaler.fit_transform(df_train[scaled_columns])

df_train[scaled_columns] = standardized_values

In [None]:
# Preview the frame after standardization.
df_train.head()

In [None]:
import json

# Per-column mean and scale learned by the fitted scaler, converted to
# plain lists so they can be serialized to JSON.
scaler_params = dict(
    mean=standard_scaler.mean_.tolist(),
    scale=standard_scaler.scale_.tolist(),
)

scaler_params

In [104]:
# Write the scaler parameters to a local JSON file ...
local_params_file = 'scaler_params.json'
with open(local_params_file, 'w') as f:
    f.write(json.dumps(scaler_params))

# ... then copy that file to the bucket under the scalers/ prefix.
s3_client = boto3.client('s3')
s3_client.upload_file(local_params_file, bucket_name, 'scalers/scaler_params.json')
s3_client.close()

## Création de nouvelles features

Combiner des valeurs pour créer de nouvelles variables

Nouvelles relations :

* Total Income = Applicant Income + Co-applicant income
* Loan-to-income ratio = Loan amount / Total income

### Pourquoi ?

- Permet de réduire le nombre de données 
- éviter le over-fitting

over-fitting : quand le modèle apprend trop précisément les données d'entraînement et ne parvient donc plus à généraliser

In [None]:
# Total income = applicant income + co-applicant income.
# NOTE(review): if the StandardScaler cell has already run, both operands
# are standardized values, so this is a sum of z-scores that can be zero or
# negative. The inference cell builds Total_Income the same way, so the
# two must be changed together.
total_income = df_train['ApplicantIncome'].add(df_train['CoapplicantIncome'])
df_train['Total_Income'] = total_income

# Loan-to-income ratio = loan amount / total income.
# NOTE(review): with a standardized denominator this ratio can be very
# large or infinite - confirm whether it should use the raw amounts.
df_train['Loan_to_Income'] = df_train['LoanAmount'].div(df_train['Total_Income'])

# Preview the frame with the two new columns.
df_train.head()

## Pourcentage de répartition des classes 


In [None]:
# Percentage of rows per value of the one-hot target column Loan_Status_Y.
df_train['Loan_Status_Y'].value_counts(normalize=True)*100

### Pourquoi ? 

Les données de la vie réelle sont un gros bordel. 

Donc ca nous permet d'identifier : 

- Un déséquilibre des classes
- ajuster les techniques de modélisations
- savoir quelles métriques (valeurs) d'évaluation utiliser

Dans ce cas présent la répartition est bonne.

Pas besoin donc de rajouter des valeurs ou de les corriger pour l'overfitting ou underfitting

In [None]:
# Shorter names for the single-indicator dummy columns.
column_update = {'Gender_Male': 'Gender', 'Married_Yes': 'Married',
                'Self_Employed_Yes': 'Self_Employed', 'Loan_Status_Y': 'Loan_Status' }

# Remove the Loan_ID and Loan_Amount_Term columns, then apply the renaming.
df_train = (
    df_train
    .drop(columns=['Loan_ID', 'Loan_Amount_Term'])
    .rename(columns=column_update)
)

# Show the updated dataset.
df_train.head()

# Construction du modèle

## Préparation des données

1. Séparer le Dataset en Features (X) and Targets (Y)

2. Train-Test Split: Split data into training and test sets (80% train, 20% test).

In [None]:
all_columns = list(df_train.columns)
print(all_columns)

# The target is named explicitly. The previous positional `pop(-1)` took
# whichever column was last - 'Loan_to_Income' once the feature-engineering
# cell has run - and stored that name in `label`.
label = "Loan_Status"

# Keep X identical to what the positional pop produced: every column except
# 'Loan_to_Income'. X therefore still carries the Loan_Status column, which
# the CSV export below and script.py depend on (the script reads the label
# from the exported files).
# NOTE(review): confirm whether Loan_to_Income should be a model feature;
# adding it also requires updating the inference preprocessing.
features = [column for column in all_columns if column != 'Loan_to_Income']

X = df_train[features]  # exported frame: features plus the label column

Y = df_train[label]  # target

print ('Shape of X: ', X.shape)
print ('Shape of Y: ', Y.shape)

X.head()

## Documentation de Scikit-Learn

Documentation : https://scikit-learn.org/0.21/tutorial/basic/tutorial.html

Vidéo sur le sujet : https://www.youtube.com/watch?v=0B5eIE_1vpU&pp=ygUSc2Npa2l0IGxlYXJuIGJhc2lj

Livre Hands on Machine Learning: https://www.amazon.fr/Hands-Machine-Learning-Scikit-learn-Tensorflow/dp/1098125975/ref=asc_df_1098125975?mcid=c65f200626d13a1ab3d5944cd898aebd&tag=googshopfr-21&linkCode=df0&hvadid=701510839370&hvpos=&hvnetw=g&hvrand=15667932878931070552&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9109552&hvtargid=pla-1651497364252&psc=1&gad_source=1

In [112]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

In [None]:
# Missing-value count per column of the test split.
X_test.isnull().sum()

In [None]:
# Missing-value count per column of the training split.
X_train.isnull().sum()

## Importation des données sur S3

In [115]:
# Write both splits to local CSV files without the index column. The file
# names match the --train-file / --test-file defaults of script.py.
# NOTE(review): only the X frames are written; script.py expects a
# 'Loan_Status' column in these files - verify that X contains it.
X_train.to_csv('train-V-1.csv', index=False)
X_test.to_csv('test-V-1.csv', index=False)


In [116]:
# Key prefix under which both CSV files are stored in the bucket.
sk_prefix = "sagemaker/finance-loan-prediction/sklearncontainer"

# Destination shared by both uploads; the returned values are kept and
# later passed to the estimator as the train/test channels.
upload_target = dict(bucket=bucket_name, key_prefix=sk_prefix)

train_path_s3 = sess.upload_data(path="train-V-1.csv", **upload_target)
test_path_s3 = sess.upload_data(path="test-V-1.csv", **upload_target)

In [None]:
# Show the values returned by the two upload calls.
print(f"Training data uploaded to: {train_path_s3}")
print(f"Test data uploaded to: {test_path_s3}")

In [None]:
%%writefile script.py


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# inference functions ---------------

# def input_fn(request_body, request_content_type):
#     print(request_body)
#     print(request_content_type)
#     if request_content_type == "text/csv":
#         request_body = request_body.strip()
#         try:
#             df = pd.read_csv(StringIO(request_body), header=None)
#             return df
        
#         except Exception as e:
#             print(e)
#     else:
#         return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` in *model_dir*."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

# def predict_fn(input_data, model):
#     if type(input_data) != str:
#         prediction = model.predict(input_data)
#         print(prediction)
#         return prediction
#     else:
#         return input_data
        
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # (flag, type, default) for every supported option: the two
    # hyperparameters first, then the data / model locations, whose
    # defaults come from the SM_* environment variables.
    cli_options = (
        ("--n_estimators", int, 100),
        ("--random_state", int, 0),
        ("--model-dir", str, os.environ.get("SM_MODEL_DIR")),
        ("--train", str, os.environ.get("SM_CHANNEL_TRAIN")),
        ("--test", str, os.environ.get("SM_CHANNEL_TEST")),
        ("--train-file", str, "train-V-1.csv"),
        ("--test-file", str, "test-V-1.csv"),
    )
    for flag, value_type, fallback in cli_options:
        parser.add_argument(flag, type=value_type, default=fallback)

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data", end="\n\n")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # Every column of the training file except the label is a feature.
    label = "Loan_Status"
    features = [name for name in train_df.columns if name != label]

    print("Building training and testing datasets", end="\n\n")
    X_train, X_test = train_df[features], test_df[features]
    y_train, y_test = train_df[label], test_df[label]

    print('Column order: ')
    print(features, end="\n\n")

    print("Label column is: ", label, end="\n\n")

    print("Data Shape: ", end="\n\n")
    print("---- SHAPE OF TRAINING DATA (80%) ----")
    print(X_train.shape)
    print(y_train.shape, end="\n\n")
    print("---- SHAPE OF TESTING DATA (20%) ----")
    print(X_test.shape)
    print(y_test.shape, end="\n\n")

    print("Training RandomForest Model.....", end="\n\n")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist the fitted model under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path, end="\n\n")

    # Evaluate on the held-out file.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----", end="\n\n")
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

In [None]:
! python script.py --model-dir ./ \
                   --train ./ \
                   --test ./ \

In [120]:
from sagemaker.sklearn.estimator import SKLearn

# Framework version of the scikit-learn container; reused when the model
# object is created for deployment.
FRAMEWORK_VERSION = "0.23-1"

# Hyperparameters forwarded to script.py as command-line arguments.
rf_hyperparameters = {
    "n_estimators": 100,
    "random_state": 0,
}

# Spot-instance settings for the training job.
spot_settings = dict(
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="TON EXECUTION ROLE ARN",  # placeholder: replace with a real role ARN
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters=rf_hyperparameters,
    **spot_settings,
)

In [None]:
# Launch the training job on the uploaded train/test channels. wait=True is
# passed, so this call is asked to block until the job completes (it is not
# an asynchronous launch).
sklearn_estimator.fit({"train": train_path_s3, "test": test_path_s3}, wait=True)
# Single-channel variant kept for reference:
# sklearn_estimator.fit({"train": datapath}, wait=True)


In [None]:
# Wait on the latest training job (logs="None"), then read the S3 location
# of its model artifact from the job description.
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


In [None]:
# Display the S3 URI of the model artifact.
artifact

In [124]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped (UTC) model name.
creation_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = "Custom-sklearn-model-" + creation_stamp

# Model object built from the training artifact; script.py provides model_fn.
model_settings = dict(
    name=model_name,
    model_data=artifact,
    role="TON EXECUTION ROLE ARN",  # placeholder: replace with a real role ARN
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)
model = SKLearnModel(**model_settings)


In [None]:
# Display the SKLearnModel object.
model

In [None]:
# Timestamped (UTC) endpoint name.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Custom-sklearn-model-" + deploy_stamp
print("EndpointName={}".format(endpoint_name))

# Deploy the model on a single instance; the returned predictor is used by
# the final cell to send requests.
deployment_settings = dict(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)
predictor = model.deploy(**deployment_settings)


In [None]:
df_test = pd.read_csv(test_file_path)

# Apply the same imputation as on the training set (mode for categorical
# columns, mean for numeric ones). Without it, any gap in test.csv flows
# through get_dummies / scaling as NaN and reaches the model.
# The training-set statistics are not kept in the notebook, so each column
# is filled from the test file's own values.
for col_name in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    col_modes = df_test[col_name].mode()
    if not col_modes.empty:  # mode() is empty when the column is all-NaN
        df_test[col_name] = df_test[col_name].fillna(col_modes[0])

for col_name in ['LoanAmount', 'Loan_Amount_Term']:
    df_test[col_name] = df_test[col_name].fillna(df_test[col_name].mean())

# Same categorical columns as for training, minus the Loan_Status target.
cat_columns_test = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

df_test = pd.get_dummies(df_test, columns = cat_columns_test, drop_first=True)

# Same renaming as for the training frame; keys that are absent from the
# test columns are simply ignored by rename().
column_update = {'Gender_Male': 'Gender', 'Married_Yes': 'Married',
                'Self_Employed_Yes': 'Self_Employed', 'Loan_Status_Y': 'Loan_Status' }

df_test.rename(columns=column_update, inplace=True)

df_test = df_test.drop(['Loan_ID', 'Loan_Amount_Term'], axis=1)

df_test

In [None]:
# Standardize the test columns with the scaler fitted on the training data
# (transform only, no refit).
scaled_test_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df_test[scaled_test_columns] = standard_scaler.transform(df_test[scaled_test_columns])

# Total income is built from the scaled columns, as in the training cell.
df_test['Total_Income'] = df_test['ApplicantIncome'].add(df_test['CoapplicantIncome'])

df_test

In [None]:
# First five rows of the test frame, without any Loan_Status column.
feature_mask = df_test.columns != 'Loan_Status'
valeur_test = df_test.loc[:, feature_mask].iloc[0:5]

valeur_test

In [None]:
# Send the sample rows to the deployed endpoint and print its response.
print(predictor.predict(valeur_test))