

# <font size="+3"><span style='color:#2994ff'> **P7 - Implémentez un modèle de scoring** </span></font>


<a id='LOADING_LIBRARIES'></a>

---

---

<font size="+1"> **LOADING THE LIBRARIES** </font>

---

In [3]:
# File system management
import sys
import pandas as pd
import numpy as np
import os
import pickle

# Data drift evidently
import evidently
from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset, RegressionPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset, RegressionTestPreset
from evidently.tests import *


# Personal packages
import tools_dataframe
import tools_preprocessing
import tools_feat_engineering
import tools_modeling


# Warnings
# Both calls below install a catch-all 'ignore' filter, so every warning
# raised after this point is hidden from the notebook output.
# NOTE(review): this also hides deprecation notices from the libraries
# imported above — consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')



  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [4]:
# Report the interpreter and library versions this notebook was run with
print('Version of used libraries :')

print(f'Python    : {sys.version}')
print(f'NumPy     : {np.version.full_version}')
print(f'Pandas    : {pd.__version__}')
print(f'Evidently : {evidently.__version__}')

Version of used libraries :
Python    : 3.11.3 (main, Apr 19 2023, 23:54:32) [GCC 11.2.0]
NumPy     : 1.24.3
Pandas    : 1.5.3
Evidently : 0.4.0



<a id='notebook_settings'></a>


<br>


---
---

<font size="+1"> **NOTEBOOK SETTINGS** </font>

---


In [5]:
#################################
#    -- NOTEBOOK SETTINGS --    #
#################################

%matplotlib inline

# Random state
seed = 84

# Define training set size
TRAIN_SIZE = 0.8




<font size="+3"><span style='color:#2994ff'> **P7 - Implémentez un modèle de scoring** </span></font>



## <font color = '#0085dd'>**Table of contents**</font>

---


[**Loading datasets**](#datasets_loading)
 * [Data preparation](#data_preparation)
 * [Data drift analysis](#data_drift_analysis)

---


<a id='datasets_loading'></a>

---
---

# <span style='background:#2994ff'><span style='color:white'>**Loading datasets** </span></span>


In [6]:
# Folder containing the files with the project data.
# The location can be overridden with the P7_SCORING_CREDIT_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used.
P7_scoring_credit = os.environ.get(
    "P7_SCORING_CREDIT_DIR",
    "/home/raquelsp/Documents/Openclassrooms/P7_implementez_modele_scoring/P7_travail/",
)

# Make the project folder the working directory: the relative data paths
# used when loading the datasets are resolved from here.
os.chdir(P7_scoring_credit)

In [7]:
# ------------------------------------------------------
# Datasets saved after the feature-engineering step
# ------------------------------------------------------
# Paths are relative to the current working directory.
path_train_data = (
    'P7_scoring_credit/preprocessing/train_data_fs_t25_combi_ML.pkl'
)
path_test_data = (
    'P7_scoring_credit/preprocessing/test_data_fs_t25_combi_ML.pkl'
)

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that come from a trusted source.

# Final train dataset: used as the reference data
with open(path_train_data, 'rb') as train_file:
    train_data_fe = pickle.load(train_file)

# Final test dataset: used as the current data
with open(path_test_data, 'rb') as test_file:
    test_data_fe = pickle.load(test_file)

<a id='data_preparation'></a>

## <span style='background:#0085dd'><span style='color:white'>Data preparation</span></span>

**Data after feature engineering**

In [8]:
# ---------------------------------------------
# Per-column description of the training data
# ---------------------------------------------
# Build the description table with the project helper, then preview five
# randomly chosen rows of it (the preview changes from run to run).
info_train_data_fe = tools_dataframe.complet_description(train_data_fe)
info_train_data_fe.sample(n=5)

Unnamed: 0,Variable,Type,null,Duplicated,Filling percentage,count,mean,std,min,25%,50%,75%,max
0,AMT_CREDIT,float32,0,301908,100.0,307511.0,0.0,1.00001,-1.376496,-0.817476,-0.212415,0.520818,8.574059
24,EXT_SOURCE_1,float32,0,192927,100.0,307511.0,0.0,0.99995,-3.513017,0.012103,0.012103,0.012103,3.288063
29,CREDIT_ANNUITY_RATIO,float32,0,268417,100.0,307511.0,0.0,1.000057,-1.953928,-0.766587,-0.206075,0.701369,4.850109
19,REGION_POPULATION_RELATIVE,float32,0,307430,100.0,307511.0,0.0,0.999912,-1.487798,-0.785331,-0.145909,0.56357,3.733564
5,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN_CREDITACTIVE...,float32,0,303514,100.0,307511.0,0.0,0.999875,-268.487915,-0.016791,-0.016791,-0.016791,207.717712


In [9]:
# Flag the columns whose 'Filling percentage' is below 1: these are the ones
# treated as empty.
low_fill_mask = info_train_data_fe['Filling percentage'] < 1
to_remove = info_train_data_fe.loc[low_fill_mask]

# Names of the flagged columns, as a plain list
cols_to_remove = to_remove['Variable'].tolist()
print(f'There are {len(cols_to_remove)} empty columns')

There are 0 empty columns


In [10]:
# Remove empty columns
train_data_fe = train_data_fe[train_data_fe\
                    .columns[~train_data_fe.columns.isin(cols_to_remove)]]
test_data_fe = test_data_fe[test_data_fe\
                    .columns[~test_data_fe.columns.isin(cols_to_remove)]]

In [11]:
# Reference data: training set without the TARGET and SK_ID_CURR columns,
# so that it holds the same columns as the current data.
reference_fe = train_data_fe.drop(columns=['TARGET', 'SK_ID_CURR'])
print('Reference' + str(reference_fe.shape))
# Draw 10,000 distinct rows. random_state is set from the notebook seed so
# the same rows are selected on every run and the drift report is
# reproducible.
reference_fe_10000 = reference_fe.sample(n=10000, replace=False,
                                         random_state=seed)

# Current data: test set without the SK_ID_CURR column
current_fe = test_data_fe.drop(columns=['SK_ID_CURR'])
print('Current' + str(current_fe.shape))
# Same seeded sampling for the current data
current_fe_10000 = current_fe.sample(n=10000, replace=False,
                                     random_state=seed)

Reference(307511, 34)
Current(48744, 34)


<a id='data_drift_analysis'></a>

## <span style='background:#0085dd'><span style='color:white'>Data drift analysis</span></span>

In [12]:
# Data drift report on the data obtained after feature engineering:
# the sampled training rows are the reference, the sampled test rows are
# the current data.
feng_report = Report(metrics=[DataDriftPreset()])
feng_report.run(
    current_data=current_fe_10000,
    reference_data=reference_fe_10000,
)

# Export the report as a standalone HTML file in the working directory.
# NOTE(review): the file name says 10001 while the samples hold 10000 rows —
# confirm the name is intended.
feng_report.save_html('feng_report10001.html')