In [1]:
import pandas as pd
import csv
import os

# Helper that tidies up the column names of a DataFrame
def clean_columns(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label is stripped of leading/trailing whitespace, every run of
    whitespace inside it is replaced by a single underscore, and the result
    is converted to Unicode NFKD form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with its columns renamed.
    """
    names = df.columns.str.strip()
    names = names.str.replace(r'\s+', '_', regex=True)
    df.columns = names.str.normalize('NFKD')
    return df


In [2]:
# Load each cleaned CSV and tidy its column names in a single step
application_train_cleaned = clean_columns(pd.read_csv('cleaned_application_train.csv'))
application_test_cleaned = clean_columns(pd.read_csv('cleaned_application_test.csv'))

# print("Columns in the cleaned application_train_cleaned.csv:")
# print(application_train_cleaned.columns.tolist())


In [3]:
# Fill missing values and one-hot encode the categorical variables
def preprocess_data(df):
    """Impute missing values, then one-hot encode the remaining non-numeric columns.

    Numeric columns are filled with their column mean. Object-dtype columns
    are filled with their most frequent value (the first one when several
    values tie). The frame is then passed through ``pd.get_dummies`` with
    ``drop_first=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, imputed and encoded frame.
    """
    # Mean imputation for the numeric columns.
    numeric_means = df.select_dtypes(include=['number']).mean()
    df = df.fillna(numeric_means)

    # Mode imputation for the object columns. ``mode()`` has no rows when the
    # frame has no object columns (or when they hold no values at all), and
    # ``.iloc[0]`` would then raise IndexError, so only fill when a mode exists.
    categorical_modes = df.select_dtypes(include=['object']).mode()
    if not categorical_modes.empty:
        df = df.fillna(categorical_modes.iloc[0])

    # One-hot encoding; the first level of each variable is dropped.
    return pd.get_dummies(df, drop_first=True)

# Run the same preprocessing over the train frame and the test frame
application_train_cleaned, application_test_cleaned = (
    preprocess_data(application_train_cleaned),
    preprocess_data(application_test_cleaned),
)


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise a frame using statistics learned from a reference frame
def normalize_data(df, reference_df):
    """Align ``df`` to the columns of ``reference_df`` and standardise it.

    The scaler is fitted on ``reference_df`` only and then applied to ``df``,
    so both frames are expressed on the same scale. Fitting a separate scaler
    on each frame would force each one to mean 0 / variance 1 on its own and
    hide any shift in location or spread between them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise; it is not modified.
    reference_df : pandas.DataFrame
        Unscaled frame that defines the expected columns, their order, and
        the mean / standard deviation used for scaling.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``reference_df`` (same order) and the
        index of ``df``, holding the standardised values.
    """
    reference_columns = reference_df.columns.tolist()

    # reindex adds the reference columns that are missing from df (filled
    # with 0), drops extra columns and enforces the reference order, all
    # without touching the caller's frame.
    aligned = df.reindex(columns=reference_columns, fill_value=0)

    # Learn the scaling parameters from the reference data only.
    scaler = StandardScaler().fit(reference_df)

    # NOTE(review): every reference column is scaled, including any label or
    # identifier column present in reference_df - confirm this is intended.
    return pd.DataFrame(
        scaler.transform(aligned),
        columns=reference_columns,
        index=aligned.index,
    )


# Keep a handle on the unscaled training frame: it must serve as the reference
# for BOTH calls, otherwise the test set would be scaled against data that has
# already been standardised.
train_reference = application_train_cleaned
application_train_cleaned = normalize_data(train_reference, train_reference)
application_test_cleaned = normalize_data(application_test_cleaned, train_reference)


In [15]:
import pandas as pd
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# The ten main features to analyse
features = [
    'DAYS_BIRTH',
    'AMT_INCOME_TOTAL',
    'DAYS_REGISTRATION',
    'DAYS_ID_PUBLISH',
    'DAYS_EMPLOYED',
    'SK_ID_CURR',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_ANNUITY',
    'AMT_CREDIT',
    'AMT_GOODS_PRICE',
]

# Restrict both datasets to those features only
application_train_filtered = application_train_cleaned.loc[:, features]
application_test_filtered = application_test_cleaned.loc[:, features]

# The report is only produced when the training frame has a TARGET column
if 'TARGET' not in application_train_cleaned.columns:
    print("The TARGET column is missing from the application_train_cleaned.csv file.")
    # List the available columns to help diagnose the problem
    print("Columns in application_train_cleaned.csv:", application_train_cleaned.columns.tolist())
else:
    print("The 'TARGET' column is present and correctly named.")

    # Build the Evidently report from the DataDriftPreset, with the training
    # data as the reference and the test data as the current data
    report = Report(metrics=[DataDriftPreset()])
    report.run(
        reference_data=application_train_filtered,
        current_data=application_test_filtered,
    )

    # Save the report as an HTML file
    report.save_html('./data_drift_report.html')

    print("Le rapport de Data Drift a été généré et sauvegardé sous 'data_drift_report.html'.")


The 'TARGET' column is present and correctly named.
Le rapport de Data Drift a été généré et sauvegardé sous 'data_drift_report.html'.


In [None]:
# Initialise (or re-initialise) a Git repository in the notebook's current working directory.
!git init

In [None]:
# Set the Git author e-mail and name.
# --global writes to the user-level Git configuration, not only to this repository.
!git config --global user.email "nini.foudil92@gmail.com"
!git config --global user.name "NINI"


In [None]:
# Delete the .git\index.lock file (Windows `del` command and path separator).
# NOTE(review): presumably a stale lock left by an interrupted git command -
# confirm no other git process is running before executing this cell.
!del .git\index.lock


In [None]:

# Stage the two notebooks and the two CSV files for the next commit.
# NOTE(review): the CSV paths staged here (datap7/application_*_cleaned.csv)
# differ from the files read with pd.read_csv earlier in this notebook
# (cleaned_application_*.csv) - confirm which files are intended.
!git add P7_Analyse_exploratoire.ipynb
!git add p7_api.ipynb
!git add datap7/application_train_cleaned.csv
!git add datap7/application_test_cleaned.csv


In [None]:
# Commit the staged files with the given message.
!git commit -m "Ajout des fichiers notebooks et CSV pour le projet"


In [None]:
# Change the URL of the existing 'origin' remote to the p7ocr GitHub repository.
!git remote set-url origin https://github.com/Nini92/p7ocr.git


In [None]:
# Push the local 'master' branch to 'origin' and record it as the upstream branch (-u).
!git push -u origin master


In [16]:
# Set the Git author name and e-mail.
# --global writes to the user-level Git configuration, not only to this repository.
!git config --global user.name "Nini92"
!git config --global user.email "nini.foudil92@gmail.com"


In [17]:
!mkdir NINI_P7
!cd NINI_P7
!git init


Reinitialized existing Git repository in C:/Users/Foudil/Projet7/.git/


In [18]:
# Register the NINI_P7 GitHub repository as the 'origin' remote of the current repository.
!git remote add origin https://github.com/Nini92/NINI_P7.git


fatal: ../P7_Analyse_exploratoire.ipynb: '../P7_Analyse_exploratoire.ipynb' is outside repository at 'C:/Users/Foudil/Projet7'
fatal: ../p7_api.ipynb: '../p7_api.ipynb' is outside repository at 'C:/Users/Foudil/Projet7'
fatal: ../data_drift_report.html: '../data_drift_report.html' is outside repository at 'C:/Users/Foudil/Projet7'
fatal: ../cleaned_application_test.csv: '../cleaned_application_test.csv' is outside repository at 'C:/Users/Foudil/Projet7'
fatal: ../cleaned_application_train.csv: '../cleaned_application_train.csv' is outside repository at 'C:/Users/Foudil/Projet7'
