In [None]:
#importation des bibliothèques 

#Le module "rcParams" est utilisé pour configurer les paramètres de rendu dans Matplotlib, tels que les tailles de police, les couleurs par défaut, etc.
#%matplotlib inline :  spécifique à Jupyter Notebook . Cette directive indique à Jupyter d'afficher les graphiques générés par Matplotlib directement dans le notebook, plutôt que dans une fenêtre séparée. 
#NumPy : Une bibliothèque Python essentielle pour les calculs scientifiques
#Pandas : Une bibliothèque Python puissante pour la manipulation et l'analyse de données, offrant des structures de données flexibles (Series et DataFrame) pour importer, nettoyer, transformer et analyser des jeux de données de manière efficace.
#Seaborn : Une bibliothèque de visualisation de données basée sur Matplotlib
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

### Load data

In [None]:
# Load the dataset, export a CSV copy and preview it.
# Public placeholder – change locally. The earlier load from a hard-coded
# personal Windows path was removed: its result was overwritten a few lines
# later, and the path only exists on one machine.
DATA_PATH = "data/your_file.csv"   # <- replace locally
# Pick the reader from the file extension: pd.read_excel cannot parse a .csv
# file, which is what the placeholder path points at.
if DATA_PATH.lower().endswith(('.xls', '.xlsx', '.xlsm')):
    df = pd.read_excel(DATA_PATH)
else:
    df = pd.read_csv(DATA_PATH)
df.to_csv('Prediction.csv', index=False)
# Last expression of the cell so the preview is actually rendered.
df.head()

#### Data Preparation

In [None]:
# Swap every comma for a period in the columns listed below
# (regex replacement, so it also applies inside longer strings).
decimal_comma_columns = ['irr', 'Yield Index', 'IRR @ SELL', 'Gross Margin @ Mature Year']
df[decimal_comma_columns] = df[decimal_comma_columns].replace(',', '.', regex=True)

In [None]:
df2=df.copy()
df2.shape

In [None]:
# Index labels of the rows whose project type is one of the excluded categories
excluded_project_types = ['Type_A', 'Type_B', 'Type_C', 'Type_D','Type_E']
indices_a_supprimer = df2.index[df2['project_type'].isin(excluded_project_types)]
# Drop those rows and check the resulting shape
df2 = df2.drop(index=indices_a_supprimer)
df2.shape

In [None]:
# Remove lines where 'Cost Bucket' equals 'Yes'
df2 = df2[df2['Cost bucket'] != 'Yes']
df2.shape

In [None]:
df2.nature.unique()

In [None]:
# Element-wise mapper for the "nature" column (used with Series.apply below).
def transform_nature(nature):
    """Return 'Project' when the value is exactly 'Project', otherwise 'Agile'."""
    return 'Project' if nature == 'Project' else 'Agile'

df2['nature'] = df2['nature'].apply(transform_nature)

# Afficher le résultat
df2.head()

In [None]:
# Keep only the columns needed downstream.
# A single list drives the selection. The original follow-up drop() used a
# second, slightly different list ('portfolio_level_2' and 'Type' instead of
# 'Pportfolio_level_2' and 'project_type'), which silently removed two of the
# columns selected one statement earlier even though later cells still read them.
# NOTE(review): 'Pportfolio_level_2' looks like a typo, but it is the spelling
# this selection and the later `cols_to_use` list both rely on — confirm
# against the dataset header.
columns_to_keep = [
    'project_id', 'portfolio_level_1', 'Pportfolio_level_2', 'Budget Prior', 'project_type', 'nature', 'irr', 'Framework', 'status',
    'stage_OPEN', 'stage_CLOSE', 'stage_SELL', 'stage_ENGAGE', 'stage_DO', 'stage_SELECT', 'stage_IMPLEMENT', 'stage_PRODUCE',
    'Program Holder', 'Program ID', 'Horizon', 'Business Case Id',
    'Budget Total', 'Budget Total+CAPEX', 'Target Y+1', 'Total Costs + CAPEX', 'Costs Total',
]
df2 = df2.loc[:, columns_to_keep]

# Display the result
df2.head()

# Afficher le résultat
df2.head()

In [None]:
df2.shape

In [None]:
# Strip surrounding whitespace from each stage column, then parse the text as "YYYY-MM-DD" datetimes.
columns_to_transform = ['stage_OPEN','stage_CLOSE','stage_SELL','stage_ENGAGE','stage_DO','stage_SELECT','stage_IMPLEMENT','stage_PRODUCE']

for stage_column in columns_to_transform:
    stripped_text = df2[stage_column].str.strip()
    df2[stage_column] = pd.to_datetime(stripped_text, format="%Y-%m-%d")


In [None]:
date_columns = ['stage_OPEN','stage_CLOSE','stage_SELL','stage_ENGAGE','stage_DO','stage_SELECT','stage_IMPLEMENT','stage_PRODUCE']
# calculate the duration for each row and store the results in a new column "Project Duration"
df2['Project Duration'] = df2[date_columns].max(axis=1) - df2[date_columns].min(axis=1)
df2['Project Duration'] = df2['Project Duration'].dt.days
# Delete date columns
df2 = df2.drop(columns=date_columns)

In [None]:
df2.shape

In [None]:
# Collect the index labels of the rows whose status is Frozen, Opened, Under Simulation or Planned
indices_a_supprimer = df2[df2['status'].isin(['Frozen','Opened','Under Simulation', 'Planned'])].index
# Drop those rows (this removes rows, not columns) and check the resulting shape
df2 = df2.drop(indices_a_supprimer)
df2.shape

In [None]:
# Count the number of missing or zero values ​​in "Project Duration" for each "status" value
counts = df2.groupby('status')['Project Duration'].apply(lambda x: (x.isnull() | (x == 0)).sum())

print("number of missing or zero values ​​in 'Project Duration' for each 'status' value:")
print(counts)

In [None]:
# Count the number of missing values in "Project Duration" for each "status" value
missing_counts = df2.groupby('status')['Project Duration'].apply(lambda x: x.isnull().sum())

# Count the number of zeros in "Project Duration" for each "status" value
zero_counts = df2.groupby('status')['Project Duration'].apply(lambda x: (x == 0).sum())

# The first label used to read "missing or zero values" although only the
# missing counts are printed under it; the zeros get their own label below.
print("Number of missing values in 'Project Duration' for each 'status' value:")
print(missing_counts)
print("\nNumber of zeros in 'Project Duration' for each 'status' value:")
print(zero_counts)

In [None]:
# Map a value of the Horizon column to a numeric score.
def get_horizon_value(Horizon):
    """Return 0 for horizon 'H1', 1 for 'H3', and 0.5 for anything else.

    String inputs are compared after stripping surrounding whitespace, so
    'H1', 'H1 ' and ' H1' are all treated alike. The original only matched the
    exact spellings 'H1 ' / 'H3 ' (one trailing space), sending every other
    variant to the 0.5 fallback. Non-string values (None, NaN, numbers) still
    fall through to 0.5.
    """
    label = Horizon.strip() if isinstance(Horizon, str) else Horizon
    if label == 'H1':
        return 0
    elif label == 'H3':
        return 1
    else:
        return 0.5

# Apply the function to each item in the Horizon column to create a new "Horizon Value" column
df2['Horizon'] = df2['Horizon'].apply(get_horizon_value)
df2.head()

In [None]:
# Map a value of the Framework column to a numeric score.
# The function is named get_framework_value so that it no longer redefines
# (and silently replaces) the Horizon mapper `get_horizon_value` declared in an
# earlier cell; re-running that earlier cell's apply() would otherwise use
# this Framework mapping.
def get_framework_value(Framework):
    """Return 0 for 'Waterfall', 1 for 'Agile', and 0.5 for anything else."""
    if Framework == 'Waterfall':
        return 0
    elif Framework == 'Agile':
        return 1
    else:
        return 0.5

# Apply the mapping to every element of the Framework column (the column is overwritten)
df2['Framework'] = df2['Framework'].apply(get_framework_value)

In [None]:
# creation of the “Project Size” column
df2['Project Size'] = df2[['Budget Total','Budget Prior','Budget Total+CAPEX','Total Costs + CAPEX', 'Costs Total']].max(axis=1)

# affichage du résultat
df2.describe()

In [None]:
df2.loc[df2["status"]=="Cancelled"].sort_values("Project Size")

In [None]:
df2.loc[df2["Project Size"]>500000]["Project Size"].hist(bins=100, figsize=(30, 5))

In [None]:
df2.columns

In [None]:
df2.info()

In [None]:
# Turn 'Program ID' and 'Business Case Id' into presence flags:
# 1 when the cell holds a non-blank value, otherwise 0.
def _has_value(cell):
    """Return 1 if `cell` is neither missing (NaN/None) nor a blank string, else 0.

    The original lambda called `.strip()` directly on the cell, which raises
    AttributeError for NaN (a float) and for numeric identifiers.
    """
    return int(bool(pd.notna(cell)) and str(cell).strip() != '')

df2['Program ID'] = df2['Program ID'].apply(_has_value)
df2['Business Case Id'] = df2['Business Case Id'].apply(_has_value)

In [None]:
cols_to_use = ['project_id','portfolio_level_1','Pportfolio_level_2','Budget Prior', 'project_type', 'nature','irr','Framework', 'status',
'Horizon', 'Business Case Id','Project Size','Project Duration']
df3 = df2[cols_to_use].copy()
df3.shape

In [None]:
df3['Project Size'].value_counts()

In [None]:
df3.shape

In [None]:
# Load the cancelled projects extracted from df_snap (rolling 12-month dataset).
# Public placeholder – change locally. Replaces a hard-coded personal Windows
# path, following the same convention as DATA_PATH in the load cell.
CANCELLED_ROWS_PATH = "data/cancelled_rows.csv"   # <- replace locally
cancelled_rows = pd.read_csv(CANCELLED_ROWS_PATH)

In [None]:
# Append the cancelled projects to df3, keeping the first occurrence of each ID.
# The index is rebuilt afterwards: pd.concat keeps each frame's own labels, so
# rows from df3 and cancelled_rows could share a label. Later cells address
# rows by label (df5.drop(df_filtered.index), df.at[row.name, col]), and a
# shared label would make them hit rows from both frames.
df5 = (
    pd.concat([df3, cancelled_rows])
    .drop_duplicates(subset='ID')
    .reset_index(drop=True)
)

# Check the count of 'Project Size' values after adding
print(df5['Project Size'].value_counts(dropna=False))

In [None]:
# Define the transformation function
def strip_spaces(value):
    """Return `value` without leading/trailing whitespace.

    Non-string values (NaN, None, numbers) are returned unchanged; the original
    called `.strip()` unconditionally and raised AttributeError on them.
    """
    if isinstance(value, str):
        return value.strip()
    return value

# Apply the function to the "Program Holder" column
df5['Program Holder'] = df5['Program Holder'].apply(strip_spaces)

In [None]:
df5.shape

In [None]:
# Select projects with a value of 0 in "Project Size" and a status "Completed"
df_filtered = df5.loc[(df5['Project Size'] == 0) & (df5['status'] == 'Completed')]

# Delete selected projects
df5 = df5.drop(df_filtered.index)

# Afficher le DataFrame mis à jour
df5.shape


In [None]:
# purpose : Project Size defines the project budget. We aimed to select the most relevant information from EOLE85. We discovered numerous 0 values in this column, so we transformed them into NaN values. 
# Additionally, we excluded some completed projects for which budget information was unavailable.
# Liste des colonnes à remplacer
colonnes_a_remplacer = ['Project Size']

# Replace all 0 values ​​in column ['Project Size'] to be replaced with NaN
for colonne in colonnes_a_remplacer:
    df5[colonne] = df5[colonne].replace(0, np.nan)

In [None]:
mapping2 = {'EM': 0, 'IA':1}
mapping4 = {'Project': 0, 'Agile': 1}
mapping5 = {'Completed': 1, 'Cancelled': 0}
mapping7 = { 'yes':1 ,'no':0 }

colonnes_mapping = {'portfolio_level_1': mapping2, 'nature': mapping4, 'status': mapping5 ,'Program Holder' :mapping7}

for colonne, mapping in colonnes_mapping.items():
    df5[colonne] = df5[colonne].replace(mapping)

df5.head()

In [None]:
df5.columns

In [None]:
df5 = df5.drop('irr',axis=1)

In [None]:
df5['Project Size'].value_counts(dropna=False)

In [None]:
# Copy "Project Duration" column from df2 using ID as key
df5['Project Duration'] = df5['ID'].map(df2.set_index('ID')['Project Duration'])

# Afficher le DataFrame résultant
df5.head()

In [None]:
df5.columns

In [None]:
for status, statusdf in df5.groupby("status"):
    print(status)
    statusdf["Project Duration"].hist(bins=100, figsize=(30, 5))
    plt.show()

In [None]:
#df5["Project Size sqrt"] = df5["Project Size"].apply(np.sqrt)
#df5["Project Duration sqrt"] = df5["Project Duration"].apply(np.sqrt)
#col=['Project Size','Project Duration']
#df5 = df5.drop(col,axis =1 )

### Data Imputation 

In [None]:
# Per-column weights read by impute_missing_values (through weighted_distance)
# when comparing two rows: each key is a column name, each value is that
# column's contribution to the comparison.
weights = {'portfolio_level_1': 0.5 ,'portfolio_level_2': 1 , 'project_type': 1, 'nature': 0.5,
           'Framework': 0.5, 'status': 0.5, 'Program Holder': 0.5, 'Program ID': 0.5,
           'Horizon': 1, 'Business Case Id': 0.5 ,'Project Size' :1 }


def weighted_distance(row1, row2, weights):
    """Return the normalized weighted mismatch between two rows.

    For every column in `weights`, the column's weight is added when the two
    rows hold DIFFERENT values; the total is divided by the sum of all weights.
    Identical rows therefore give 0 and rows that differ on every column give
    1 (assuming non-negative weights).

    The original added the weight when the values were EQUAL, i.e. it returned
    a similarity. Its caller turns the result into a proximity with
    1 / (distance + 0.01), so the most similar rows received the smallest
    influence during imputation — the opposite of the intent.

    Parameters
    ----------
    row1, row2 : mappings indexable by column name (e.g. pandas Series rows).
    weights : dict mapping column name -> weight.

    Returns
    -------
    float in [0, 1]; 0 when `weights` is empty or sums to zero.
    """
    weight_sum = sum(weights.values())
    if weight_sum == 0:
        # Nothing to compare on: treat the rows as indistinguishable.
        return 0

    # NaN != NaN is True, so a value missing on both sides counts as a mismatch.
    mismatch = sum(weight for col, weight in weights.items() if row1[col] != row2[col])
    return mismatch / weight_sum

def impute_missing_values(row, col, df):
    """Return the value of `row[col]`, imputing it when it is missing.

    A missing value is replaced by a proximity-weighted average of the
    non-missing values of `col` over all rows of `df`, where
    proximity = 1 / (weighted_distance(row, other_row, weights) + 0.01)
    and `weights` is the module-level weights dict.

    The function is meant to be used through
    `df[col] = df.apply(impute_missing_values, args=(col, df), axis=1)` and
    therefore only RETURNS the value. The original also wrote it back with
    `df.at[row.name, col] = ...` while `apply` was still iterating; pandas does
    not support mutating the frame from inside `apply`, and the write made each
    imputed value a donor for the rows processed after it.

    Parameters
    ----------
    row : the row being processed (a pandas Series when called via apply).
    col : name of the column to impute.
    df : DataFrame providing the donor rows.

    Returns
    -------
    The original value when present, the interpolated value when it could be
    computed, otherwise the (still missing) original value.
    """
    # Values that are already present are returned untouched.
    if not pd.isnull(row[col]):
        return row[col]

    num = 0  # proximity-weighted sum of the donor values
    den = 0  # sum of the proximities
    for _, other_row in df.iterrows():
        # Only rows that actually have a value for `col` can act as donors.
        if pd.isnull(other_row[col]):
            continue
        proximity = 1 / (weighted_distance(row, other_row, weights) + 0.01)
        num += proximity * other_row[col]
        den += proximity

    # den == 0 means no row had a value for `col`: nothing to interpolate from.
    if den != 0:
        return num / den
    return row[col]

# Imputation pour les colonnes numériques
for col in ['Project Duration']:
    df5[col] = df5.apply(impute_missing_values, args=(col, df5), axis=1)



In [None]:
df5.info()

In [None]:
import collections
from collections import Counter

# Class balance of df5: occurrences and percentage of each 'status' value.
counter = Counter(df5['status'])
for status_value, occurrences in counter.items():
    share = occurrences / len(df5['status']) * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (status_value, occurrences, share))

In [None]:
df5.to_csv('DF5_DF2.csv', index=False)

### Add Cost Variation Data 

In [None]:
# Load the CSV file containing information about cost variations.
# Public placeholder – change locally. Replaces a hard-coded personal Windows
# path, following the same convention as DATA_PATH in the load cell.
COST_VARIATION_COMMON_PATH = "data/df_cost_variation_common.csv"   # <- replace locally
df_cost_variation_common = pd.read_csv(COST_VARIATION_COMMON_PATH)

In [None]:
  # Merge df5 with DF_variation using the ID column as the merge key
df_commun = df5.merge(df_cost_variation_common, on='ID', how='left')

# Affichage du jeu de données df5 avec les colonnes de variation ajoutées
df_commun.head()

In [None]:
df_commun.shape

In [None]:
df_commun.columns

#### A quick check for missing values in the dataset

In [None]:
# Select the columns that start with "Expenses", "Internal" and "External"
columns_to_check = [col for col in df_commun.columns if col.startswith(("Expenses", "Internal", "External"))]

# Count the number of projects with missing values ​​in all specified columns
count_missing_all = df_commun[columns_to_check].isnull().all(axis=1).sum()

print("Number of projects with missing values ​​in all specified columns:", count_missing_all)

In [None]:
# filter projects with status 0 among projects with missing values ​​in all specified columns
count_status_zero = df_commun[df_commun['status'] == 0][columns_to_check].isnull().all(axis=1).sum()

print("Number of projects with missing values ​​in all columns and status equal to 0:", count_status_zero)

In [None]:
# SSelect the columns that start with "Expenses", "Internal" and "External"
columns_to_check = [col for col in df_commun.columns if col.startswith(("Expenses", "Internal", "External"))]
# Filter projects with status 0 among projects with missing values ​​in all specified columns:
projects_status_zero = df_commun.loc[(df_commun['status'] == 0) & df_commun[columns_to_check].isnull().all(axis=1)]

# Show selected projects
print("Projects with missing values ​​in all columns and status equal to 0:")
print(projects_status_zero)

# save the selected projects in another DataFrame (df_status_zero)
df_status_zero = projects_status_zero.copy()

In [None]:
df_status_zero.head()

In [None]:
df_status_zero.to_csv('df_status_zero.csv', index=False)

In [None]:
# Load the cost-variation data for the cancelled projects.
# Public placeholder – change locally. Replaces a hard-coded personal Windows
# path, following the same convention as DATA_PATH in the load cell.
COST_VARIATION_CANCELLED_PATH = "data/df_cost_variation_Cancelled.csv"   # <- replace locally
df_cost_variation_Cancelled = pd.read_csv(COST_VARIATION_CANCELLED_PATH)
df_cost_variation_Cancelled.shape

In [None]:
# DataFrame.update aligns rows on index labels, not on a key column. df_commun
# (result of a merge) and df_cost_variation_Cancelled (fresh read_csv) both
# carry unrelated positional indexes, so updating them directly would
# overwrite the first rows of df_commun regardless of which project they
# describe. Align both frames on 'ID' for the update instead.
# NOTE(review): assumes df_cost_variation_Cancelled has an 'ID' column like
# df_cost_variation_common — confirm against the CSV.
column_order = list(df_commun.columns)
df_commun = df_commun.set_index('ID')
df_commun.update(df_cost_variation_Cancelled.drop_duplicates(subset='ID').set_index('ID'))
# Restore 'ID' as a regular column, in its original position.
df_commun = df_commun.reset_index()[column_order]
df_commun.shape

In [None]:
df_commun.info()

In [None]:
df_commun= df_commun.fillna(0)

In [None]:
import collections
from collections import Counter

# Class balance of df_commun: occurrences and percentage of each 'status' value.
counter = Counter(df_commun['status'])
for status_value, occurrences in counter.items():
    share = occurrences / len(df_commun['status']) * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (status_value, occurrences, share))

In [None]:
df_commun.columns

In [None]:
df_avecID = df_commun.copy()

In [None]:
pd.set_option('display.max_columns', None)
df_status_0 = df_commun.loc[df_commun['status'] == 0]

In [None]:
col = ['ID', 'portfolio_level_2']
df_commun = df_commun.drop(col , axis = 1)

In [None]:
# The project-type column is named 'project_type' throughout this notebook;
# `df_commun.Type` referred to a column that is never created.
df_commun['project_type'].unique()

In [None]:
# One-hot encode 'project_type', append the indicator columns, then drop the
# encoded source column. The original dropped 'Type', which is not the column
# that was just encoded.
dummies_1 = pd.get_dummies(df_commun['project_type'])
df_commun = pd.concat([df_commun,dummies_1],axis=1)
df_commun = df_commun.drop('project_type',axis=1)

In [None]:
df_commun.shape

In [None]:
df_commun.columns

In [None]:
df_commun.dtypes

In [None]:
#Graphe rprésentant la corrélation entre les featutres deux à deux
import matplotlib.pyplot as plt
import seaborn as sns
#data_f=df2.loc[1:,'status':'OPTIMIZE']
corrdata=df_commun[['portfolio_level_1', 'nature', 'Framework', 'status',
       'Program Holder', 'Program ID', 'Horizon', 'Business Case Id',
       'Project Size', 'Project Duration', 'Expenses_amount_mean',
       'Expenses_amount_max', 'Expenses_amount_min', 'Expenses_amount_std',
       'Expenses_slope_mean', 'Expenses_slope_max', 'Expenses_slope_min',
       'Expenses_slope_std', 'External Labor_amount_mean',
       'External Labor_amount_max', 'External Labor_amount_min',
       'External Labor_amount_std', 'External Labor_slope_mean',
       'External Labor_slope_max', 'External Labor_slope_min',
       'External Labor_slope_std', 'Internal Labor_amount_mean',
       'Internal Labor_amount_max', 'Internal Labor_amount_min',
       'Internal Labor_amount_std', 'Internal Labor_slope_mean',
       'Internal Labor_slope_max', 'Internal Labor_slope_min',
       'Internal Labor_slope_std', 'Type_A', 'Type_B', 
       'Type_C', 'Type_D', 'Type_E']]
corr=corrdata.corr()
plt.figure(figsize=(30,20))
#sns.set_context("poster")
sns.heatmap(corr.abs(),annot=True, annot_kws={"size":10})

#### Preprocessing

In [None]:

# Filter the dataframe down to the projects with Project Size == 130265.0 and Project Duration == 1491
filtered_projects = df_avecID.loc[(df_avecID['Project Size'] == 130265.0) & (df_avecID['Project Duration'] == 1491)]

# Display the filtered projects
filtered_projects

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df_commun), columns=df_commun.columns)

# Afficher le DataFrame normalisé
df_normalized

In [None]:
#Define X, y 
#define x, y and x_test
y = df_commun.status
X = df_commun.drop('status', axis = 1)
X.shape, y.shape

In [None]:
df_commun.loc[df_commun['status'] == 0]

In [None]:
import collections
from collections import Counter

# Class balance of the modelling target: occurrences and percentage of each status value.
counter = Counter(df_commun.status)
for status_value, occurrences in counter.items():
    share = occurrences / len(df_commun.status) * 100
    print('Class=%s, Count=%d, Percentage=%.3f%%' % (status_value, occurrences, share))

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train,  y_test = train_test_split(X, y, random_state=1, test_size=0.20, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
y_test.value_counts()

#### Modeling

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (2-D numpy array, e.g. from
                   sklearn.metrics.confusion_matrix).
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a percentage of the grand total. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
    title:         Title for the heatmap. Default is None.

    Notes
    -----
    Ratios whose denominator is zero (empty matrix, no predicted positives, no
    actual positives) are reported as 0 instead of NaN.
    '''

    total = np.sum(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for _ in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        # Share of the grand total held by each cell; all zeros for an empty matrix.
        fractions = cf.flatten()/total if total else np.zeros(cf.size)
        group_percentages = ["{0:.2%}".format(value) for value in fractions]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = _safe_ratio(np.trace(cf), float(total))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices (class at index 1 is the positive class)
            precision = _safe_ratio(cf[1,1], sum(cf[:,1]))
            recall    = _safe_ratio(cf[1,1], sum(cf[1,:]))
            f1_score  = _safe_ratio(2*precision*recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        # The summary statistics are appended to the x-axis label so they render below the plot.
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters. random_state is pinned so that
# re-running the notebook trains the same model and reports the same metrics
# (SMOTE below was already seeded; the forest was not).
rfc = RandomForestClassifier(random_state=0)
# Instantiate SMOTE
smote = SMOTE(random_state=0,sampling_strategy=0.75)

# Oversample the training data only
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the model on the resampled data
rfc.fit(X_resampled, y_resampled)

# Predict on the untouched test data
y_pred = rfc.predict(X_test)

In [None]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
# Let's check the report of our default model
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import precision_score, f1_score, confusion_matrix
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test  , y_pred)
print('precision: ', precision)
print('f1_score: ',f1)
confusion_matrix(y_test  , y_pred)

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
make_confusion_matrix(cf_matrix, categories=["Cancelled", "Completed"])

In [None]:
print(Counter(y_pred))

In [None]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Accuracy train: ',rfc.score(X_train,y_train))

In [None]:
df_x=pd.DataFrame({'Actual': y_test, 'Predicted':y_pred})
df_x

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : sequence of importance scores (e.g. `model.feature_importances_`).
    names : sequence of feature names, in the same order as `importance`.
    model_type : label prepended to the chart title.
    """
    # Pair each feature name with its importance and order by decreasing importance.
    # sort_values returns a new frame; no in-place mutation is needed.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    #Add chart labels (the separating space was missing: 'RANDOM FORESTFEATURE IMPORTANCE')
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
plot_feature_importance(rfc.feature_importances_,X_train.columns,'RANDOM FOREST')

#### XGBoost without SMOTE

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import plot_importance
# scale_pos_weight multiplies the weight of the POSITIVE class (label 1); the
# value recommended by XGBoost is sum(negative instances) / sum(positive instances).
# The original used positives / negatives, which pushes the weighting in the
# wrong direction instead of compensating for the class imbalance.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
pos_weight = n_negative / n_positive


XGBmodel = XGBClassifier(scale_pos_weight =pos_weight)
XGBmodel.fit(X_train, y_train)
xgbprediction = XGBmodel.predict(X_test)
# accuracy_score expects (y_true, y_pred)
print('Accuracy of xgboost:', accuracy_score(y_test, xgbprediction))
plot_importance(XGBmodel)
plt.show()

In [None]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report
# Let's check the report of our default model
print(classification_report(y_test, xgbprediction))

In [None]:
cf_matrix = confusion_matrix(y_test, xgbprediction)
make_confusion_matrix(cf_matrix, categories=["Cancelled", "Completed"])

#### XGBoost with SMOTE

In [None]:
from xgboost import XGBClassifier

XGBmode_SMOTE = XGBClassifier()
# Instancier la classe SMOTE
smote = SMOTE(random_state=0, sampling_strategy=0.74)

# Appliquer la méthode fit_resample pour équilibrer les données d'entraînement
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Entraîner le modèle sur les données équilibrées
XGBmode_SMOTE.fit(X_resampled, y_resampled)

xgbprediction2 = XGBmode_SMOTE.predict(X_test)


In [None]:
from sklearn.metrics import precision_score, f1_score, confusion_matrix
precision = precision_score(y_test, xgbprediction2)
f1 = f1_score(y_test  , xgbprediction2)
print('precision: ', precision)
print('f1_score: ',f1)
confusion_matrix(y_test  , xgbprediction2)

In [None]:
cf_matrix = confusion_matrix(y_test, xgbprediction2)
make_confusion_matrix(cf_matrix, categories=["Cancelled", "Completed"])

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adaboostclassifier = AdaBoostClassifier()
# Instancier la classe SMOTE
smote = SMOTE(random_state=0, sampling_strategy=0.8)

# Appliquer la méthode fit_resample pour équilibrer les données d'entraînement
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


# Entraîner le modèle sur les données équilibrées
adaboostclassifier.fit(X_resampled, y_resampled)

adaboostprediction = adaboostclassifier.predict(X_test)
print('Accuracy of Ada Boost:', accuracy_score(adaboostprediction,y_test))

In [None]:
from sklearn.metrics import precision_score, f1_score, confusion_matrix
precision = precision_score(y_test, adaboostprediction)
f1 = f1_score(y_test  , adaboostprediction)
print('precision: ', precision)
print('f1_score: ',f1)
confusion_matrix(y_test  , adaboostprediction)

In [None]:
import pickle
# Persist the SMOTE-trained XGBoost model. The context manager closes (and
# flushes) the file; the original left the handle from open() dangling.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(XGBmode_SMOTE, model_file)

In [None]:
# Reload the saved model; the context manager closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("pima.pickle.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
probability = XGBmode_SMOTE.predict_proba(X_test)[:,1]
probability

In [None]:
#data=[[1,38,1,0,1,1]] #=1
#data1=[[1,38,1,1,1,1]] #= 0
# Collect one numeric value per model feature from the keyboard.
input_data= []
# The original `[.........]` placeholder is not valid Python. Use the feature
# columns the models were trained on, in training order.
features = list(X.columns)

for i in features:
  print(f'Donner  {i} :')
  a=input()
  input_data.append(float(a))
print(input_data)