In [1]:
## Import des librairies et modules

# Calculs & dataframes
import pandas as pd
import numpy as np
from math import pi

# ACP et K-means
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler # pour centrer-réduire les données
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Régression logistique
# NOTE(review): `plot_roc_curve` was removed in scikit-learn 1.2 (use
# `RocCurveDisplay.from_estimator` instead); this import fails on recent versions.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, plot_roc_curve
from sklearn.model_selection import train_test_split
from scipy.stats import norm

# Statistical tests (normality)
import pingouin as pg

# Librairies graphiques
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Load the banknote measurements; the first row of the CSV holds the column names.
df = pd.read_csv('datas/notes.csv' , header=0, sep=",", decimal=".")

# A bare expression is only rendered when it is the LAST statement of a cell,
# so every intermediate summary is printed explicitly to make sure it shows up.
print("Shape:", df.shape)
print(df.describe(include = 'all'))
df.info()  # prints its own report (dtypes, non-null counts, memory usage)
print("Missing values per column:")
print(df.isnull().sum())
print("Duplicated rows:", df.duplicated().sum())

In [None]:
# Per-class count of non-null entries for every column.
data = df.groupby("is_genuine").count()

# Pie chart of the class balance, drawn from the 'diagonal' counts.
# The labels are positional: they follow the order of the groupby index.
data.plot.pie(
    y="diagonal",
    labels=["Faux billets", "Vrais billets"],
    explode=[0, 0.1],
    autopct='%1.1f%%',
    pctdistance=0.3,
    labeldistance=0.5,
    figsize=(8, 8),
)

plt.title(
    'Répartition des vrais et faux billets du jeu de données ',
    fontsize=22,
    loc='center',
)

# Save before show(): show() may leave no current figure to write out.
plt.savefig('graphiques/01. DESCRIBE_repartition_vrai_faux.png')
plt.show()



In [None]:
# Univariate analysis: the empirical distribution (histogram + KDE overlay)
# is the first step in describing each quantitative measurement.

for column in ['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']:
    plt.figure(figsize = (8,5))
    sns.histplot(x=column, data=df, kde=True, color='#2cb2ff')
    # Set the axis label BEFORE saving so the exported image is guaranteed to carry it.
    plt.xlabel(column)
    plt.savefig("graphiques/02. Histogramme " + str(column) +".jpg", dpi=500, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Univariate normality test (Shapiro-Wilk, 5% significance level) on the
# variables of df. The 'is_genuine' row is dropped from the result table:
# it is the class label, not a measurement.
# `pg` (pingouin) is imported in the notebook's import cell.
pg.normality(df, method='shapiro', alpha=0.05).drop('is_genuine')

In [None]:
# Bivariate analysis: one horizontal boxplot per measurement, split by the
# value of 'is_genuine', each exported as its own JPEG.

measurement_columns = ['diagonal', 'height_left', 'height_right', 'margin_low', 'margin_up', 'length']

for column in measurement_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(
        data=df,
        x=column,
        y='is_genuine',
        orient='h',
        palette=('#4cb2ff', '#61ba86'),
        ax=ax,
    )
    fig.savefig(
        "graphiques/03. Boxplot " + str(column) +".jpg",
        dpi=500,
        bbox_inches='tight',
        pad_inches=0.5,
    )

In [None]:
# Lower-triangle scatter matrix of all variable pairs, coloured by 'is_genuine'
# (one marker shape per class).
g = sns.pairplot(
    data=df,
    hue='is_genuine',
    markers=['o', 's'],
    corner=True,
)

# Overlay two-level KDE contours on the off-diagonal (lower) panels.
g.map_lower(sns.kdeplot, color='.2', levels=2)

plt.savefig(
    "graphiques/04. Pairplot.jpg",
    dpi=500,
    bbox_inches='tight',
    pad_inches=0.5,
)
plt.show()

In [None]:
# Convert the boolean target to 0/1 integers (presumably so that it takes part
# in the correlation matrix below).
# NOTE: this overwrites the column in place; the cast is idempotent, so
# re-running the cell is safe, but cells run afterwards see an int column.
df['is_genuine'] = df['is_genuine'].astype(int)

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = df.corr()

# Boolean mask hiding the upper triangle, diagonal included, so only the
# lower triangle of the (symmetric) matrix is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(15,5))
sns.heatmap(corr_matrix, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
plt.xticks(rotation=25, ha='right')
plt.title('Triangle de Corrélation',  fontsize=18, pad=20)
# Save before show(): show() may leave no current figure to write out.
plt.savefig("graphiques/05. Triangle de corrélation.jpg", dpi=500, bbox_inches='tight', pad_inches=0.5)
plt.show()