In [92]:
cacf = False

In [93]:
import pyodbc
import pandas as pd
import os
from dotenv import load_dotenv
import datetime
import locale
from fonctions import *
import plotly.express as px

# Définir la langue française
locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")

if cacf:
    # Load des var env
    load_dotenv()

    server = os.getenv('SERVER')
    database = os.getenv('DATABASE')
    username = os.getenv('USERNAME')
    password = os.getenv('PASSWORD')
    cnxn = pyodbc.connect('DRIVER={ODBC Driver 18 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
    cursor = cnxn.cursor()

    query = "SELECT * FROM dbo.Telephonie;"
    df_tel = pd.read_sql(query, cnxn)

    query = "SELECT * FROM dbo.Eptica;"
    df_ept = pd.read_sql(query, cnxn)
else:
    df_tel = pd.read_csv('tel.csv')
    df_tel = df_tel.drop("Unnamed: 0", axis=1)

In [94]:
df_tel.shape

(23783, 5)

In [95]:
df_tel.columns

Index(['id', 'date_appel', 'entite', 'famille', 'nombre_entrants_corrige'], dtype='object')

In [96]:
df_tel['date_appel'] = pd.to_datetime(df_tel['date_appel']) # On encode la column "date_appel" pour l'exploiter en timeseries
df_tel['nombre_entrants_corrige'] = df_tel['nombre_entrants_corrige'].apply(lambda x: x if x > 1 else 1).astype(int) # On transforme les float en Int car un demi appel n'existe pas
df_tel.head()

Unnamed: 0,id,date_appel,entite,famille,nombre_entrants_corrige
0,12,2019-01-02,Entite 1,F2,165
1,13,2019-01-02,Entite 1,F3,54
2,14,2019-01-02,Entite 1,F4,1
3,15,2019-01-02,Entite 1,F5,99
4,16,2019-01-02,Entite 1,F6,1487


In [97]:
# Nombre d'entités (5)
df_tel['entite'].unique()

array(['Entite 1', 'Entite 2', 'Entite 3', 'Entite 4', 'Entite 5'],
      dtype=object)

In [98]:
# Nombre de famille (7)
df_tel['famille'].unique()

array(['F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F1'], dtype=object)

In [99]:
# On définit nos dates comme index
df_tel = df_tel.set_index(df_tel['date_appel'].rename("date")).drop("date_appel", axis=1)

# Renome la columns appels reçus
df_tel.rename(columns={"nombre_entrants_corrige":"nb_recus"}, inplace=True)

In [100]:
# Ajout des colonnes Jour en Chiffres, Mois, année
df_tel = df_tel.reset_index()
df_tel['jour_int']= df_tel['date'].apply(lambda x: int(x.day))
df_tel['mois']= df_tel['date'].apply(lambda x: str(x.strftime("%B")))
df_tel['annee']= df_tel['date'].apply(lambda x: int(x.year))

In [101]:
# Ajout d'une column jour
df_tel = df_tel.reset_index()
df_tel['jour'] = df_tel['date'].apply(lambda x: str(datetime.datetime(int(x.year), int(x.month), int(x.day)).strftime("%A")))
df_tel = df_tel.set_index('date')

In [102]:
# 2 dernières année
two_last_year = True
if two_last_year:
    df_tel = df_tel.reset_index()
    df_tel = df_tel.loc[(df_tel['date'] > "2020-12-31") & (df_tel['date'] < "2023-01-01")]
    df_tel = df_tel.set_index("date")

In [103]:
df = df_tel.groupby(pd.Grouper(freq = 'Y')).agg(nb_recus_avg = ('nb_recus' , 'mean')).reset_index()
fig = px.line(df, x="date", y="nb_recus_avg")
fig.show()

In [119]:
#
# Création de graphique dynamique de toutes les Entités avec toutes les Familles
#

activer_samedi = True

for entite in df_tel.entite.unique():
    if activer_samedi:
      df = df_tel.loc[(df_tel.entite == entite)].groupby([pd.Grouper(freq = 'd'), "famille", "jour"]).agg(nb_recus = ('nb_recus' , 'sum')).reset_index()
    else:
       df = df_tel.loc[(df_tel.entite == entite) & (df_tel.jour != "Samedi")].groupby([pd.Grouper(freq = 'd'), "famille", "jour"]).agg(nb_recus = ('nb_recus' , 'sum')).reset_index()
    
    fig = px.line(df, x="date", y="nb_recus", color='famille',
              hover_data={
                "date": "|%B %d, %Y",
                "jour" : True,
              },
              title=f"Nombre d'appel pour {entite}")

    fig.update_xaxes(
        dtick="M1",
        tickformat="%b\n%Y",
        ticklabelmode="period")
    fig.show()

In [105]:
import plotly.express as px

fig = px.pie(df_tel, values='nb_recus', names='entite', title="Nombre d'appel reçus par Entité")
fig.show()

In [106]:

fig = px.bar(df_tel.groupby([pd.Grouper(freq = 'm'), 'entite']).agg(nb_recus = ("nb_recus" , "sum")).reset_index(), x='date', y='nb_recus', color="entite")
fig.show()

In [107]:
fig = px.bar(df_tel.groupby(["entite", "jour", "famille"]).agg(nb_recus = ('nb_recus' , 'sum')).reset_index(), x="jour", y="nb_recus", color="entite", title="Nombre d'appels recus par entite (jour)",
            hover_data={
                "famille" : True,
                "entite" : True,
            })
fig.show()

In [108]:
fig = px.bar(df_tel.groupby(["entite", "mois", "famille"]).agg(nb_recus = ('nb_recus' , 'sum')).reset_index(), x="mois", y="nb_recus", color="entite", title="Nombre d'appels recus par entite (mois)",
            hover_data={
                "famille" : True,
                "entite" : True,
            })
fig.show()

fig = px.line(df_tel.groupby(["entite", "mois"]).agg(nb_recus = ('nb_recus' , 'mean')).reset_index(), x="mois", y="nb_recus", color="entite", title="Moyenne d'appel reçus par les entités par mois",
            hover_data={
                "entite" : True,
            })
fig.show()

In [109]:

fig = px.bar(df_tel.groupby([pd.Grouper(freq="w"), "entite"]).agg(nb_recus = ("nb_recus", "sum")).reset_index(), x="date", y="nb_recus", color="entite", title="Nb Appel par semaine par entité")
fig.show()

In [110]:
df_tel = df_tel.reset_index()
df_tel['vacances'] = df_tel['date'].astype(str).apply(lambda x: is_vacances(x)).astype(int)
df_tel = df_tel.set_index("date")

In [111]:
df_tel.describe() # On cherche le top 25%

Unnamed: 0,index,id,nb_recus,jour_int,annee,vacances
count,11304.0,11304.0,11304.0,11304.0,11304.0,11304.0
mean,17375.5,17387.5,582.277778,15.817144,2021.502919,0.324575
std,3263.328056,3263.328056,705.605331,8.773442,0.500014,0.468237
min,11724.0,11736.0,1.0,1.0,2021.0,0.0
25%,14549.75,14561.75,107.0,8.0,2021.0,0.0
50%,17375.5,17387.5,401.0,16.0,2022.0,0.0
75%,20201.25,20213.25,775.0,23.0,2022.0,1.0
max,23027.0,23039.0,4667.0,31.0,2022.0,1.0


In [112]:
df_tel.loc[df_tel['nb_recus'] > 775]['jour'].value_counts() # Réparatition du nombre de jour dans le top 25%

jour
Lundi       611
Mardi       605
Mercredi    561
Jeudi       510
Vendredi    496
Samedi       41
Name: count, dtype: int64

In [113]:
df_tel.loc[df_tel['nb_recus'] < 86]['jour'].value_counts() # Répartition des plus petits jours

jour
Vendredi    470
Mercredi    461
Mardi       455
Jeudi       454
Lundi       433
Samedi      396
Name: count, dtype: int64

In [114]:
df = df_tel.groupby([pd.Grouper(freq="d"), "jour", "entite"]).agg(nb_recus = ("nb_recus", "sum")).reset_index()

fig = px.line(df, x="date", y="nb_recus", color='entite',
          hover_data={
            "date": "|%B %d, %Y",
            "jour" : True,
          },
          title=f"Nombre d'appel par Entite")

fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y",
    ticklabelmode="period")
fig.show()

## Affichage des familles

In [115]:
activer_samedi = True

for famille in df_tel['famille'].unique():
    if activer_samedi:
      df = df_tel.loc[df_tel['famille'] == famille].groupby([pd.Grouper(freq="d"), "entite"]).agg(nb_recus =("nb_recus", "sum")).reset_index()
    else:
       df = df_tel.loc[(df_tel.famille == famille) & (df_tel.jour != "Samedi")].groupby([pd.Grouper(freq="d"), "entite"]).agg(nb_recus =("nb_recus", "sum")).reset_index()

    fig = px.line(df, x="date", y="nb_recus", color="entite",
          hover_data={
            "date": "|%B %d, %Y",
            "entite" : True,
          },
          title=f"Nombre d'appel par Entite pour la famille {famille}")
    fig.show()

# Dummy model (Moyenne)

In [116]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder, LabelEncoder
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

cat_var = ['famille','jour', 'mois', 'jour_int']
num_var = ['nb_recus', 'vacances']

## Transformers Pipeline
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

encoder = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_var),
        ('num', num_transformer, num_var)
    ],
)

In [117]:
# Pipeline
pipeline = Pipeline([
    ('enc', encoder),
    ('model', DummyRegressor(strategy="mean")),
])

In [118]:
# Jeu de données
df = df_tel.reset_index()
df = df[(df['date'] > '2019-12-31') & (df['date'] > '2023-01-01')].drop(["id", "date"], axis=1)

y = df['nb_recus']
X = df

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, train_size=0.70, random_state=42)

ValueError: With n_samples=0, test_size=None and train_size=0.7, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Entrainement
pipeline.fit(X_train, y_train)

In [None]:
pipeline.score(X_test, y_test)