# Nom du notebook

**Définition** : Ce que je veux faire

**Formule** : 

## On commence par le chargement des données
On décommente ce dont on a besoin.

In [None]:
import pandas as pd
import time
import json
import math
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from src.elasticsearch_connector import ElasticsearchConnector
#from typing import List, Dict


# Let pandas show up to 100 columns and 100 rows before truncating the output.
DISPLAY_LIMIT = 100
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, DISPLAY_LIMIT)

## Si on veut lire en requêtant Elasticsearch

In [None]:
# Connector used by the cells below to query Elasticsearch.
# NOTE(review): 'monolog' presumably selects the target environment/cluster —
# confirm in src.elasticsearch_connector.
es_connector = ElasticsearchConnector(env='monolog')

# Elasticsearch request body: documents whose URL starts with the contribution
# pages prefix AND whose "logfile" date falls in April 2022 (upper bound excluded).
CONTRIB_URL_PREFIX = "https://code.travail.gouv.fr/contribution/"
LOGFILE_FROM = "2022-04-01"    # inclusive lower bound ("gte")
LOGFILE_BEFORE = "2022-05-01"  # exclusive upper bound ("lt")

_url_prefix_filter = {"prefix": {"url": CONTRIB_URL_PREFIX}}
_logfile_range_filter = {"range": {"logfile": {"gte": LOGFILE_FROM, "lt": LOGFILE_BEFORE}}}

# Alternative time filter on the last action of the visit, disabled by default.
# To enable it, uncomment it and append it to the "must" list below:
# _last_action_range_filter = {
#     "range": {
#         "lastActionDateTime": {
#             "gte": "2022-04-01 00:00:00",
#             "lt": "2022-05-01 00:00:00",
#         }
#     }
# }

QUERY = {
    "query": {
        "bool": {
            "must": [_url_prefix_filter, _logfile_range_filter],
        }
    }
}

In [None]:
# Run QUERY through the connector; "logs-new" is presumably the index name — TODO confirm.
# NOTE(review): the return type of execute_query is not visible here; the cells
# below treat `logs_april_contrib` as a pandas DataFrame.
logs_april_contrib = es_connector.execute_query(QUERY, "logs-new")

## Ou bien, si on veut lire directement un dossier contenant les fichiers déjà téléchargés

In [None]:
from join_multiple_file import pd_read_pattern, PATH_CDTN_MONOLOG

# Read the local files matching the glob pattern below (presumably concatenated
# into a single DataFrame by pd_read_pattern — TODO confirm in join_multiple_file).
# NOTE(review): the folder name says "november" while the variable is named
# "april" — confirm which month is actually loaded.
logs_april = pd_read_pattern(PATH_CDTN_MONOLOG + "/data-all-logs-november/*")

In [None]:
# (number of rows, number of columns) of the loaded logs.
logs_april.shape

In [None]:
# Preview the first rows of the loaded logs.
logs_april.head()

In [None]:
def filter_df_startwith_url(df: pd.DataFrame, url: str) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` whose ``url`` column starts with ``url``.

    The prefix is compared literally: characters such as ``.`` or ``?`` in
    ``url`` are not interpreted as regular-expression metacharacters.

    Args:
        df: Frame containing a string ``url`` column.
        url: Literal prefix the URLs must start with.

    Returns:
        A new DataFrame (copy) restricted to the matching rows. Rows whose
        ``url`` is missing are excluded.
    """
    # A literal prefix test avoids the pitfalls of building a regex from `url`
    # (e.g. '.' matching any character, or '(' raising a regex error).
    starts_with_prefix = df['url'].str.startswith(url, na=False)
    return df[starts_with_prefix].copy()

In [None]:
def format_url_and_get_month_and_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the URLs and derive the ``month`` and ``datetime`` columns.

    The input frame is left untouched; the transformation is applied to a copy.

    Args:
        df: Frame with the columns ``url`` (string), ``logfile`` (values
            parseable as dates) and ``timestamp`` (POSIX timestamps accepted
            by ``datetime.fromtimestamp``).

    Returns:
        A new DataFrame where:
        - ``url`` has its fragment (after ``#``) and query string (after ``?``) removed;
        - ``month`` is the month number extracted from ``logfile``;
        - ``datetime`` is ``timestamp`` converted with ``datetime.fromtimestamp``
          (naive datetime in the machine's local timezone).
    """
    # Copy first so the caller's frame (possibly already displayed in an
    # earlier cell) is not silently modified.
    formatted = df.copy()
    # Drop the fragment first, then the query string.
    formatted['url'] = formatted['url'].str.split('#').str[0].str.split('?').str[0]
    formatted['month'] = pd.DatetimeIndex(formatted['logfile']).month
    formatted['datetime'] = formatted['timestamp'].apply(datetime.fromtimestamp)
    return formatted

In [None]:
def clean_dataset_from_useless_rows(df: pd.DataFrame):
    """Keep only the rows whose ``type`` is one of the event types of interest.

    No de-duplication is performed here (a ``drop_duplicates`` on
    ``idVisit``/``url``/``type`` was sketched in this function but is disabled).

    Args:
        df: Frame with a ``type`` column.

    Returns:
        The subset of ``df`` restricted to the kept event types.
    """
    kept_event_types = ['visit_content', 'outil', 'themes', 'select_result', 'search', 'home',
                        'pagecc_clickcontrib', 'modeles_courriers']
    is_kept = df['type'].isin(kept_event_types)
    return df[is_kept]

In [None]:
# Strip fragments/query strings from the URLs and add the 'month' and 'datetime' columns.
logs_april = format_url_and_get_month_and_datetime_columns(logs_april)

In [None]:
# Keep only the logs whose URL starts with the contribution pages prefix.
# NOTE: this reassigns `logs_april_contrib`, replacing the result of the
# Elasticsearch query cell if that cell was run.
logs_april_contrib = filter_df_startwith_url(logs_april, 'https://code.travail.gouv.fr/contribution/')

In [None]:
# Keep a single row per (visit, page, event type) combination.
dedup_keys = ['idVisit', 'url', 'type']
logs_april_contrib_without_duplicates = (
    logs_april_contrib
    .drop_duplicates(subset=dedup_keys)
    .copy()
)

In [None]:
# Number of rows left after de-duplication.
len(logs_april_contrib_without_duplicates)

# Début de l'exploration

Commentaire

### Préparation des données

### Première étape

In [None]:
# Keep the contribution pages whose path segment right after "/contribution/"
# starts with at least one ASCII letter.
# NOTE(review): presumably pages tied to a collective agreement (idcc) start
# with digits and are excluded by this filter — confirm.
# The dots of the host name are escaped so they only match a literal '.'.
is_contrib_without_idcc = logs_april_contrib_without_duplicates['url'].str.match(
    r'^https://code\.travail\.gouv\.fr/contribution/[a-zA-Z]+',
    na=False,
)
visits_on_contrib_without_idcc = logs_april_contrib_without_duplicates[is_contrib_without_idcc]

In [None]:
# Number of 'visit_content' events on the selected contribution pages.
# Summing a boolean mask yields 0 when no such event exists, whereas indexing
# value_counts() with a missing label raises a KeyError.
nb_total_visits_on_contrib_without_idcc = (
    visits_on_contrib_without_idcc['type'] == 'visit_content'
).sum()
nb_total_visits_on_contrib_without_idcc

### Deuxième étape

In [None]:
# Number of 'cc_select' events on the selected contribution pages.
# Summing a boolean mask yields 0 when no such event exists, whereas indexing
# value_counts() with a missing label raises a KeyError.
cc_select_on_contrib_without_idcc = (
    visits_on_contrib_without_idcc['type'] == 'cc_select'
).sum()
cc_select_on_contrib_without_idcc

### Présentation des résultats

In [None]:
# Share of 'cc_select' events relative to 'visit_content' events, as a
# percentage rounded to two decimals.
cc_select_rate_percent = round(
    cc_select_on_contrib_without_idcc / nb_total_visits_on_contrib_without_idcc * 100,
    2,
)
print(f"{cc_select_rate_percent} %")

### Présentation Graphique du résultat