# Taux de recherche fructueuse de convention collective sur les outils et sur les contribs

**Définition** : Part des personnes qui sélectionnent une convention collective par rapport au nombre de personnes qui en recherchent une (nombre de sessions où il y a au moins 1 CC search ?)
- [ ] Pour les outils
- [ ] Pour les contrib

**Objectif** : voir parmi ceux qui cherchent une CC combien en sélectionnent une

**Formules** : 
- Pour les outils = ??
- Pour les contrib = nb de sessions ayant au moins un cc_select / nb de sessions ayant au moins un cc_search

## On commence par le chargement des données

In [None]:
import pandas as pd
import time
import json
import math
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
import sys
from datetime import datetime
#from typing import List, Dict


# Let pandas display up to 100 columns and 100 rows before truncating the output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## On récupère les données via celles déjà téléchargées

In [None]:
# Project-local helpers: a reader that takes a path pattern, and the base path of the logs.
from join_multiple_file import pd_read_pattern, PATH_CDTN_MONOLOG

# Load the April, May and June logs from their per-month folders.
# NOTE(review): pd_read_pattern presumably reads every file matching the pattern
# into a single DataFrame — confirm in join_multiple_file.
logs_april = pd_read_pattern(PATH_CDTN_MONOLOG + "/data-all-logs-april/*")
logs_may = pd_read_pattern(PATH_CDTN_MONOLOG + "/data-all-logs-may/*")
logs_june = pd_read_pattern(PATH_CDTN_MONOLOG + "/data-all-logs-june/*")
# July is not read from a local folder: `logs_july` is assigned further down
# from load_file_if_exists_or_execute_query.
#logs_july = pd_read_pattern(PATH_CDTN_MONOLOG + "/data-all-logs-july/*")

In [None]:
from query_elastic_search import load_file_if_exists_or_execute_query

# Query body selecting the documents whose `logfile` value lies in
# [2022-07-01, 2022-08-01), i.e. July 2022.
# NOTE(review): presumably an Elasticsearch query, given the helper module name — confirm.
QUERY = {
    "query": {
        "bool": { 
          "must": [
            {
              "range": {
               "logfile": {
                  "gte": "2022-07-01",
                  "lt": "2022-08-01"
               }
              }
            }
          ]
        }
      }
}

In [None]:
# Load the July logs selected by QUERY.
# NOTE(review): the first argument is an empty string; judging by the helper's name
# it is probably a cache file path — confirm that "" is intended.
logs_july = load_file_if_exists_or_execute_query("", QUERY)

In [None]:
# Preview the first rows of the July logs.
logs_july.head()

In [None]:
# Stack the four monthly frames (April to July) into one DataFrame.
# The original row indexes are kept, so index labels may repeat across months.
logs = pd.concat([logs_april, logs_may, logs_june, logs_july])

In [None]:
# Normalise URLs by dropping the fragment (after '#') and the query string (after '?').
logs['url'] = logs['url'].str.split('#').str[0].str.split('?').str[0]
# Month number taken from the `logfile` date; used later to slice the KPIs month by month.
logs['month'] = pd.DatetimeIndex(logs['logfile']).month
# NOTE(review): fromtimestamp converts to the machine's local timezone — confirm this is intended.
logs['datetime'] = logs['timestamp'].apply(datetime.fromtimestamp)

# `logs_contrib` used to be referenced here before being defined (it is only
# created in the KPI 5 section), which raises a NameError on a fresh
# top-to-bottom run. Build it here with the same contribution-URL filter.
logs_contrib = logs[logs['url'].str.match('^https://code.travail.gouv.fr/contribution/', na=False)].copy()
logs_contrib_without_duplicates = logs_contrib.drop_duplicates(subset=['idVisit', 'url', 'type']).copy()

# Number of contribution events once duplicates per (visit, URL, type) are removed.
logs_contrib_without_duplicates.shape[0]

# KPI 4 : Taux de recherche fructueuse de convention collective sur les outils

Commentaire

### Préparation des données

In [None]:
# Keep only the events whose URL starts with the "outils" (tools) prefix;
# rows with a missing URL are excluded (na=False).
logs_tools = logs[logs['url'].str.match('^https://code.travail.gouv.fr/outils', na=False)].copy()

In [None]:
# URL prefixes (as anchored regexes) of the tools kept for the CC-search KPIs.
_cc_search_tool_url_patterns = [
    '^https://code.travail.gouv.fr/outils/preavis-demission',
    '^https://code.travail.gouv.fr/outils/indemnite-precarite',
    '^https://code.travail.gouv.fr/outils/preavis-licenciement',
    '^https://code.travail.gouv.fr/outils/heures-recherche-emploi',
    '^https://code.travail.gouv.fr/outils/preavis-retraite',
    '^https://code.travail.gouv.fr/outils/convention-collective',
]

# One boolean mask per pattern (missing URLs never match), OR-ed together.
_cc_search_tool_masks = [
    logs_tools['url'].str.match(_pattern, na=False)
    for _pattern in _cc_search_tool_url_patterns
]
_is_cc_search_tool = _cc_search_tool_masks[0]
for _mask in _cc_search_tool_masks[1:]:
    _is_cc_search_tool = _is_cc_search_tool | _mask

logs_tools_with_cc_search = logs_tools[_is_cc_search_tool].copy()

In [None]:
# Keep a single row per (visit, URL, event type, month), so that an event type
# repeated by the same visit on the same URL within a month counts once.
logs_tools_without_duplicates = logs_tools_with_cc_search.drop_duplicates(
    subset=['idVisit', 'url', 'type', 'month']
).copy()

In [None]:
# Select the June data (month == 6) as a first use case.
logs_tools_without_duplicates_june = logs_tools_without_duplicates[logs_tools_without_duplicates['month']==6]

In [None]:
# Number of deduplicated June rows for each event type.
logs_tools_type_value_counts_june = logs_tools_without_duplicates_june.type.value_counts()
logs_tools_type_value_counts_june

## 4.1 taux de recherches fructueuses de CC dans les outils
NB : plusieurs recherches / sélections de CC comptent comme une seule recherche si elles sont faites par un même visiteur
### Première étape

In [None]:
# Denominator: number of deduplicated 'cc_search' rows in June.
# Raises KeyError if no such row exists.
nb_total_cc_search_on_tools = logs_tools_type_value_counts_june['cc_search'] 
nb_total_cc_search_on_tools

### Deuxième étape

In [None]:
# Numerator: number of deduplicated 'cc_select_p1' rows in June.
# Raises KeyError if no such row exists.
nb_total_cc_select_p1_on_tools = logs_tools_type_value_counts_june['cc_select_p1'] 
nb_total_cc_select_p1_on_tools

### Présentation des résultats

In [None]:
# Selections over searches, as a percentage rounded to two decimals.
print(f"{round(( nb_total_cc_select_p1_on_tools) / nb_total_cc_search_on_tools * 100 , 2)} %")

## 4.2 taux de recherches fructueuses d'entreprise dans les outils par utilisateur 
NB : plusieurs recherches / sélections d'entreprise comptent comme une seule recherche si elles sont faites par un même visiteur
### Première étape

In [None]:
# Denominator: number of deduplicated 'enterprise_search' rows in June.
# Raises KeyError if no such row exists.
nb_total_enterprise_search_on_tools = logs_tools_type_value_counts_june['enterprise_search']
nb_total_enterprise_search_on_tools

### Deuxième étape

In [None]:
# Numerator: number of deduplicated 'enterprise_select' rows in June.
# Raises KeyError if no such row exists.
nb_total_enterprise_select_on_tools = logs_tools_type_value_counts_june['enterprise_select']
nb_total_enterprise_select_on_tools

### Présentation des résultats

In [None]:
# Enterprise selections over enterprise searches, as a percentage rounded to two decimals.
print(f"{round(( nb_total_enterprise_select_on_tools) / nb_total_enterprise_search_on_tools * 100 , 2)} %")

### 4.1 et 4.2 Généralisation pour chaque mois

In [None]:
def get_ratio_of_a_given_select_type_over_a_given_search_type_in_logs(logs, search_type: str, select_type: str):
    """Compare the number of ``select_type`` rows with the number of ``search_type`` rows.

    Args:
        logs: DataFrame with a ``type`` column; every row counts as one event,
            so any deduplication must be done by the caller.
        search_type: Value of ``type`` counted as the denominator.
        select_type: Value of ``type`` counted as the numerator.

    Returns:
        A tuple ``(nb_search, nb_select, ratio)`` where ``ratio`` is
        ``nb_select / nb_search`` as a percentage rounded to two decimals.
        When there is no ``search_type`` row, ``nb_search`` is 0 and the ratio
        is 0.0 (the count used to be reported as ``sys.maxsize``, which
        polluted the result tables).
    """
    type_counts = logs['type'].value_counts()

    # A type absent from the logs simply counts as zero occurrences.
    nb_total_search_type = int(type_counts.get(search_type, 0))
    nb_total_select_type = int(type_counts.get(select_type, 0))

    if nb_total_search_type == 0:
        # No search at all: avoid dividing by zero and keep the ratio at 0.
        return 0, nb_total_select_type, 0.0

    return nb_total_search_type, nb_total_select_type, round(nb_total_select_type / nb_total_search_type * 100, 2)


def get_ratio_of_successful_search_for_cc_in_tools(logs_tools):
    """Return (searches, selections, ratio) for 'cc_search' vs 'cc_select_p1' rows of ``logs_tools``."""
    search_type, select_type = 'cc_search', 'cc_select_p1'
    return get_ratio_of_a_given_select_type_over_a_given_search_type_in_logs(logs_tools, search_type, select_type)


def get_ratio_of_successful_search_for_enterprise_in_tools(logs_tools):
    """Return (searches, selections, ratio) for 'enterprise_search' vs 'enterprise_select' rows of ``logs_tools``."""
    search_type, select_type = 'enterprise_search', 'enterprise_select'
    return get_ratio_of_a_given_select_type_over_a_given_search_type_in_logs(logs_tools, search_type, select_type)

In [None]:
def compute_kpi_month_by_month(kpi_computation, logs: pd.DataFrame, months_number=None):
    """Evaluate a KPI separately on each requested month of ``logs``.

    Args:
        kpi_computation: Callable taking the deduplicated rows of one month and
            returning a ``(denominator, numerator, ratio)`` tuple.
        logs: DataFrame with at least the ``idVisit``, ``url``, ``type`` and
            ``month`` columns.
        months_number: Month numbers to evaluate; defaults to ``[4, 5, 6, 7]``.

    Returns:
        A DataFrame indexed by month number with the columns ``date``,
        ``denominator``, ``numerator`` and ``ratio``. ``date`` is the first day
        of the month; the year 2022 is hard-coded in the label.
    """
    # None instead of a list literal avoids sharing a mutable default between calls.
    months = [4, 5, 6, 7] if months_number is None else list(months_number)

    # Deduplication does not depend on the month being processed: do it once.
    logs_without_duplicates = logs.drop_duplicates(subset=['idVisit', 'url', 'type', 'month'])

    list_kpi = []
    for month in months:
        logs_without_duplicates_month = logs_without_duplicates[logs_without_duplicates['month'] == month]
        denominator, numerator, ratio = kpi_computation(logs_without_duplicates_month)
        list_kpi.append([f"2022-{month}-01", denominator, numerator, ratio])

    return pd.DataFrame(columns=['date', 'denominator', 'numerator', 'ratio'], data=list_kpi, index=months)

In [None]:
# KPI 4.1 per month: cc_select_p1 over cc_search on the selected tools,
# using the default months of compute_kpi_month_by_month (4 to 7).
kpi_tools_cc_search = compute_kpi_month_by_month(
    get_ratio_of_successful_search_for_cc_in_tools, logs_tools_with_cc_search
)
kpi_tools_cc_search

In [None]:
# KPI 4.2 per month: enterprise_select over enterprise_search on the selected tools,
# using the default months of compute_kpi_month_by_month (4 to 7).
kpi_tools_enteprise_search = compute_kpi_month_by_month(
    get_ratio_of_successful_search_for_enterprise_in_tools, logs_tools_with_cc_search
)
kpi_tools_enteprise_search

### Présentation Graphique du résultat

In [None]:
# Line chart of the monthly CC-search ratio: one tick per month labelled with
# the abbreviated month name, y axis fixed to the 0-100 % range.
fig = (
    px.line(kpi_tools_cc_search, x='date', y='ratio', markers=True)
    .update_xaxes(dtick="M1", tickformat="%b")
    .update_yaxes(range=[0, 100])
)
fig.show()

In [None]:
# Line chart of the monthly enterprise-search ratio: one tick per month labelled
# with the abbreviated month name, y axis fixed to the 0-100 % range.
fig = (
    px.line(kpi_tools_enteprise_search, x='date', y='ratio', markers=True)
    .update_xaxes(dtick="M1", tickformat="%b")
    .update_yaxes(range=[0, 100])
)
fig.show()

### 4.3 Analyse sur le taux de recherches fructueuses d'entreprise dans les outils parmi l'ensemble des recherches

In [None]:
import itertools

In [None]:
# Inspect the enterprise search/select events of one specific visit
# (idVisit 17977979). The timestamp is part of the deduplication key here, so
# repeated events at different times are all kept.
logs_tools_with_cc_search[
    ((logs_tools_with_cc_search['type']=='enterprise_search') |
    (logs_tools_with_cc_search['type']=='enterprise_select')) &
    (logs_tools_with_cc_search['idVisit']==17977979)
].drop_duplicates(
    subset=['idVisit', 'url', 'type', 'month', 'datetime']
).copy()

In [None]:
def count_non_adjacent_occurences_of_enterprise_search(types):
    """Count the runs of consecutive 'enterprise_search' values in ``types``.

    Adjacent repetitions are collapsed into a single occurrence, so
    ``[search, search, select, search]`` counts as two searches.
    """
    return sum(
        1
        for event_type, _run in itertools.groupby(types)
        if event_type == 'enterprise_search'
    )

def count_non_adjacent_occurences_of_enterprise_select(types):
    """Count the runs of consecutive 'enterprise_select' values in ``types``.

    Adjacent repetitions are collapsed into a single occurrence, so
    ``[select, select, search, select]`` counts as two selections.
    """
    return sum(
        1
        for event_type, _run in itertools.groupby(types)
        if event_type == 'enterprise_select'
    )

In [None]:
def get_ratio_of_searches_with_sucessful_result_over_all_cc_searches(logs: pd.DataFrame, months_number=None):
    """Compute, per month, enterprise selections over enterprise searches, counting every distinct search.

    Despite its name, this function works on the 'enterprise_search' and
    'enterprise_select' event types. For each (url, idVisit) pair the events
    are ordered by ``datetime`` and runs of identical consecutive types are
    collapsed, so a burst of searches counts once while a search made after a
    selection counts again.

    Args:
        logs: DataFrame with at least the ``idVisit``, ``url``, ``type``,
            ``month`` and ``datetime`` columns.
        months_number: Month numbers to evaluate; defaults to ``[4, 5, 6, 7]``.

    Returns:
        A DataFrame indexed by month number with the columns ``date``,
        ``denominator`` (search runs), ``numerator`` (select runs) and
        ``ratio`` (percentage rounded to two decimals, NaN when the month has
        no search). The year 2022 is hard-coded in the ``date`` label.
    """
    # None instead of a list literal avoids sharing a mutable default between calls.
    months = [4, 5, 6, 7] if months_number is None else list(months_number)

    # Column names produced by flattening the ('type', <function name>) columns below.
    search_column = 'type_count_non_adjacent_occurences_of_enterprise_search'
    select_column = 'type_count_non_adjacent_occurences_of_enterprise_select'

    # Filtering, deduplication and ordering do not depend on the month: do them once.
    enterprise_events = (
        logs[logs['type'].isin(['enterprise_search', 'enterprise_select'])]
        .drop_duplicates(subset=['idVisit', 'url', 'type', 'month', 'datetime'])
        .sort_values(by=['idVisit', 'datetime'])
    )

    list_kpi = []
    for month in months:
        events_month = enterprise_events[enterprise_events['month'] == month]

        if events_month.empty:
            # Nothing to aggregate for this month: report zero counts and an undefined ratio.
            list_kpi.append([f"2022-{month}-01", 0, 0, float('nan')])
            continue

        # The run counters rely on the chronological order set above, which groupby keeps within each group.
        runs_per_visit = events_month\
            .groupby(by=['url', 'idVisit'])\
            .agg({
                'type': [
                    count_non_adjacent_occurences_of_enterprise_search,
                    count_non_adjacent_occurences_of_enterprise_select
                ]
            }).reset_index()

        # Flatten the two-level column index into plain names.
        runs_per_visit.columns = ['_'.join(col) for col in runs_per_visit.columns.values]

        denominator = runs_per_visit[search_column].sum()
        numerator = runs_per_visit[select_column].sum()
        # Without any search the ratio is undefined rather than infinite.
        ratio = round(numerator / denominator * 100, 2) if denominator else float('nan')

        list_kpi.append([f"2022-{month}-01", denominator, numerator, ratio])

    return pd.DataFrame(columns=['date', 'denominator', 'numerator', 'ratio'], data=list_kpi, index=months)

In [None]:
# KPI 4.3 per month, counting every distinct enterprise search of a visit.
# NOTE(review): this reassigns `kpi_tools_enteprise_search`, replacing the 4.2
# result computed earlier — re-running the 4.2 plot cell afterwards would show
# the 4.3 figures.
kpi_tools_enteprise_search = get_ratio_of_searches_with_sucessful_result_over_all_cc_searches(
    logs_tools_with_cc_search
)
kpi_tools_enteprise_search

# KPI 5 : Taux de recherche fructueuse de convention collective sur les contribs

**Formules** : 
Pour les contrib = nb de sessions ayant au moins un cc_select / nb de sessions ayant au moins un cc_search

### Préparation des données

In [None]:
# Keep only the events whose URL starts with the contribution prefix;
# rows with a missing URL are excluded (na=False).
logs_contrib = logs[logs['url'].str.match('^https://code.travail.gouv.fr/contribution/', na=False)].copy()

In [None]:
# Keep a single row per (visit, URL, event type, month).
logs_contrib_without_duplicates = logs_contrib.drop_duplicates(subset=['idVisit', 'url', 'type', 'month']).copy()
logs_contrib_without_duplicates

In [None]:
# Restrict the deduplicated contribution events to June (month == 6).
logs_contrib_without_duplicates_june = logs_contrib_without_duplicates[logs_contrib_without_duplicates['month']==6]

In [None]:
# Number of deduplicated June rows for each event type on contribution pages.
logs_contrib_type_value_counts = logs_contrib_without_duplicates_june.type.value_counts()
logs_contrib_type_value_counts

### Première étape : calcul du dénominateur, c'est-à-dire le nombre de cc_search

In [None]:
# Denominator: number of deduplicated 'cc_search' rows in June.
# Raises KeyError if no such row exists.
nb_total_cc_search_on_contrib = logs_contrib_type_value_counts['cc_search']
nb_total_cc_search_on_contrib

### Deuxième étape

In [None]:
# Numerator: number of deduplicated 'cc_select' rows in June.
# Raises KeyError if no such row exists.
nb_total_cc_select_on_contrib = logs_contrib_type_value_counts['cc_select']
nb_total_cc_select_on_contrib

### Présentation des résultats

In [None]:
# Selections over searches on contribution pages, as a percentage rounded to two decimals.
print(f"{round( nb_total_cc_select_on_contrib / nb_total_cc_search_on_contrib * 100 , 2)} %")

### Synthèse

In [None]:
def get_ratio_of_successful_search_for_cc_in_contribution(logs_contrib):
    """Return (searches, selections, ratio) for 'cc_search' vs 'cc_select' rows of ``logs_contrib``."""
    search_type, select_type = 'cc_search', 'cc_select'
    return get_ratio_of_a_given_select_type_over_a_given_search_type_in_logs(logs_contrib, search_type, select_type)

In [None]:
# KPI 5 per month: cc_select over cc_search on contribution pages,
# using the default months of compute_kpi_month_by_month (4 to 7).
kpi_contrib_cc_search = compute_kpi_month_by_month(
    get_ratio_of_successful_search_for_cc_in_contribution, logs_contrib
)
kpi_contrib_cc_search

### Présentation Graphique du résultat

In [None]:
# Line chart of the monthly CC-search ratio on contribution pages: one tick per
# month labelled with the abbreviated month name, y axis fixed to the 0-100 % range.
fig = (
    px.line(kpi_contrib_cc_search, x='date', y='ratio', markers=True)
    .update_xaxes(dtick="M1", tickformat="%b")
    .update_yaxes(range=[0, 100])
)
fig.show()