In [None]:
import pandas as pd
import time
import json
import math
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
from typing import List, Dict


# Raise both pandas display limits (columns and rows) to 100.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
from query_elastic_search import load_file_if_exists_or_execute_query

In [None]:
# CSV file name handed to the loader and to `to_csv` in the next cells.
FILE_NAME = "log_urls_convention_collective_from_2021_01_to_2022_03.csv"

In [None]:
# Elasticsearch query body: both clauses must match (bool/must).
# 1) the URL starts with the convention-collective tool prefix
_cc_tool_url_clause = {
    "prefix": {
        "url": "https://code.travail.gouv.fr/outils/convention-collective",
    },
}
# 2) lastActionDateTime falls in [2021-01-01, 2022-04-01)
_date_window_clause = {
    "range": {
        "lastActionDateTime": {
            "gte": "2021-01-01 00:00:00",
            "lt": "2022-04-01 00:00:00",
        },
    },
}
QUERY = {
    "query": {
        "bool": {
            "must": [_cc_tool_url_clause, _date_window_clause],
        },
    },
}

In [None]:
# Load the convention-collective logs into `df`.
# NOTE(review): judging by the helper's name, it reads FILE_NAME when that file already exists
# and runs QUERY otherwise — confirm in query_elastic_search.
df = load_file_if_exists_or_execute_query(FILE_NAME, QUERY)

In [None]:
# Write the loaded logs to FILE_NAME (the DataFrame index is written as the first CSV column).
# NOTE(review): if the loader above already read this same file, this rewrites it on every run — confirm intent.
df.to_csv(FILE_NAME)

In [None]:
# Work on a copy so that columns added to `logs_cc` later do not modify `df`.
logs_cc = df.copy()

In [None]:
# (number of rows, number of columns) of the loaded logs.
logs_cc.shape

In [None]:
# Preview the first rows of the loaded logs.
logs_cc.head()

import os
from getpass import getpass

from elasticsearch import Elasticsearch

# Connection settings are read from the environment (ES_HOST, ES_USER, ES_PASSWORD), with an
# interactive prompt as fallback, so that no secret is stored in the notebook.
# NOTE(review): the user name and password that used to be hard-coded on this line remain in
# the file history and should be rotated.
ES_HOST = os.environ.get(
    "ES_HOST",
    "https://804b8d99b80941e2a99f551cd481fb5b.westeurope.azure.elastic-cloud.com:9243",
)
ES_USER = os.environ.get("ES_USER") or input("Elasticsearch user: ")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")
es = Elasticsearch([ES_HOST], http_auth=(ES_USER, ES_PASSWORD), timeout=36600)

# Quick connectivity check.
es.ping()

# Cache file for the "logs-new" export.
# Note: this rebinds FILE_NAME, which an earlier cell uses for the convention-collective CSV.
FILE_NAME = 'logs_new_from_2021_01_to_2022_04.csv'

def _open_scroll():
    """Run the initial scrolled search on the "logs-new" index and return the raw response.

    The response holds the scroll id, the total hit count and the FIRST page of hits.
    """
    search_body = {
        # Ask for the exact total; Elasticsearch 7+ otherwise caps the reported total at 10,000.
        "track_total_hits": True,
        "query": {
            "range": {
                "lastActionDateTime": {
                    "gte": "2021-04-01 00:00:00",
                    "lt": "2022-01-01 00:00:00",
                }
            }
        },
    }
    return es.search(
        index="logs-new",
        body=search_body,
        scroll='100m',  # keep-alive of the scroll context
        size=10000,
    )


def init_query():
    """Open a scroll and return ``(scroll_id, num_hits)``.

    Kept for backward compatibility. The first page of hits contained in the search response
    is NOT returned, so callers that need every document should use ``_iter_hit_pages``.
    """
    resp = _open_scroll()
    return resp["_scroll_id"], resp["hits"]["total"]["value"]


def scroll_query(scroll_id):
    """Fetch the next page of the scroll identified by ``scroll_id`` and return its hits."""
    resp = es.scroll(
        scroll_id=scroll_id,
        scroll="10m",
    )
    return resp["hits"]["hits"]


def _iter_hit_pages(first_response):
    """Yield every page of hits of a scroll, starting with the page in ``first_response``.

    Stops when Elasticsearch returns an empty page. Each request uses the scroll id from the
    latest response, as the scroll API requires.
    """
    scroll_id = first_response["_scroll_id"]
    page = first_response["hits"]["hits"]
    while page:
        yield page
        resp = es.scroll(scroll_id=scroll_id, scroll="10m")
        scroll_id = resp["_scroll_id"]
        page = resp["hits"]["hits"]


# NOTE(review): FILE_NAME mentions 2021_01 -> 2022_04 whereas the range queried above is
# 2021-04-01 -> 2022-01-01; confirm which one is intended.
try:
    logs_new = pd.read_csv(FILE_NAME)
except FileNotFoundError:
    # Only a missing cache file triggers the download; any other error (corrupt CSV,
    # interruption, ...) is surfaced instead of silently re-running the export.
    first_response = _open_scroll()
    data = []
    with tqdm(total=first_response["hits"]["total"]["value"]) as pbar:
        for page in _iter_hit_pages(first_response):
            data.extend(page)
            # TODO: find a way to save the pages incrementally
            pbar.update(len(page))

    logs_new = pd.DataFrame([d['_source'] for d in data])
    logs_new.to_csv(FILE_NAME)

In [None]:
# Column dtypes, non-null counts and memory usage of the logs.
logs_cc.info()

In [None]:
# Number of log lines per event type.
logs_cc['type'].value_counts()

In [None]:
# Calendar day of each log line (python `date` objects), derived from the timestamp column.
last_action_timestamps = pd.to_datetime(logs_cc['lastActionDateTime'])
logs_cc['lastActionDate'] = last_action_timestamps.dt.date

# Rows of the "logs-new" export whose URL belongs to the convention-collective tool;
# missing URLs count as non-matching.
cc_tool_url_prefix = 'https://code.travail.gouv.fr/outils/convention-collective'
is_cc_tool_url = logs_new['url'].str.startswith(cc_tool_url_prefix, na=False)
cc_logs_new = logs_new[is_cc_tool_url].copy()
cc_logs_new

## Séparation du jeu de données des logs : avant vs après le 30.06.2021

In [None]:
# Pivot day separating the two periods. Both comparisons are strict, so rows dated exactly
# 2021-06-30 end up in neither subset — NOTE(review): confirm that excluding that day is intended.
date_2021_06_30 = pd.to_datetime("2021-06-30").date()
is_before_pivot = logs_cc['lastActionDate'] < date_2021_06_30
is_after_pivot = logs_cc['lastActionDate'] > date_2021_06_30
logs_cc_before_30_06_2021 = logs_cc[is_before_pivot]
logs_cc_after_30_06_2021 = logs_cc[is_after_pivot]

In [None]:
# Sanity check: latest timestamp kept in the "before" subset.
logs_cc_before_30_06_2021['lastActionDateTime'].max()

In [None]:
# Sanity check: earliest timestamp kept in the "after" subset.
logs_cc_after_30_06_2021['lastActionDateTime'].min()

## Taux de complétion de l'outil convention collective sur l'ancien parcours (avant 30.06.2021)

In [None]:
# Column labels for a monthly summary table.
# NOTE(review): `column` is not referenced anywhere else in the visible notebook.
column = [
    'Mois',
    'cc_select/cc_search',
    'cc_select/nb_visiteurs',
    'nb visiteurs',
    'nb cc_search',
    'nb cc_select',
]

In [None]:
# Number of log lines per event type in the "before" subset.
logs_cc_before_30_06_2021['type'].value_counts()

In [None]:
def _as_list(values):
    """Collect the values of one group into a plain Python list."""
    return list(values)


# One row per visit (idVisit): the first `lastActionDate` of the group, plus the visit's
# event types, time-spent values and URLs gathered as lists.
visits_before = logs_cc_before_30_06_2021.groupby(by=['idVisit'])
cc_logs_by_id_visit = visits_before.agg({
    'lastActionDate': 'first',
    'type': _as_list,
    'timeSpent': _as_list,
    'url': _as_list,
}).reset_index()

# Month and year of the visit, used as grouping keys for the monthly statistics.
visit_days = pd.DatetimeIndex(cc_logs_by_id_visit['lastActionDate'])
cc_logs_by_id_visit['month'] = visit_days.month
cc_logs_by_id_visit['year'] = visit_days.year

cc_logs_by_id_visit

In [None]:
# Per-visit boolean flags: every row is a visit (`has_visit`), and the other two columns tell
# whether the visit's event list contains a 'cc_search' / a 'cc_select' event.
cc_logs_by_id_visit = cc_logs_by_id_visit.assign(
    has_visit=True,
    has_user_done_a_cc_search=lambda visits: visits['type'].apply(
        lambda types: 'cc_search' in types
    ),
    has_user_done_a_cc_select=lambda visits: visits['type'].apply(
        lambda types: 'cc_select' in types
    ),
)

In [None]:
# Monthly counts: visits, visits with a search, visits with a selection.
# The aggregation is given as the string 'sum': passing the builtin `sum` to groupby.agg is
# deprecated since pandas 2.1 (FutureWarning) and its meaning will change in a future version.
nb_of_visits_with_action = cc_logs_by_id_visit.groupby(by=['year', 'month']).agg({
    'has_visit': 'sum',
    'has_user_done_a_cc_search': 'sum',
    'has_user_done_a_cc_select': 'sum',
})

# Share of searching visits that ended with a selection, formatted as a percentage string.
nb_of_visits_with_action['cc_select/cc_search'] = nb_of_visits_with_action.apply(
    lambda x: f"{round(x['has_user_done_a_cc_select']/x['has_user_done_a_cc_search']*100, 2)}%", axis=1
)

# Share of all visits that ended with a selection, formatted as a percentage string.
nb_of_visits_with_action['cc_select/nb_visites'] = nb_of_visits_with_action.apply(
    lambda x: f"{round(x['has_user_done_a_cc_select']/x['has_visit']*100, 2)}%", axis=1
)

nb_of_visits_with_action

# Overall completion rate of the old flow: visits with a selection / visits with a search.
count_number_of_cc_search = cc_logs_by_id_visit['has_user_done_a_cc_search'].sum()
count_number_of_cc_select = cc_logs_by_id_visit['has_user_done_a_cc_select'].sum()

# Scale to a percentage BEFORE rounding: rounding the ratio first and multiplying by 100
# afterwards re-introduces binary floating-point noise in the printed value
# (for instance 0.57 * 100 evaluates to 56.99999999999999).
taux_de_completion_cc = round(count_number_of_cc_select / count_number_of_cc_search * 100, 2)
print(f'Le taux de complétion de l\'ancien outil Convention Collective est de {taux_de_completion_cc}%')

## Taux de complétion de l'outil convention collective sur le nouveau parcours (après 30.06.2021)

In [None]:
def _group_values_as_list(values):
    """Collect the values of one group into a plain Python list."""
    return list(values)


# One row per visit (idVisit) of the "after" period: the first `lastActionDate` of the group,
# plus the visit's event types, time-spent values and URLs gathered as lists.
visits_after = logs_cc_after_30_06_2021.groupby(by=['idVisit'])
cc_logs_after_by_id_visit = visits_after.agg({
    'lastActionDate': 'first',
    'type': _group_values_as_list,
    'timeSpent': _group_values_as_list,
    'url': _group_values_as_list,
}).reset_index()

# Month and year of the visit, used as grouping keys for the monthly statistics.
visit_days_after = pd.DatetimeIndex(cc_logs_after_by_id_visit['lastActionDate'])
cc_logs_after_by_id_visit['month'] = visit_days_after.month
cc_logs_after_by_id_visit['year'] = visit_days_after.year

cc_logs_after_by_id_visit

In [None]:
# Event types counted as "a search" and as "a selection" for the new flow.
search_event_types = ('cc_search', 'enterprise_search')
selection_event_types = ('cc_select_p1', 'cc_select_p2')


def _has_any_event(types, wanted):
    """Return True when at least one event type of ``wanted`` occurs in ``types``."""
    return any(event_type in types for event_type in wanted)


# Per-visit boolean flags: every row is a visit, plus whether the visit contains a search
# event and whether it contains a selection event.
cc_logs_after_by_id_visit = cc_logs_after_by_id_visit.assign(has_visit=True)

cc_logs_after_by_id_visit['has_user_done_a_cc_or_enterprise_search'] = (
    cc_logs_after_by_id_visit['type'].apply(lambda types: _has_any_event(types, search_event_types))
)

cc_logs_after_by_id_visit['has_user_done_a_cc_select_px'] = (
    cc_logs_after_by_id_visit['type'].apply(lambda types: _has_any_event(types, selection_event_types))
)

In [None]:
def count_times_words_are_present_at_least_one_time_in_list_of_words(
    list_of_list_of_words: List[List[str]], words: List[str]) -> int:
    """Count the inner lists that contain at least one element of ``words``.

    Each inner list counts at most once, however many of its elements match.
    """
    matching_lists = 0
    for candidate_words in list_of_list_of_words:
        for candidate in candidate_words:
            if candidate in words:
                matching_lists += 1
                break  # one match is enough for this inner list
    return matching_lists

In [None]:
# Monthly counts for the new flow: distinct visits, visits with a search, visits with a selection.
# The aggregation is given as the string 'sum': passing the builtin `sum` to groupby.agg is
# deprecated since pandas 2.1 (FutureWarning) and its meaning will change in a future version.
nb_of_visits_after_with_action = cc_logs_after_by_id_visit.groupby(by=['year', 'month']).agg({
    'idVisit': 'nunique',
    'has_user_done_a_cc_or_enterprise_search': 'sum',
    'has_user_done_a_cc_select_px': 'sum',
}).rename(columns={
    'idVisit': 'has_visit',
})

# Share of searching visits that ended with a selection, formatted as a percentage string.
nb_of_visits_after_with_action['cc_select_pX/(cc_search+enterprise)'] = nb_of_visits_after_with_action.apply(
    lambda x: f"{round(x['has_user_done_a_cc_select_px']/x['has_user_done_a_cc_or_enterprise_search']*100, 2)}%", axis=1
)

# Share of all visits that ended with a selection, formatted as a percentage string.
nb_of_visits_after_with_action['cc_select_pX/nb_visites'] = nb_of_visits_after_with_action.apply(
    lambda x: f"{round(x['has_user_done_a_cc_select_px']/x['has_visit']*100, 2)}%", axis=1
)

nb_of_visits_after_with_action


In [None]:
# Number of log lines per event type over the whole period.
logs_cc['type'].value_counts()

## Graphe legacy

In [None]:
def _collect_as_list(values):
    """Collect the values of one group into a plain Python list."""
    return list(values)


# One row per visit over the WHOLE period.
# Note: this rebinds `cc_logs_by_id_visit`, which previously held the "before" period only.
all_visits = logs_cc.groupby(by=['idVisit'])
cc_logs_by_id_visit = all_visits.agg({
    'lastActionDate': 'first',
    'type': _collect_as_list,
    'timeSpent': _collect_as_list,
    'url': _collect_as_list,
}).reset_index()
cc_logs_by_id_visit

In [None]:
# Keep only the visits whose event list contains a 'cc_search' event.
visit_has_cc_search = cc_logs_by_id_visit['type'].apply(lambda types: 'cc_search' in types)
cc_searches_by_id_visit = cc_logs_by_id_visit[visit_has_cc_search].copy()
cc_searches_by_id_visit


In [None]:
# NOTE(review): this definition duplicates an identical function defined earlier in the notebook.
def count_times_words_are_present_at_least_one_time_in_list_of_words(
    list_of_list_of_words: List[List[str]], words: List[str]) -> int:
    """Count the inner lists that contain at least one element of ``words``.

    Each inner list counts at most once, however many of its elements match.
    """
    matching_lists = 0
    for candidate_words in list_of_list_of_words:
        for candidate in candidate_words:
            if candidate in words:
                matching_lists += 1
                break  # one match is enough for this inner list
    return matching_lists

In [None]:
# Event types that count as a selection in the old and in the new flow.
old_flow_selection_types = ['cc_select']
new_flow_selection_types = ['cc_select_p1', 'cc_select_p2']
count_visits_with = count_times_words_are_present_at_least_one_time_in_list_of_words

# Second copy of the event lists so that the same data can be aggregated twice
# (a dict-based agg takes one function per column).
cc_searches_by_id_visit['type_cc_select_px'] = cc_searches_by_id_visit['type'].copy()

# Per day: distinct searching visits, and how many of them contain an old-flow / new-flow selection.
daily_search_visits = cc_searches_by_id_visit.groupby(by=['lastActionDate'])
cc_searches_by_date = daily_search_visits.agg({
    'idVisit': 'nunique',
    'type': lambda visit_types: count_visits_with(visit_types, old_flow_selection_types),
    'type_cc_select_px': lambda visit_types: count_visits_with(visit_types, new_flow_selection_types),
}).reset_index()
cc_searches_by_date = cc_searches_by_date.rename(columns={
    'idVisit': 'nb_de_recherche_de_cc',
    'type': 'nombre de sélection de CC (ancien process)',
    'type_cc_select_px': 'nombre de sélection de CC (nouveau process)',
})

cc_searches_by_date


In [None]:
count_visits_containing = count_times_words_are_present_at_least_one_time_in_list_of_words

# Extra aliases of the event-list column so that it can be aggregated three different ways
# (a dict-based agg takes one function per column).
for alias_column in ('type_cc_select_px', 'nb_cc_search'):
    cc_logs_by_id_visit[alias_column] = cc_logs_by_id_visit['type']

# Per day, over all visits: visits with a search, with an old-flow selection, with a new-flow selection.
daily_visits = cc_logs_by_id_visit.groupby(by=['lastActionDate'])
cc_visites_by_date = daily_visits.agg({
    'nb_cc_search': lambda visit_types: count_visits_containing(visit_types, ['cc_search']),
    'type': lambda visit_types: count_visits_containing(visit_types, ['cc_select']),
    'type_cc_select_px': lambda visit_types: count_visits_containing(visit_types, ['cc_select_p1', 'cc_select_p2']),
}).reset_index()
cc_visites_by_date = cc_visites_by_date.rename(columns={
    'nb_cc_search': 'nombre de cc_search',
    'type': 'nombre de sélection de CC (ancien process)',
    'type_cc_select_px': 'nombre de sélection de CC (nouveau process)',
})

cc_visites_by_date


In [None]:
# Daily number of selections among searching visits: one trace per flow (old, then new).
daily_selection_traces = (
    ('nombre de sélection de CC (ancien process)', 'Nb de sélection de CC (ancien)'),
    ('nombre de sélection de CC (nouveau process)', 'Nb de sélection de CC (nv)'),
)

fig = go.Figure()
for selection_column, trace_label in daily_selection_traces:
    fig.add_trace(go.Scatter(
        x=cc_searches_by_date['lastActionDate'],
        y=cc_searches_by_date[selection_column],
        mode='lines+markers',
        name=trace_label,
    ))

fig.show()

In [None]:
# Daily selection ratio (selections / searching visits): one trace per flow (old, then new).
daily_ratio_traces = (
    ('nombre de sélection de CC (ancien process)', 'Ratio de sélection de CC (ancien)'),
    ('nombre de sélection de CC (nouveau process)', 'Ratio de sélection de CC (nv)'),
)

fig = go.Figure()
for selection_column, trace_label in daily_ratio_traces:
    daily_ratio = cc_searches_by_date[selection_column] / cc_searches_by_date['nb_de_recherche_de_cc']
    fig.add_trace(go.Scatter(
        x=cc_searches_by_date['lastActionDate'],
        y=daily_ratio,
        mode='lines+markers',
        name=trace_label,
    ))

fig.show()

## Par semaine

In [None]:
def _iso_year_week(date):
    """Return the ISO year and week of ``date`` as a sortable label such as '2021-W26'."""
    iso_year, iso_week, _ = date.isocalendar()
    return f"{iso_year}-W{iso_week:02d}"


# The ISO week number alone is ambiguous here: the queried logs run from January 2021 to
# March 2022, so weeks 1-13 of 2021 and of 2022 would share the same key and be summed
# together by the weekly groupby. The label therefore carries the ISO year as well; the
# zero-padded week keeps the labels in chronological order when sorted.
cc_searches_by_date['week_number'] = cc_searches_by_date['lastActionDate'].apply(_iso_year_week)

In [None]:
# Weekly totals of the three daily counters.
weekly_counter_columns = [
    'nb_de_recherche_de_cc',
    'nombre de sélection de CC (ancien process)',
    'nombre de sélection de CC (nouveau process)',
]
weekly_groups = cc_searches_by_date.groupby(by=['week_number'])
cc_searches_by_week = weekly_groups[weekly_counter_columns].sum().reset_index()

cc_searches_by_week

In [None]:
# Weekly number of visits with a CC search.
# The trace was labelled as a selection count although it plots `nb_de_recherche_de_cc`;
# the label now names the plotted series.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=cc_searches_by_week['week_number'],
    y=cc_searches_by_week['nb_de_recherche_de_cc'],
    mode='lines+markers',
    name='Nb de recherches de CC'))
fig.show()

In [None]:
# Weekly selection ratio (selections / searching visits): one trace per flow (old, then new).
weekly_ratio_traces = (
    ('nombre de sélection de CC (ancien process)', 'Ratio de sélection de CC (ancien)'),
    ('nombre de sélection de CC (nouveau process)', 'Ratio de sélection de CC (nv)'),
)

fig = go.Figure()
for selection_column, trace_label in weekly_ratio_traces:
    weekly_ratio = cc_searches_by_week[selection_column] / cc_searches_by_week['nb_de_recherche_de_cc']
    fig.add_trace(go.Scatter(
        x=cc_searches_by_week['week_number'],
        y=weekly_ratio,
        mode='lines+markers',
        name=trace_label,
    ))

fig.show()