# Analyse des clics sur les articles liés

**Définition** : On veut récupérer, pour chaque page, le nombre de clics sur les articles liés et le nombre de visites de la page, afin de calculer un taux de clics (clics / visites).


## On commence par le chargement des données

In [None]:
import pandas as pd
import time
import json
import math
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from src.elasticsearch_connector import ElasticsearchConnector
#from typing import List, Dict


# Let notebook outputs show up to 100 columns and 100 rows before pandas
# truncates the display.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

## Si on veut lire les données en requêtant Elasticsearch

In [None]:
# Connector used to run the Elasticsearch queries of this notebook.
# NOTE(review): `env='monolog'` presumably selects the cluster/environment that
# stores the application logs — confirm in src.elasticsearch_connector.
es_connector = ElasticsearchConnector(env='monolog')

# Elasticsearch query body selecting the "select_related" log entries (clicks
# on related articles) whose `logfile` value lies in [2023-01-01, 2023-06-18):
# `gte` is inclusive, `lt` is exclusive.
# NOTE: an alternative range filter on `lastActionDateTime`
# (2022-04-01 00:00:00 to 2022-05-01 00:00:00) was previously kept here as
# commented-out code; it is not part of the query.
_CLICK_TYPE_FILTER = {"term": {"type": "select_related"}}
_CLICK_PERIOD_FILTER = {
    "range": {
        "logfile": {"gte": "2023-01-01", "lt": "2023-06-18"},
    },
}

# Both filters must match (`bool` / `must`).
QUERY = {
    "query": {
        "bool": {
            "must": [_CLICK_TYPE_FILTER, _CLICK_PERIOD_FILTER],
        },
    },
}

In [None]:
# Run the "select_related" query through the connector.
# NOTE(review): "logs-new" is presumably the Elasticsearch index name, and the
# result is presumably a pandas DataFrame (it is used with `groupby` below) —
# confirm against ElasticsearchConnector.execute_query.
logs_selected_related = es_connector.execute_query(QUERY, "logs-new")

# On va récupérer le nombre de clics par URL

### Première étape

In [None]:
# Group the click logs by page URL, keeping only the `idVisit` column.
clickByUrl = logs_selected_related.groupby(by=['url'])[['idVisit']]

# One row per URL; `idVisit` now holds the number of non-null `idVisit`
# values for that URL, i.e. the number of click log entries.
clickByUrlCount = clickByUrl.count()

In [None]:
# Display only: URLs ranked by click count, highest first. The sorted frame
# is not stored; `clickByUrlCount` keeps its original order.
clickByUrlCount.sort_values(by='idVisit', ascending=False)

# On va récupérer le nombre de visites sur la même période

In [None]:
# Elasticsearch query body selecting the "visit_content" log entries (page
# visits) whose `logfile` value lies in [2023-01-01, 2023-06-18): `gte` is
# inclusive, `lt` is exclusive.
# NOTE: an alternative range filter on `lastActionDateTime`
# (2022-04-01 00:00:00 to 2022-05-01 00:00:00) was previously kept here as
# commented-out code; it is not part of the query.
_VISIT_TYPE_FILTER = {"term": {"type": "visit_content"}}
_VISIT_PERIOD_FILTER = {
    "range": {
        "logfile": {"gte": "2023-01-01", "lt": "2023-06-18"},
    },
}

# Both filters must match (`bool` / `must`).
QUERY_VISITS = {
    "query": {
        "bool": {
            "must": [_VISIT_TYPE_FILTER, _VISIT_PERIOD_FILTER],
        },
    },
}

In [None]:
# Run the "visit_content" query through the connector.
# NOTE(review): "logs-new" is presumably the Elasticsearch index name, and the
# result is presumably a pandas DataFrame (it is used with `groupby` below) —
# confirm against ElasticsearchConnector.execute_query.
logs_visits = es_connector.execute_query(QUERY_VISITS, "logs-new")

In [None]:
# Group the visit logs by page URL, keeping only the `idVisit` column.
visitByUrl = logs_visits.groupby(by=['url'])[['idVisit']]

In [None]:
# One row per URL; `idVisit` holds the number of non-null `idVisit` values,
# i.e. the number of visit log entries for that URL.
visitByUrlCount = visitByUrl.count()

# Display only: URLs ranked by visit count, highest first (not stored).
visitByUrlCount.sort_values(by='idVisit', ascending=False)

# Fusion des deux tableaux pour avoir le nombre de clics et de visites

In [None]:
# Outer join of click counts and visit counts on `url`, so URLs present on
# only one side are kept (with NaN on the other side). Both inputs have an
# `idVisit` column, which pandas disambiguates with its default suffixes:
# `idVisit_x` = clicks (left), `idVisit_y` = visits (right).
clicAndVisitByUrl = pd.merge(
    clickByUrlCount,
    visitByUrlCount,
    on='url',
    how='outer',
)

In [None]:
# Display only: show the merged table with readable column names. `rename`
# returns a new frame, so `clicAndVisitByUrl` itself keeps `idVisit_x` /
# `idVisit_y`, which the ratio computation still reads.
clicAndVisitByUrl.rename(
    {"idVisit_x": "Clicks", "idVisit_y": "Visites"},
    axis="columns",
)

In [None]:
# Work on a copy so the merged table is left untouched.
dataWithPercent = clicAndVisitByUrl.copy()

# Click ratio in percent: clicks (`idVisit_x`) * 100 / visits (`idVisit_y`).
dataWithPercent["Percent"] = (
    dataWithPercent["idVisit_x"] * 100 / dataWithPercent["idVisit_y"]
)

# Drop every row containing a NaN, i.e. URLs that lack either a click count
# or a visit count after the outer merge.
dataWithPercent = dataWithPercent.dropna()

In [None]:
# Display only: URLs ranked by click ratio, highest first (not stored).
dataWithPercent.sort_values(by='Percent', ascending=False)

In [None]:
# Export the ranking (highest click ratio first) to `out.csv` in the current
# working directory, keeping the index as the first CSV column.
(
    dataWithPercent
    .sort_values(by='Percent', ascending=False)
    .to_csv('out.csv', index=True)
)