In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

Read data

In [None]:
# Identifier columns are read as strings: codes such as '03' would otherwise
# be parsed as integers and lose their leading zero.
cpc_com_dtypes = {'code_com': str, 'code_dep': str}
xwalk_dtypes = {'code_com': str, 'code_epci': str, 'code_dep': str}

df_cpc_com = pd.read_csv('../midsave/cpc_com.csv', dtype=cpc_com_dtypes)
df_xwalk = pd.read_csv('../midsave/xwalk.csv', dtype=xwalk_dtypes)

Results from the Google Trends queries

In [None]:
# The query results file is semicolon-separated; region codes are kept as strings.
df_gtrends = pd.read_csv(
    '../midsave/intial_anaysis_gtrends.csv',
    sep=';',
    dtype={'code_reg': str},
)

Aggregate CPC to department level

In [None]:
# Attach a department code to every commune row, then total the count columns
# (and population) per department.
com_to_dep = df_xwalk[['code_com', 'code_dep']].drop_duplicates()
cpc_with_dep = pd.merge(df_cpc_com, com_to_dep, how='left', on='code_com')
dep_totals = cpc_with_dep.groupby('code_dep')[['YouTube', 'Web_Adult', 'cpc', 'pop']].sum()
cp_dep = dep_totals.reset_index()

In [None]:
# Preview the first rows of the department-level aggregate.
cp_dep.head()

In [None]:
# List the distinct department codes present after aggregation.
cp_dep['code_dep'].unique()

Compute estimates per 1,000 inhabitants

In [None]:
# Convert each department total into a rate per 1,000 of 'pop'.
# Maps the new rate column to the count column it is derived from.
per_1000_columns = {
    'yt_per_1000': 'YouTube',
    'wa_per_1000': 'Web_Adult',
    'cpc_per_1000': 'cpc',
}
for rate_col, count_col in per_1000_columns.items():
    cp_dep[rate_col] = cp_dep[count_col] * 1000 / cp_dep['pop']

Add region names

In [None]:
# Lookup table with one row per distinct department code. The 'geoName'
# column (region name) starts out empty and is filled in afterwards.
name_dep = (
    df_xwalk[['code_dep']]
    .drop_duplicates()
    .sort_values(by=['code_dep'])
    # Reset the index *after* sorting so it runs 0..n-1 in sorted order.
    .reset_index(drop=True)
    # Create 'geoName' with object dtype: a bare `np.nan` column is float64,
    # and writing region-name strings into a float column is an
    # incompatible-dtype assignment that recent pandas versions warn about.
    .assign(geoName=np.nan)
    .astype({'geoName': object})
)

In [None]:
# Region name -> department codes belonging to it. The region labels must match
# the 'geoName' values of the Google Trends table, because the two tables are
# later merged on that column.
# NOTE(review): no Corsican department code is mapped here, so any such
# department keeps a missing 'geoName' — confirm this is intended.
REGION_DEPARTMENTS = {
    'Aquitaine': ['24', '33', '40', '47', '64'],
    'Auvergne': ['03', '15', '43', '63'],
    'Brittany': ['22', '29', '35', '56'],
    'Burgundy': ['21', '58', '71', '89'],
    'Centre-Val de Loire': ['18', '28', '36', '37', '41', '45'],
    'Champagne-Ardenne': ['08', '10', '51', '52'],
    'Franche-Comté': ['25', '39', '70', '90'],
    # '48' (Lozère) was absent from the original hand-written assignments, which
    # left that department without a region name; it belongs to this region.
    'Languedoc-Roussillon': ['11', '30', '34', '48', '66'],
    'Limousin': ['19', '23', '87'],
    'Lorraine': ['54', '55', '57', '88'],
    'Lower Normandy': ['14', '50', '61'],
    'Midi-Pyrénées': ['09', '12', '31', '32', '46', '65', '81', '82'],
    'Nord-Pas-de-Calais': ['59', '62'],
    'Pays de la Loire': ['44', '49', '53', '72', '85'],
    'Picardy': ['02', '60', '80'],
    'Poitou-Charentes': ['16', '17', '79', '86'],
    "Provence-Alpes-Côte d'Azur": ['04', '05', '06', '13', '83', '84'],
    'Rhone-Alpes': ['01', '07', '26', '38', '42', '69', '73', '74'],
    'Upper Normandy': ['27', '76'],
    'Île-de-France': ['75', '77', '78', '91', '92', '93', '94', '95'],
    'Alsace': ['67', '68'],
}

# Invert to department code -> region name for a single vectorized lookup.
dep_to_region = {
    dep: region
    for region, deps in REGION_DEPARTMENTS.items()
    for dep in deps
}
# Guard against a department accidentally being listed under two regions.
assert len(dep_to_region) == sum(len(deps) for deps in REGION_DEPARTMENTS.values()), \
    'a department code is listed under more than one region'

# Departments without an entry in the mapping get a missing value (NaN).
name_dep['geoName'] = name_dep['code_dep'].map(dep_to_region)

In [None]:
# Attach the region name to every department row, then drop the raw
# population column.
cp_dep = pd.merge(cp_dep, name_dep, how='left', on='code_dep')
cp_dep = cp_dep.drop(columns=['pop'])

In [None]:
# Columns removed from the Google Trends table before the analysis.
gtrends_dropped_columns = [
    'code_reg', 'phtc', 'psthc',
    'porno jeune ado_y', 'video porno ado_y',
    'sexe mineur', 'sex mineur',
]
# Strip the '_x' suffix from the retained duplicate columns
# (presumably left over from an earlier merge — confirm).
gtrends_renamed_columns = {
    'video porno ado_x': 'video porno ado',
    'porno jeune ado_x': 'porno jeune ado',
}

df_gtrends = df_gtrends.drop(columns=gtrends_dropped_columns)
df_gtrends = df_gtrends.rename(columns=gtrends_renamed_columns)

In [None]:
# Check the (rows, columns) dimensions after removing columns.
df_gtrends.shape

Delete all-zero columns

In [None]:
# Keep only the columns that contain at least one value different from 0.
has_nonzero_value = df_gtrends.ne(0).any(axis=0)
df_gtrends = df_gtrends.loc[:, has_nonzero_value]

Get principal components

In [None]:
# Number of principal components to retain; also used by later cells to
# label the components (PC1..PCn).
n_components = 3
pca = PCA(n_components = n_components)

In [None]:
# Standardize every search-term column (everything except the region label)
# before fitting the PCA.
gtrends_features = df_gtrends.drop(columns=['geoName'])
scaler = StandardScaler()
X = scaler.fit_transform(gtrends_features)
pca.fit(X)

Get some insights about a good choice of the number of components

In [None]:
# Share of variance explained by each retained component, and its running total.
# NOTE(review): `pca` was created with n_components components, so this curve
# only covers those components, not every possible one.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

component_counts = range(1, len(cumulative_explained_variance) + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance vs. Number of Components')
ax.grid(True)
plt.show()

Create a visualization of the composition of the components

In [None]:
# Rows of `loadings` are the principal components; columns are the input features.
loadings = pca.components_

feature_names = df_gtrends.drop(columns=['geoName']).columns
component_names = [f'PC{i}' for i in range(1, n_components + 1)]

fig, ax = plt.subplots(figsize=(8, 2))
sns.heatmap(
    loadings,
    annot=False,
    cmap='coolwarm',
    xticklabels=feature_names,
    yticklabels=component_names,
    ax=ax,
)
fig.savefig('../viz/pca.png', dpi=300, bbox_inches="tight")
plt.show()



Extract principal components as features

In [None]:
# Project the standardized data onto the components of the PCA that was already
# fitted on X. `transform` is used instead of `fit_transform` so the model is
# not re-fitted a second time on the same data.
principalComponents = pca.transform(X)

# One column per component (PC1..PCn). The frame reuses df_gtrends' index so
# that a column-wise concat with df_gtrends lines the scores up with their
# source rows, whatever that index looks like.
pca_df = pd.DataFrame(
    data=principalComponents,
    columns=[f'PC{i}' for i in range(1, n_components + 1)],
    index=df_gtrends.index,
)

In [None]:
# Append the principal-component scores as extra columns (aligned on the index).
df_gtrends = pd.concat(
    objs=[df_gtrends, pca_df],
    axis='columns',
)

Create linear indices

In [None]:
# Search-term columns entering each linear index. Each index is the sum of its
# terms divided by (number of terms * 100).
linear_index_terms = {
    'lin_1': ['pedoporno', 'porno mineur', 'porno enfant'],
    'lin_2': ['boylove', 'porno jeune ado', 'video porno ado',
              'ado porno', 'porno jeune fille', 'omegle'],
    'lin_3': ['pedoporno', 'porno mineur', 'porno enfant', 'hurtcore',
              'boylove', 'porno jeune ado', 'video porno ado',
              'ado porno', 'porno jeune fille', 'omegle', 'pthc'],
}

for index_name, terms in linear_index_terms.items():
    # Add the columns one by one, left to right.
    term_total = df_gtrends[terms[0]]
    for term in terms[1:]:
        term_total = term_total + df_gtrends[term]
    df_gtrends[index_name] = term_total / (len(terms) * 100)

In [None]:
# Inspect the column set now that the PC scores and linear indices were added.
df_gtrends.columns

In [None]:
# Attach the region-level Google Trends features to each department through
# the shared region name.
cp_dep = pd.merge(cp_dep, df_gtrends, how='left', on='geoName')

In [None]:
# Preview the merged department-level table.
cp_dep.head()

In [None]:
# Summary statistics of the merged table.
cp_dep.describe()

Save and get correlations

In [None]:
# Persist the merged department-level table for later steps.
# No `index=` argument is passed, so pandas' default applies and the row index
# is written as an extra first column.
cp_dep.to_csv('../midsave/gtrends_dep.csv')