In [1]:
# Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


# Directories
# Stage Dir
stage_1_dir = "stage-1"

# Code Dir
code_dir = "code"

# Data Dir
data_dir = "data"

# Filenames
churn_od_txt = "Churn_OD.txt"
churn_users_pf_txt = "Churn_UsersProfile.txt"

# "freguesias-metadata.xlsx"
# Source: https://dados.gov.pt/pt/datasets/freguesias-de-portugal/
pt_parish_metadata = "freguesias-metadata.xlsx"

# "DistritosConcelhosFreguesias_CAOP2013_Populacao_Censos2011.xls"
# Source: https://www.factorvirtual.com/blog/distritos-concelhos-e-freguesias-de-portugal
dicofre_den_pop = "DistritosConcelhosFreguesias_CAOP2013_Populacao_Censos2011.xls"

# "pordata_densidade_populacional.xlsx"
# Source: https://www.pordata.pt/Subtema/Municipios/Censos+da+População-204
pordata_dens_pop = "pordata_densidade_populacional.xlsx"

In [2]:
# Load Churn OD data
churn_od = pd.read_csv(
    filepath_or_buffer = os.path.join(stage_1_dir, data_dir, churn_od_txt),
    delimiter = "|",
    encoding = 'unicode_escape'
)

# Get churn_od head for debugging purposes
churn_od.head()

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997
2,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110658,0.066059
3,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110654,0.059847
4,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110633,0.052341


In [3]:
# Load Churn Users Profile data
churn_users_pf = pd.read_csv(
    filepath_or_buffer = os.path.join(stage_1_dir, data_dir, churn_users_pf_txt),
    delimiter = '|',
    encoding = 'unicode_escape'
)


# Debugging purposes
churn_users_pf.head()

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Period,GenderDescription,AgeClassDescription,Average_BusUsers_per_Day
0,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,15-24,294.194206
1,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,25-34,1081.652817
2,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,35-44,235.836653
3,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,45-54,840.951323
4,R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,Female,55-64,1216.148092


In [4]:
# Load Parish Metadata
parish_metadata = pd.read_excel(os.path.join(stage_1_dir, data_dir, pt_parish_metadata), encoding = 'unicode_escape')
parish_metadata = parish_metadata[["distrito", "concelho", "freguesia", "dicofre"]]

# Debugging purposes
print(parish_metadata.shape)
parish_metadata.head()

(3092, 4)


Unnamed: 0,distrito,concelho,freguesia,dicofre
0,Aveiro,Águeda,Aguada de Cima,10103
1,Aveiro,Águeda,Fermentelos,10109
2,Aveiro,Águeda,Macinhata do Vouga,10112
3,Aveiro,Águeda,Valongo do Vouga,10119
4,Aveiro,Águeda,União das freguesias de Águeda e Borralha,10121


In [5]:
# Load Population Density
population_density_censos = pd.read_excel(os.path.join(stage_1_dir, data_dir, dicofre_den_pop), encoding = 'unicode_escape')

population_density_censos.rename(columns={"Freguesia (FR)":"dicofre", "PopRes_2011 (nº)":"População"}, inplace=True)
population_density_censos["dicofre"].str.strip()

# Debugging purposes
print(population_density_censos.shape)
population_density_censos.head()

(2882, 9)


Unnamed: 0,Distrito (DT),Designação DT,Concelho (CC),Designação CC,dicofre,Designação FR,População,Rural,Litorâneo
0,1,Aveiro,101,Águeda,10103,Aguada de Cima,4013,S,
1,1,Aveiro,101,Águeda,10109,Fermentelos,3258,S,
2,1,Aveiro,101,Águeda,10112,Macinhata do Vouga,3406,S,
3,1,Aveiro,101,Águeda,10119,Valongo do Vouga,4877,S,
4,1,Aveiro,101,Águeda,10121,União das freguesias de Águeda e Borralha,13576,S,


In [6]:
# Join parish metadata and this by dicofre
population_density_censos_dicofres = population_density_censos.merge(parish_metadata, on=["dicofre"], how="left")
population_density_censos_dicofres = population_density_censos_dicofres[["Designação DT", "Designação CC", "dicofre", "Designação FR", "População"]]
population_density_censos_dicofres.rename(columns={"Designação DT":"Distrito", "Designação CC":"Concelho", "Designação FR":"Freguesia"}, inplace=True)
population_density_censos_dicofres.head()

Unnamed: 0,Distrito,Concelho,dicofre,Freguesia,População
0,Aveiro,Águeda,10103,Aguada de Cima,4013
1,Aveiro,Águeda,10109,Fermentelos,3258
2,Aveiro,Águeda,10112,Macinhata do Vouga,3406
3,Aveiro,Águeda,10119,Valongo do Vouga,4877
4,Aveiro,Águeda,10121,União das freguesias de Águeda e Borralha,13576


In [7]:
# We only want the district, county and parish and the codes, along with the population density
pop_density = population_density_censos_dicofres.copy()

# Debugging purposes
pop_density.head()

Unnamed: 0,Distrito,Concelho,dicofre,Freguesia,População
0,Aveiro,Águeda,10103,Aguada de Cima,4013
1,Aveiro,Águeda,10109,Fermentelos,3258
2,Aveiro,Águeda,10112,Macinhata do Vouga,3406
3,Aveiro,Águeda,10119,Valongo do Vouga,4877
4,Aveiro,Águeda,10121,União das freguesias de Águeda e Borralha,13576


In [8]:
# Population density by district and county
# District
pop_density_district = pop_density.copy().groupby("Distrito").sum()

# Debugging purposes
pop_density_district.head()

Unnamed: 0_level_0,População
Distrito,Unnamed: 1_level_1
Aveiro,714197
Beja,152758
Braga,848185
Bragança,136252
Castelo Branco,196264


In [9]:
# County
pop_density_county = pop_density.copy().groupby(["Distrito", "Concelho"]).sum()

# Debugging purposes
pop_density_county.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,População
Distrito,Concelho,Unnamed: 2_level_1
Aveiro,Albergaria-a-Velha,25252
Aveiro,Anadia,29150
Aveiro,Arouca,22359
Aveiro,Aveiro,78450
Aveiro,Castelo de Paiva,16733


In [10]:
# Rename columns to help in the join
population_density_censos_dicofres_renamed = population_density_censos_dicofres.copy()
population_density_censos_dicofres_renamed.rename(columns = {'dicofre': 'Dicofre_ParishCode_of_Public_Transportation'}, inplace=True)
population_density_censos_dicofres_renamed.head()

Unnamed: 0,Distrito,Concelho,Dicofre_ParishCode_of_Public_Transportation,Freguesia,População
0,Aveiro,Águeda,10103,Aguada de Cima,4013
1,Aveiro,Águeda,10109,Fermentelos,3258
2,Aveiro,Águeda,10112,Macinhata do Vouga,3406
3,Aveiro,Águeda,10119,Valongo do Vouga,4877
4,Aveiro,Águeda,10121,União das freguesias de Águeda e Borralha,13576


In [11]:
# Merge churn_od with pop_density_censos_renamed
churn_od_pop_density = churn_od.merge(
    population_density_censos_dicofres_renamed,
    on=["Dicofre_ParishCode_of_Public_Transportation"],
    how="left"
)

# Debugging
print(churn_od_pop_density.shape)
churn_od_pop_density.head()

(2253, 12)


Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight,Distrito,Concelho,Freguesia,População
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323,Lisboa,Lisboa,Benfica,36985
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997,Lisboa,Lisboa,São Domingos de Benfica,33043
2,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110658,0.066059,Lisboa,Lisboa,Belém,16528
3,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110654,0.059847,Lisboa,Lisboa,Alvalade,31813
4,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110633,0.052341,Lisboa,Lisboa,Olivais,33788


In [15]:
# Get AvgUsersPerDay by District and County of Origin
churn_users_grouped  = churn_users_pf.copy()[["Region_of_Origin", "District_of_Origin", "County_of_Origin", "Period", "Average_BusUsers_per_Day"]]
churn_users_grouped = churn_users_grouped.groupby(["Region_of_Origin", "District_of_Origin", "County_of_Origin", "Period"]).sum() 
churn_users_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Average_BusUsers_per_Day
Region_of_Origin,District_of_Origin,County_of_Origin,Period,Unnamed: 4_level_1
R1 - AM Lisboa,Lisboa,Amadora,Sep-19 to Feb-20,8474.874368
R1 - AM Lisboa,Lisboa,Amadora,Sep-20 to Jan-21,6985.896633
R1 - AM Lisboa,Lisboa,Cascais,Sep-19 to Feb-20,13023.810109
R1 - AM Lisboa,Lisboa,Cascais,Sep-20 to Jan-21,6693.154115
R1 - AM Lisboa,Lisboa,Lisboa,Sep-19 to Feb-20,27874.289765


In [16]:
# Get AvgUsersPerDay by District of Origin
churn_users_grouped_district  = churn_users_pf.copy()[["Region_of_Origin", "District_of_Origin", "Period", "Average_BusUsers_per_Day"]]
churn_users_grouped_district = churn_users_grouped.groupby(["Region_of_Origin", "District_of_Origin", "Period"]).sum() 
churn_users_grouped_district.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Average_BusUsers_per_Day
Region_of_Origin,District_of_Origin,Period,Unnamed: 3_level_1
R1 - AM Lisboa,Lisboa,Sep-19 to Feb-20,122085.933172
R1 - AM Lisboa,Lisboa,Sep-20 to Jan-21,81057.911693
R1 - AM Lisboa,SetÃºbal,Sep-19 to Feb-20,41852.012153
R1 - AM Lisboa,SetÃºbal,Sep-20 to Jan-21,33726.760091
R2 - AM Porto,Aveiro,Sep-19 to Feb-20,6041.956609
