In [1]:
# Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


# ---------------------------------------------------------------------------
# Directory names (joined with os.path.join when building file paths)
# ---------------------------------------------------------------------------
# Stage, code and data directory names, in that order
stage_1_dir, code_dir, data_dir = "stage-1", "code", "data"

# ---------------------------------------------------------------------------
# Input file names
# ---------------------------------------------------------------------------
# Churn text files: OD data and users profile data
churn_od_txt, churn_users_pf_txt = "Churn_OD.txt", "Churn_UsersProfile.txt"

# Parish ("freguesias") metadata workbook
# Source: https://dados.gov.pt/pt/datasets/freguesias-de-portugal/
pt_parish_metadata = "freguesias-metadata.xlsx"

# Districts / counties / parishes workbook
# (the file name references CAOP 2013 and Censos 2011 population)
# Source: https://www.factorvirtual.com/blog/distritos-concelhos-e-freguesias-de-portugal
dicofre_den_pop = "DistritosConcelhosFreguesias_CAOP2013_Populacao_Censos2011.xls"

# PORDATA population density workbook
# Source: https://www.pordata.pt/Subtema/Municipios/Censos+da+População-204
pordata_dens_pop = "pordata_densidade_populacional.xlsx"

In [2]:
# Read the pipe-delimited Churn OD text file into a DataFrame.
# The path is relative to the notebook's current working directory.
churn_od_path = os.path.join(stage_1_dir, data_dir, churn_od_txt)
churn_od = pd.read_csv(churn_od_path, delimiter="|", encoding="unicode_escape")

# Preview the first rows (debugging aid)
churn_od.head()

FileNotFoundError: [Errno 2] No such file or directory: 'stage-1/data/Churn_OD.txt'

In [None]:
# Read the pipe-delimited Churn Users Profile file as a 2-D array of strings
churn_users_pf_raw = np.genfromtxt(
    os.path.join(stage_1_dir, data_dir, churn_users_pf_txt),
    dtype=str,
    delimiter="|",
)

# The first row supplies the column names; every later row is a record
header_row = churn_users_pf_raw[0, :]
record_rows = churn_users_pf_raw[1:, :]
churn_users_pf = pd.DataFrame(data=record_rows, columns=header_row)

print(churn_users_pf.shape)
churn_users_pf.head()

In [None]:
# Read the parish metadata workbook into a DataFrame
parish_metadata_path = os.path.join(stage_1_dir, data_dir, pt_parish_metadata)
parish_metadata = pd.read_excel(parish_metadata_path)

print(parish_metadata.shape)
parish_metadata.head()

In [None]:
# Distinct values of the 'County_of_Origin' column (first occurrence kept),
# materialised as a NumPy array of strings
counties_of_origin = np.array(
    churn_users_pf['County_of_Origin'].drop_duplicates().values,
    dtype=str,
)
print(f"Counties of Origin:\n{counties_of_origin}")
print(f"Shape of this array:\n{counties_of_origin.shape}")

# Map each county name to its position in `counties_of_origin`
counties_of_origin_dict = {
    name: index for index, name in enumerate(counties_of_origin)
}

print(f"Dictionary of Indices and Names:\n{counties_of_origin_dict}")

In [None]:
# TODO: Get the periods as well!
# Accumulate, per county of origin, the sum of 'Average_BusUsers_per_Day'.
# Layout of the result (one row per county):
#   column 0 -> county name
#   column 1 -> running total of 'Average_BusUsers_per_Day'
#   column 2 -> left at its initial 0 (not filled in this cell)
pop_dens_orig_cnty = np.zeros(shape=(counties_of_origin.shape[0], 3), dtype=object)

# Populate first column with the county names, at the row given by the mapping
for name, index in counties_of_origin_dict.items():
    pop_dens_orig_cnty[index, 0] = name

print(f"Column Names filled:\n{pop_dens_orig_cnty}")

# Walk the two relevant columns in row order and add each value to its county.
# The built-in float() is used instead of np.float: that alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
for county, avg_bus_users in zip(
    churn_users_pf['County_of_Origin'],
    churn_users_pf['Average_BusUsers_per_Day'],
):
    pop_dens_orig_cnty[counties_of_origin_dict[county], 1] += float(avg_bus_users)

print(f"Countis of Origin and their Population Densities:\n{pop_dens_orig_cnty}")