In [14]:
from google.oauth2.credentials import Credentials
import io, json, gspread, gspread_dataframe
import pandas as pd
from tqdm import tqdm
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.discovery import Resource
import requests
import numpy as np
import math, time
import settings_notebook as settings

## Get Credentials for accessing Google Drive API

In [15]:
def load_credentials_from_gdrive(token_file_id:str, scopes:list):
    def download_file_from_google_drive(id):
        URL = "https://docs.google.com/uc?export=download"
        session = requests.Session()
        response = session.get(URL, params={'id': id}, stream=True)
        return response.content

    downloaded_file = io.BytesIO(download_file_from_google_drive(token_file_id))
    token_info_json = json.load(downloaded_file)
    creds = Credentials.from_authorized_user_info(token_info_json, scopes=scopes)

    return creds

# CALL FUNCTION
creds = load_credentials_from_gdrive(settings.TOKEN_FILE_ID, settings.SCOPES)
creds

<google.oauth2.credentials.Credentials at 0x7ffb6e3e8a50>

## Cleaned Data -> Converted Data

In [17]:
import json
def get_cleaned_data():
  def download_file_from_google_drive(id):
      URL = "https://docs.google.com/uc?export=download"
      session = requests.Session()
      response = session.get(URL, params={'id': id}, stream=True)
      return response.content
  cleaned_data_dict:dict = json.loads(download_file_from_google_drive(METADATA_CLEANED_DATA))
  return cleaned_data_dict

cleaned_data_dict = get_cleaned_data()

In [18]:
def convert_cleaned_data_to_df(cleaned_data:dict):
  read_spreadsheet = lambda id, gid : f"https://docs.google.com/spreadsheets/d/{id}/export?gid={gid}&format=csv"
  
  converted_column = [
                      # "ID.PROVINCE", 
                      "Province",
                      "Indicator ID", 
                      # "Indicator Code", 
                      "2018", "2019", "2020", "2021", "2022", "2023"]
  df_list = []
  for sps_id, sps_data in tqdm(list(cleaned_data.items()), desc="Processing Google Sheets"):
    worksheet_id = sps_data.get('worksheet_id')
    df = pd.read_csv(read_spreadsheet(sps_id, worksheet_id))
    df = df[converted_column]
    if list(df.columns) != converted_column:
      raise Exception(f"Invalid Column Structure from {sps_data["sheet_name"]}")
    df_list.append(df)
  print(f"{len(df_list)} DATA CONVERTED TO PANDAS DATAFRAME")
  
  
  return df_list

# CALL FUNCTION
cleaned_data_dataframes:list[pd.DataFrame] = convert_cleaned_data_to_df(cleaned_data_dict)

Processing Google Sheets: 100%|██████████| 94/94 [02:14<00:00,  1.43s/it]

94 DATA CONVERTED TO PANDAS DATAFRAME





In [19]:
merged_cleaned_data:pd.DataFrame = pd.concat(cleaned_data_dataframes)
print(f"{len(merged_cleaned_data.Province.unique())} UNIQUE PROVINCE DETECTED")
print(f"{len(merged_cleaned_data["Indicator ID"].unique())} UNIQUE INDICATOR DETECTED")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2018"].isnull()])} NON-EMPTY VALUE DATA FROM 2018")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2019"].isnull()])} NON-EMPTY VALUE DATA FROM 2019")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2020"].isnull()])} NON-EMPTY VALUE DATA FROM 2020")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2021"].isnull()])} NON-EMPTY VALUE DATA FROM 2021")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2022"].isnull()])} NON-EMPTY VALUE DATA FROM 2022")
print(f"{len(merged_cleaned_data[~merged_cleaned_data["2023"].isnull()])} NON-EMPTY VALUE DATA FROM 2023")

35 UNIQUE PROVINCE DETECTED
89 UNIQUE INDICATOR DETECTED
203 NON-EMPTY VALUE DATA FROM 2018
404 NON-EMPTY VALUE DATA FROM 2019
2510 NON-EMPTY VALUE DATA FROM 2020
2556 NON-EMPTY VALUE DATA FROM 2021
2048 NON-EMPTY VALUE DATA FROM 2022
2542 NON-EMPTY VALUE DATA FROM 2023


In [20]:
 
melted_cleaned_data:pd.DataFrame = pd.melt(merged_cleaned_data, 
                                           id_vars=[
                                                   #  "ID.PROVINCE", 
                                                   "Province", 
                                                   "Indicator ID"], 
                                           var_name="Year", 
                                           value_name="Value").sort_values(by=[
                                                                               "Province", 
                                                                               "Indicator ID", 
                                                                               "Year"]).rename(columns={"Province":"Area",
                                                                                                        "Indicator ID":"Indicator Code"})
                                           
print(f"MELTING DATA BY YEAR FROM {len(merged_cleaned_data)} DATA to {len(melted_cleaned_data)} DATA")

MELTING DATA BY YEAR FROM 3278 DATA to 19668 DATA


In [21]:
def get_master_data(creds, master_worksheet:str): # (ID, GID)
  
  client = gspread.authorize(creds)
  
  master_area_client = client.open_by_key(settings.MASTER_AREA_SPSID)
  master_area_worksheet = master_area_client.worksheet(master_worksheet)
  try:
    master_area_df = gspread_dataframe.get_as_dataframe(worksheet=master_area_worksheet, )
  except HttpError:
    print("Quota Exceeded | Wait a minute . . .")
    master_area_df = gspread_dataframe.get_as_dataframe(worksheet=master_area_worksheet, )
  master_area_df["ID"] = master_area_df["ID"].astype("int64").astype(str).apply(lambda x : x.zfill(2))
  
  master_inc_province_client = client.open_by_key(settings.MASTER_INCOME_PROVINCE_SPSID)
  master_inc_province_worksheet = master_inc_province_client.worksheet(master_worksheet)
  try:
    master_inc_province_df = gspread_dataframe.get_as_dataframe(worksheet=master_inc_province_worksheet)
  except HttpError:
    print("Quota Exceeded | Wait a minute . . .")
    master_inc_province_df = gspread_dataframe.get_as_dataframe(worksheet=master_inc_province_worksheet)
    
  master_year_client = client.open_by_key(settings.MASTER_YEAR_SPSID)
  master_year_worksheet = master_year_client.worksheet(master_worksheet)
  master_year_df = gspread_dataframe.get_as_dataframe(worksheet=master_year_worksheet)
  master_year_df.Year = master_year_df.Year.astype(int).astype(str)
  
  master_indicator_client = client.open_by_key(settings.MASTER_INDICATOR_SPSID)
  master_indicator_worksheet = master_indicator_client.worksheet(master_worksheet)
  master_indicator_df = gspread_dataframe.get_as_dataframe(worksheet=master_indicator_worksheet, )
  
  
  return master_area_df, master_inc_province_df, master_year_df, master_indicator_df

# CALL FUNCTION
master_area_df, master_inc_province_df, master_year_df, master_indicator_df = get_master_data(creds, settings.MASTER_WORKSHEET)
print(f"ROWS OF AREA MASTER DATA : {len(master_area_df)}")
print(f"ROWS OF PROVINCE INCOME MASTER DATA : {len(master_inc_province_df)}")
print(f"ROWS OF YEAR MASTER DATA : {len(master_year_df)}")
print(f"ROWS OF INDICATOR MASTER DATA : {len(master_indicator_df)}")

ROWS OF AREA MASTER DATA : 35
ROWS OF PROVINCE INCOME MASTER DATA : 34
ROWS OF YEAR MASTER DATA : 6
ROWS OF INDICATOR MASTER DATA : 199


In [22]:
def cross_merge_master(master_area:pd.DataFrame,
                       master_year:pd.DataFrame, 
                       master_indicator:pd.DataFrame, 
                       column_to_rename:dict):

    province = master_area[["ID", "AREA_NAME"]].drop_duplicates()
    indicator = master_indicator[["Indicator_Code","Indicator_Name", "Unit"]].drop_duplicates()
    year = master_year.Year.astype(str).drop_duplicates()
    
    print(f"FOUND {len(province)} PROVINCE DATA")
    print(f"FOUND {len(indicator)} INDICATOR DATA")
    print(f"FOUND {len(year)} YEAR DATA")
    
    return province.merge(indicator, how='cross').merge(year, how='cross').rename(columns=column_to_rename)

# CALL FUNCTION
cross_merged_master = cross_merge_master(master_area_df, 
                                         master_year_df, 
                                         master_indicator_df, 
                                         {
                                            "ID":"Area Code",
                                            "Indicator_Code":"Indicator Code",
                                            "AREA_NAME":"Area",
                                            "Indicator_Name":"Indicator Name"
                                          })


FOUND 35 PROVINCE DATA
FOUND 199 INDICATOR DATA
FOUND 6 YEAR DATA


In [24]:
# VALIDATING CLEANED DATA BASED ON MASTER DATA
def validate_data_1():
    print("Ensuring cleaned data is a subset of master data . . .")
    ensure_province:bool = set(melted_cleaned_data.Area).issubset(cross_merged_master.Area) 
    ensure_year:bool = set(melted_cleaned_data.Year).issubset(cross_merged_master.Year) 
    ensure_indicator:bool = set(melted_cleaned_data["Indicator Code"]).issubset(cross_merged_master["Indicator Code"])
    if not (ensure_province and ensure_indicator and ensure_year):
        raise Exception("There('re/'s) Invalid Data from CLEANED DATA")
    
    
    
validate_data_1()

Ensuring cleaned data is a subset of master data . . .


In [25]:

merged_cleaned_and_master_data = pd.merge(cross_merged_master,
                                          melted_cleaned_data, 
                                          on=["Area", "Indicator Code", "Year"], how="left", )

merged_cleaned_and_master_data

Unnamed: 0,Area Code,Area,Indicator Code,Indicator Name,Unit,Year,Value
0,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2018,
1,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2019,
2,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2020,
3,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2021,
4,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2022,
...,...,...,...,...,...,...,...
42835,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2019,
42836,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2020,71.3
42837,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2021,72.89
42838,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2022,73.2


In [27]:
def convert_value_dataframe(cleaned_data:pd.DataFrame):
  def preprocess_value(unit, value):
    if isinstance(value, float) and math.isnan(value) : return np.nan
    # if isinstance(value, float)  : return value
    if value == "-" : return np.nan
    match unit:
        case "%":
            return str(value).replace(",", ".")
        case "Average":
            return str(value).replace(",", ".")
        case "Count":
            try:
                return int(value)
            except ValueError:
                return str(value).replace(".", "").replace(",", "")
        case "Rupiah":
            return str(value).replace("", "")
        case _:
            raise Exception(f"Invalid Unit '{unit}'")
  
  
  converted_result = cleaned_data.copy(deep=True)
  converted_result.Value = converted_result.apply(lambda row: preprocess_value(row.Unit, row.Value), axis=1).astype(float)
  
  return converted_result 

# CALL FUNCTION
converted_value_data = convert_value_dataframe(merged_cleaned_and_master_data)
# converted_value_data


In [28]:
def write_data_to_sps(creds, sps_id:str, worksheet_name:str, df:pd.DataFrame):
    client = gspread.authorize(creds)

    spreadsheet = client.open_by_key(sps_id)
    worksheet = spreadsheet.worksheet(worksheet_name)
    worksheet.clear()
    
    area_as_a_text_format = {
        "numberFormat": {
            "type": "TEXT"
        }
    }
    num_rows = len(df) + 1
    column_range = f'A2:A{num_rows}'
    worksheet.format(column_range, area_as_a_text_format)
    
    gspread_dataframe.set_with_dataframe(worksheet=worksheet, dataframe=df)
    border_style = {
        "style": "SOLID",
        "width": 1,
    }

    formatting = {
        "borders": {
          "top": border_style,
          "bottom": border_style,
          "left": border_style,
          "right": border_style
        }
    }
    num_rows = len(df) + 1
    num_cols = len(df.columns)
    range_to_border = f"A1:{gspread.utils.rowcol_to_a1(num_rows, num_cols)}"
    worksheet.format(range_to_border, formatting)
    
    bold_format = {
        "textFormat":{
            "bold":True
        }
    }
    range_to_bold = f"A1:{gspread.utils.rowcol_to_a1(1, len(df.columns))}"
    worksheet.format(range_to_bold, bold_format)
    
    requests = [
        {
            "autoResizeDimensions": {
                "dimensions": {
                    "sheetId": worksheet.id, # The numeric ID of the worksheet
                    "dimension": "COLUMNS",
                    "startIndex": 0,
                    "endIndex": len(df.columns)
                }
            }
        }
    ]

    # Send the batch update request to the spreadsheet
    spreadsheet.batch_update(body={'requests': requests})
    
    return df

# CALL FUNCTION
converted_data_from_sps = write_data_to_sps(creds, settings.MERGED_DATA_SPS_ID, "main",converted_value_data )
converted_data_from_sps

Unnamed: 0,Area Code,Area,Indicator Code,Indicator Name,Unit,Year,Value
0,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2018,
1,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2019,
2,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2020,
3,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2021,
4,11,Aceh,TEL.EXP.HH.PROV.CONSUM.TEL,Average Monthly Household Consumption for Tele...,Rupiah,2022,
...,...,...,...,...,...,...,...
42835,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2019,
42836,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2020,71.30
42837,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2021,72.89
42838,00,Indonesia,TEL.OWN.MOB.URB.RES.PRIV,Percentage of Households Owning/Controlling Mo...,%,2022,73.20


## Converted Data -> Data Warehouse

In [29]:
def get_dataframe_from_sheet(creds, sps_id:str, worksheet_name:str):
    client = gspread.authorize(creds)

    spreadsheet = client.open_by_key(sps_id)
    worksheet = spreadsheet.worksheet(worksheet_name)
    df = gspread_dataframe.get_as_dataframe(worksheet=worksheet)
    
    return df

#CALL FUNCTION
converted_and_merged_data = get_dataframe_from_sheet(creds, settings.MERGED_DATA_SPS_ID, "main")
dim_location = get_dataframe_from_sheet(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_location").drop(columns="id")
dim_indicator = get_dataframe_from_sheet(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_indicator").drop(columns="id")
dim_year = get_dataframe_from_sheet(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_year").drop(columns="id")


In [30]:
dim_location_column = [
    # "id",
    "area_code",
    "area_name",
    "area_type",
    "region_code",
    "region_name",
    "income_level_code",
    "income_level_name"
]

dim_indicator_column = [
    # "id",
    "indicator_code",
    "indicator_name",
    "theme_name",
    "technology_code",
    "technology_name",
    "category_code",
    "category_name",
    "unit",
    "new_category_code",
    "new_category_name"
]

dim_year_column = [
    # "id",
    "year",
    "note"
]

fact_it_eco_column = [
    "dim_year_id",
    "dim_indicator_id",
    "dim_location_id",
    "value"
]

default_null_value = "--EMPTY VALUE DATA--"

In [31]:
dim_year.columns = dim_year_column
dim_indicator.columns = dim_indicator_column
dim_location.columns = dim_location_column

In [32]:
# Handle dim_year

dim_year[["year"]] = dim_year[["year"]].astype(int).astype(str)



# Implementation of SCD TYPE 1
rename_source_year = master_year_df.fillna(default_null_value).rename(columns={"Year":"year", "Notes":"note"})
drop_duplicates_source_year = rename_source_year.drop_duplicates(subset=["year"], keep="last").set_index("year")
dim_year_buskey_as_index = dim_year.set_index("year")
dim_year_buskey_as_index.update(drop_duplicates_source_year)

# ADD NEW DATA
new_year = drop_duplicates_source_year[~drop_duplicates_source_year.index.isin(dim_year_buskey_as_index.index)]
concatenated_dim_year = pd.concat([dim_year_buskey_as_index, new_year])

# WRITE TO SPS
final_dim_year = concatenated_dim_year.reset_index()
final_dim_year.insert(0, 'id', range(1, len(final_dim_year)+1))
write_data_to_sps(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_year", final_dim_year)


final_dim_year

Unnamed: 0,id,year,note
0,1,2018,--EMPTY VALUE DATA--
1,2,2019,--EMPTY VALUE DATA--
2,3,2020,--EMPTY VALUE DATA--
3,4,2021,--EMPTY VALUE DATA--
4,5,2022,--EMPTY VALUE DATA--
5,6,2023,--EMPTY VALUE DATA--


In [33]:
# HANDLE dim_location
if not set(master_inc_province_df["Provinsi"]).issubset(set(master_area_df["AREA_NAME"])):
    raise Exception("Invalid Province Name Found")

dim_location["area_code"] = dim_location["area_code"].astype(int).astype(str).apply(lambda dt: dt.zfill(2))

# RENAMED MASTER
renamed_master_inc =  master_inc_province_df.rename(columns={"Provinsi":"area_name",
                                                     "Tingkat_Pendapatan":"income_level_name",
                                                     "ID_Pendapatan":"income_level_code"})
renamed_master_area = master_area_df.rename(columns={ "ID":"area_code",
                                             "AREA_NAME":"area_name",
                                             "AREA_TYPE":"area_type",
                                             "REGION_GROUP":"region_name",
                                             "ID_REGION":"region_code"
                                             })

# MERGE AREA
merged_area =  renamed_master_area.merge(renamed_master_inc,on=["area_name"], how="left")[dim_location.columns]

# Implementation of SCD TYPE 1
rename_source_area = merged_area.fillna(default_null_value)
drop_duplicates_source_area = rename_source_area.drop_duplicates(subset=["area_code"], keep="last").set_index("area_code")
dim_location_buskey_as_index = dim_location.set_index("area_code")
dim_location_buskey_as_index.update(drop_duplicates_source_area)

# ADD NEW DATA
new_location = drop_duplicates_source_area[~drop_duplicates_source_area.index.isin(dim_location_buskey_as_index.index)]
concatenated_dim_location = pd.concat([dim_location_buskey_as_index, new_location])

# WRITE TO SPS
final_dim_location = concatenated_dim_location.reset_index()
final_dim_location.insert(0, 'id', range(1, len(final_dim_location)+1))
write_data_to_sps(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_location", final_dim_location)


Unnamed: 0,id,area_code,area_name,area_type,region_code,region_name,income_level_code,income_level_name
0,1,11,Aceh,Province,SM,Sumatra,LMI,Pendapatan Menengah Bawah
1,2,12,Sumatera Utara,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas
2,3,13,Sumatera Barat,Province,SM,Sumatra,LMI,Pendapatan Menengah Bawah
3,4,14,Riau,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas
4,5,15,Jambi,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas
5,6,16,Sumatera Selatan,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas
6,7,17,Bengkulu,Province,SM,Sumatra,LMI,Pendapatan Menengah Bawah
7,8,18,Lampung,Province,SM,Sumatra,LMI,Pendapatan Menengah Bawah
8,9,19,Kepulauan Bangka Belitung,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas
9,10,21,Kepulauan Riau,Province,SM,Sumatra,UMI,Pendapatan Menengah Atas


In [34]:
# HANDLE dim_indicator
master_indicator_df

# IMPLEMENTATION OF SCD TYPE 1
rename_source_indicator = master_indicator_df.fillna(default_null_value).rename(columns={"Indicator_Code":"indicator_code",
                                                                                         "Indicator_Name":"indicator_name",
                                                                                         "Theme":"theme_name",
                                                                                         "Technology":"technology_name",
                                                                                         "Tech_ID":"technology_code",
                                                                                         "Category":"category_name",
                                                                                         "Category_ID":"category_code",
                                                                                         "Unit":"unit",
                                                                                         "Category_ID.1":"new_category_code",
                                                                                         "New_Category":"new_category_name"})[dim_indicator.columns]
drop_duplicates_source_indicator = rename_source_indicator.drop_duplicates(subset=["indicator_code"], keep="last").set_index("indicator_code")
dim_indicator_buskey_as_index = dim_indicator.set_index("indicator_code")
dim_indicator_buskey_as_index.update(drop_duplicates_source_indicator)


# ADD NEW DATA
new_indicator = drop_duplicates_source_indicator[~drop_duplicates_source_indicator.index.isin(dim_indicator_buskey_as_index.index)]
concatenated_dim_indicator = pd.concat([dim_indicator_buskey_as_index, new_indicator])

# WRITE TO SPS
final_dim_indicator = concatenated_dim_indicator.reset_index()
final_dim_indicator.insert(0, 'id', range(1, len(final_dim_indicator)+1))
write_data_to_sps(creds, settings.WAREHOUSE_DATA_SPS_ID, "dim_indicator", final_dim_indicator)

Unnamed: 0,id,indicator_code,indicator_name,theme_name,technology_code,technology_name,category_code,category_name,unit,new_category_code,new_category_name
0,1,INF.PROV.INTERNET.SIG,Number of Villages by Province and Cell Phone ...,Telecommunication Infrastructure,BTS,BTS Infrastructure,V.PROV.INTSIG,Villages (All) – Internet Signal,Count,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
1,2,INF.BTS.PROV,Number of Villages with BTS Towers by Province,Telecommunication Infrastructure,BTS,BTS Infrastructure,BTS.PROV,Villages with BTS (All),Count,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
2,3,INF.BTS.PROV.INTSIG,Number of Villages with BTS Towers and Cell Ph...,Telecommunication Infrastructure,BTS,BTS Infrastructure,BTS.PROV.INTSIG,Villages with BTS + Internet Signal (All),Count,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
3,4,INF.BTS.PROV.SIG,Number of Villages with BTS Towers and Cell Ph...,Telecommunication Infrastructure,BTS,BTS Infrastructure,BTS.PROV.SIG,Villages with BTS + Signal (All),Count,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
4,5,INF.BTS.RUR,Number of Villages with BTS Towers in Rural Ar...,Telecommunication Infrastructure,BTS,BTS Infrastructure,BTS.RUR,Villages with BTS (Rural),Count,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
...,...,...,...,...,...,...,...,...,...,...,...
220,221,TEL.OWN.MOB.URB.RES.OFFICIAL,Percentage of Households Owning/Controlling Mo...,--EMPTY VALUE DATA--,TEL,Telephone,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--,%,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
221,222,TEL.OWN.MOB.URB.RES.RENT,Percentage of Households Owning/Controlling Mo...,--EMPTY VALUE DATA--,TEL,Telephone,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--,%,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
222,223,TEL.OWN.MOB.URB.RES.OTHERS,Percentage of Households Owning/Controlling Mo...,--EMPTY VALUE DATA--,TEL,Telephone,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--,%,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--
223,224,TEL.OWN.MOB.URB.RES.FREE,Percentage of Households Owning/Controlling Mo...,--EMPTY VALUE DATA--,TEL,Telephone,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--,%,--EMPTY VALUE DATA--,--EMPTY VALUE DATA--


In [35]:
# HANDLE fact_it_eco

# RENAME SOURCE
renamed_source = converted_and_merged_data.rename(columns={"Area Code":"area_code",
                                                   "Area":"area_name",
                                                   "Indicator Code":"indicator_code",
                                                   "Indicator Name":"indicator_name",
                                                   "Unit":"unit",
                                                   "Year":"year",
                                                   "Value":"value"})


# FORMAT SOURCE
renamed_source["area_code"] = renamed_source["area_code"].astype(int).astype(str).apply(lambda x : x.zfill(2))
renamed_source["year"] = renamed_source["year"].astype(int).astype(str)

# renamed_source
enriched_df = pd.merge(renamed_source.drop(columns="area_name"), final_dim_location, on="area_code").rename(columns={"id":"dim_location_id"})
enriched_df = pd.merge(enriched_df, final_dim_year, on="year").rename(columns={"id":"dim_year_id"})
enriched_df = pd.merge(enriched_df.drop(columns=["indicator_name", "unit"]), final_dim_indicator, on="indicator_code").rename(columns={"id":"dim_indicator_id"})

fact_table = enriched_df[fact_it_eco_column]
write_data_to_sps(creds, settings.WAREHOUSE_DATA_SPS_ID, "fact_it_ecosystem", fact_table)

Unnamed: 0,dim_year_id,dim_indicator_id,dim_location_id,value
0,1,99,1,
1,2,99,1,
2,3,99,1,
3,4,99,1,
4,5,99,1,
...,...,...,...,...
42835,2,225,35,
42836,3,225,35,71.30
42837,4,225,35,72.89
42838,5,225,35,73.20
