### Importing libraries

In [1]:
# Importing libraries

# Data treatment
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Path
import sys
sys.path.append('../')

# Config
# -----------------------------------------------------------------------
# Lift the column display limit so every column of a DataFrame is shown
pd.options.display.max_columns = None

### Data loading

In [2]:
# Location of the full dataset, relative to this notebook
path = "../data/output/data_full.csv"

# Read the whole CSV into the working dataframe
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# We check that there are no duplicated rows.
# 'Unnamed: 0' is a leftover index column (presumably different on every row),
# so it is kept out of the comparison: with it included, two rows that are equal
# in every real field would still be reported as distinct.
# The list comprehension simply yields all columns if 'Unnamed: 0' is absent.
df.duplicated(subset=[col for col in df.columns if col != 'Unnamed: 0']).value_counts()

### Data cleaning

In [None]:
# Print the column names, non-null counts and dtypes of the dataframe
# to decide which columns contain relevant info
df.info()

We see the following info:

* We have 1026299 entries

* Unnamed: 0 is irrelevant since it's an index, so we can remove it.

* 'Superior Agency Code' and 'Superior Agency Name' appear to refer to the same info. The code has more non-null entries, so that is the column we want to keep. However, since the name is easier to understand in our analysis, we will replace the codes with their corresponding names, which also fills in the missing names.

* 'Agency Code' and 'Agency Name' are in the same situation as the previous pair.


#### Superior Agency

In [None]:
# Lookup table with the code that goes with every name.
# Grouping by name and counting the codes gives one row per distinct
# (name, code) pair; afterwards only the Name and Code columns are kept.
superior_pair_counts = df[['Superior Agency Code', 'Superior Agency Name']].groupby('Superior Agency Name').value_counts()
df_superior_agency = superior_pair_counts.reset_index()[['Superior Agency Name', 'Superior Agency Code']]

df_superior_agency.head()

In [None]:
# Turn the lookup table into a {code: name} dictionary used to rename the codes
superior_agency_dict = {
    code: name
    for code, name in zip(df_superior_agency['Superior Agency Code'], df_superior_agency['Superior Agency Name'])
}

superior_agency_dict

We perform the replacement

In [9]:
# Replacement: translate every code into its agency name.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df[...]`: the inplace form acts on an
# intermediate Series, raises a chained-assignment warning in recent pandas
# and leaves `df` unchanged when Copy-on-Write is active.
df['Superior Agency Code'] = df['Superior Agency Code'].replace(superior_agency_dict)

# Some rows still have a name but no code, so the dictionary could not fill them.
# The mask is computed once and reused for both sides of the assignment.
superior_missing_code = df['Superior Agency Code'].isna() & df['Superior Agency Name'].notna()

# For those rows (NaN code and non-NaN name) we copy the name over
df.loc[superior_missing_code, 'Superior Agency Code'] = df.loc[superior_missing_code, 'Superior Agency Name']

In [None]:
# Now 'Superior Agency Code' is clean: check its non-null count before renaming it
# to just 'Superior Agency' and getting rid of 'Superior Agency Name'
df.info()

In [12]:
# Keep a single readable column: the redundant name column is discarded and the
# filled-in code column takes the final label 'Superior Agency'
df.drop(columns="Superior Agency Name", inplace=True)
df.rename(columns={'Superior Agency Code': 'Superior Agency'}, inplace=True)

In [None]:
# Show one randomly chosen row to spot-check the new 'Superior Agency' column
df.sample()

#### Agency

We can do the same with Agency

In [None]:
# Lookup table with the code that goes with every name.
# Grouping by name and counting the codes gives one row per distinct
# (name, code) pair; afterwards only the Name and Code columns are kept.
agency_pair_counts = df[['Agency Code', 'Agency Name']].groupby('Agency Name').value_counts()
df_agency = agency_pair_counts.reset_index()[['Agency Name', 'Agency Code']]

df_agency.head()

In [None]:
# Turn the lookup table into a {code: name} dictionary used to rename the codes
agency_dict = {
    code: name
    for code, name in zip(df_agency['Agency Code'], df_agency['Agency Name'])
}

agency_dict

In [None]:
# Replacement: translate every code into its agency name.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df[...]`: the inplace form acts on an
# intermediate Series, raises a chained-assignment warning in recent pandas
# and leaves `df` unchanged when Copy-on-Write is active.
df['Agency Code'] = df['Agency Code'].replace(agency_dict)

# Some rows still have a name but no code, so the dictionary could not fill them.
# The mask is computed once and reused for both sides of the assignment.
agency_missing_code = df['Agency Code'].isna() & df['Agency Name'].notna()

# For those rows (NaN code and non-NaN name) we copy the name over
df.loc[agency_missing_code, 'Agency Code'] = df.loc[agency_missing_code, 'Agency Name']

In [None]:
# Now 'Agency Code' is clean: check its non-null count before renaming it
# to just 'Agency' and getting rid of 'Agency Name'
df.info()

In [19]:
# Keep a single readable column: the redundant name column is discarded and the
# filled-in code column takes the final label 'Agency'
df.drop(columns="Agency Name", inplace=True)
df.rename(columns={'Agency Code': 'Agency'}, inplace=True)

In [None]:
# Show one randomly chosen row to spot-check the new 'Agency' column
df.sample()

#### Managing unit

We can do the same with Managing Unit

In [None]:
# Lookup table with the code that goes with every name.
# Grouping by name and counting the codes gives one row per distinct
# (name, code) pair; afterwards only the Name and Code columns are kept.
mu_pair_counts = df[['Managing Unit Code', 'Managing Unit Name']].groupby('Managing Unit Name').value_counts()
df_MU = mu_pair_counts.reset_index()[['Managing Unit Name', 'Managing Unit Code']]

df_MU.head()

In [None]:
# Turn the lookup table into a {code: name} dictionary used to rename the codes
mu_dict = {
    code: name
    for code, name in zip(df_MU['Managing Unit Code'], df_MU['Managing Unit Name'])
}

mu_dict

In [None]:
# Replacement: translate every code into its managing unit name.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df[...]`: the inplace form acts on an
# intermediate Series, raises a chained-assignment warning in recent pandas
# and leaves `df` unchanged when Copy-on-Write is active.
df['Managing Unit Code'] = df['Managing Unit Code'].replace(mu_dict)

# Some rows still have a name but no code, so the dictionary could not fill them.
# The mask is computed once and reused for both sides of the assignment.
mu_missing_code = df['Managing Unit Code'].isna() & df['Managing Unit Name'].notna()

# For those rows (NaN code and non-NaN name) we copy the name over
df.loc[mu_missing_code, 'Managing Unit Code'] = df.loc[mu_missing_code, 'Managing Unit Name']

In [None]:
# Now 'Managing Unit Code' is clean: check its non-null count before renaming it
# to just 'Managing Unit' and getting rid of 'Managing Unit Name'
df.info()

In [26]:
# Give the cleaned column its final label and drop the redundant name column.
# The rename key has to be the full label 'Managing Unit Code': rename() skips
# labels it cannot find, so a key that matches no column leaves the dataframe
# unrenamed without raising any error.
df.rename(columns={'Managing Unit Code': 'Managing Unit'}, inplace=True)
df.drop(columns=["Managing Unit Name"], inplace = True)

In [None]:
# Show one randomly chosen row to spot-check the managing unit column
df.sample()

In [None]:
# Overview of the columns, non-null counts and dtypes once the code/name pairs have been merged
df.info()

---

Now it's time to convert the following columns to their proper types:

* Updated Budgeted Amount (numeric)

* Posted Amount (numeric)

* Actual Amount (numeric)

* Realization Percentage (numeric)

* Posting Date (datetime)

In [29]:
categories = ['Updated Budgeted Amount', 'Posted Amount', 'Actual Amount', 'Realization Percentage', 'Posting Date']

for cat in categories:

    if cat != 'Posting Date':
        # Numeric columns are stored as text with a decimal comma: swap the comma
        # for a dot, cast to float and turn exact zeros into NaN
        decimal_text = df[cat].str.replace(',', '.')
        df[cat] = decimal_text.astype(float).replace(0, np.nan)
        continue

    # The only non-numeric column: parse it as datetime, reading the day first
    df[cat] = pd.to_datetime(df[cat], dayfirst=True)

We need to address some inconveniences such as:

* Replacing commas with dots in '0,00' for proper float conversion

* Replacing 0 with NaN so that zero amounts are treated as missing values, for effective data cleaning

In [None]:
# Confirm the dtypes after the conversion (amount/percentage columns as float, 'Posting Date' as datetime)
df.info()

In [None]:
# Count missing (True) vs. present (False) values in 'Updated Budgeted Amount';
# zeros in this column were turned into NaN by the conversion step
df['Updated Budgeted Amount'].isna().value_counts()

In [None]:
for cat in categories:

    # 'Posting Date' is not numerical, so it is left out of the report
    if cat != 'Posting Date':
        # Share of rows whose value is NaN, as a percentage of all rows
        missing_rows = len(df[df[cat].isna()])
        missing_pct = missing_rows / len(df) * 100
        print(f'{cat}: {round(missing_pct, 2)}%')

In [56]:
# Now we save the dataframe.
# 'Unnamed: 0' is a leftover index column with no analytical value, so it is
# left out of the exported file; selecting the columns to write (instead of
# dropping in place) keeps `df` untouched and works whether or not the column exists.
df.to_csv(
    "../data/output/data_clean.csv",
    index=False,
    columns=[col for col in df.columns if col != 'Unnamed: 0'],
)