## Merge Datasets Playbook 


In [None]:
import pandas as pd
import pyarrow as pa
import re

## Social Media Dataset

In [None]:

# Load the social media index CSV (the path has no directory part, so it is
# resolved against the current working directory) and show its first rows.
df_sm = pd.read_csv('20230302 SocialMediaIndex.csv')
df_sm.head()

## Mental Health Dataset

In [None]:
# Open the mental-health workbook once; the sheets are read from this handle
# in the loop further down.
xls_file = pd.ExcelFile('Mental health Depression disorder Data.xlsx')
# Collects one filtered dataframe per sheet (filled by the loop further down).
page_list = []

#### For each sheet in the Excel workbook:
- Load it into a dataframe
- Keep only the rows whose 'Year' value is a four-digit year ('yyyy')
- Add the filtered dataframe to a list

In [None]:
# Pattern for a four-digit year written as text, e.g. '1990'.
year_regex = re.compile(r'^\d{4}$')


def _rows_with_valid_year(sheet_df):
    """Return the rows of *sheet_df* whose 'Year', as text, matches year_regex."""
    year_as_text = sheet_df['Year'].astype(str)
    is_valid_year = year_as_text.apply(lambda text: bool(year_regex.match(text)))
    return sheet_df[is_valid_year]


# Read every sheet of the workbook and keep only its year-filtered rows.
for sheet_name in xls_file.sheet_names:
    sheet_df = pd.read_excel(xls_file, sheet_name, engine='openpyxl')
    page_list.append(_rows_with_valid_year(sheet_df))

#### Building the general dataset
- Merge the dataframes from each sheet on the columns 'Entity', 'Year' and 'Code'
- Remove columns whose values are all empty
- Remove duplicate columns

In [None]:
# Outer-join every remaining sheet onto the first one, keyed on the shared
# identifier columns; clashing column names get '_left' / '_right' suffixes.
join_keys = ['Entity', 'Year', 'Code']
merged_df = page_list[0]
for page_df in page_list[1:]:
    merged_df = merged_df.merge(
        page_df,
        how='outer',
        on=join_keys,
        suffixes=('_left', '_right'),
    )

# Keep only the first column for each repeated label, then drop columns that
# contain no values at all.
is_first_label = ~merged_df.columns.duplicated()
merged_df = merged_df.loc[:, is_first_label].dropna(axis=1, how='all')

# Persist the result and preview it.
merged_df.to_csv('merged_dataset.csv', index=False)
merged_df.head()

- Convert the resulting dataset to a Parquet file and read it back to check that the round trip preserves the data

In [None]:

# Use ParquetDataset to read in all of the files as a single dataset
# Round trip: write the merged dataframe to a gzip-compressed Parquet file,
# read it back, and compare the result with the in-memory dataframe.
parquet_path = 'my_data.parquet.gzip'
merged_df.to_parquet(parquet_path, compression='gzip')

parquet_df = pd.read_parquet(parquet_path)
parquet_df.head()
parquet_df.equals(merged_df)

## Inflation Dataset


In [None]:
# Open the inflation workbook. This rebinds xls_file, so the name no longer
# refers to the mental-health workbook opened earlier.
xls_file = pd.ExcelFile('Inflation-data.xlsx')

In [None]:
# NOTE(review): year_regex is assigned here but not referenced in this cell.
year_regex = re.compile(r'^\d{4}$')
# Selects sheets whose name contains '_a'.
anual_regex = re.compile(r'.*_a.*')
page_list = []

for page_name in xls_file.sheet_names:
    # Guard clause: skip every sheet that the pattern does not select.
    if not anual_regex.match(page_name):
        continue

    sheet_df = pd.read_excel(xls_file, page_name, engine='openpyxl')

    # The first 'Series Name' entry becomes the name of the value column.
    indicator = sheet_df['Series Name'][0]
    print(indicator)

    sheet_df = sheet_df.drop(
        columns=['IMF Country Code', 'Series Name', 'Indicator Type']
    )

    # Keep rows whose country code is 1 to 3 characters long; rows with a
    # missing code compare as False and are dropped as well.
    code_length = sheet_df['Country Code'].str.len()
    sheet_df = sheet_df[(code_length <= 3) & (code_length > 0)]

    # Wide -> long: every remaining column becomes a 'Year' row, and the
    # value column is then renamed after the sheet's indicator.
    long_df = sheet_df.melt(
        id_vars=['Country Code', 'Country'],
        var_name='Year',
        value_name='Inflation',
    ).rename(columns={'Inflation': indicator})

    page_list.append(long_df)

page_list[0].head()

In [None]:
# Outer-join every remaining indicator table onto the first one, keyed on
# country, year and country code; clashing column names get '_left' /
# '_right' suffixes.
inflation_keys = ['Country', 'Year', 'Country Code']
merged_inflation = page_list[0]
for indicator_df in page_list[1:]:
    merged_inflation = merged_inflation.merge(
        indicator_df,
        how='outer',
        on=inflation_keys,
        suffixes=('_left', '_right'),
    )

# Keep only the first column for each repeated label, then drop columns that
# contain no values at all.
is_first_label = ~merged_inflation.columns.duplicated()
merged_inflation = merged_inflation.loc[:, is_first_label].dropna(axis=1, how='all')

# Persist the result and preview it.
merged_inflation.to_csv('merged_dataset_inflation.csv', index=False)
merged_inflation.head()