## Merge Datasets Playbook 


In [1]:
import pandas as pd
import pyarrow as pa
import re

## Social Media Dataset

In [13]:

df_sm = pd.read_csv('social_media-ww-yearly-2009-2023.csv')
df_sm = df_sm.rename(columns={'Date': 'Year'})
df_sm

Unnamed: 0,Year,Facebook,Pinterest,Twitter,StumbleUpon,YouTube,Tumblr,reddit,Instagram,VKontakte,...,MySpace,Fark,NowPublic,iWiW,news.ycombinator.com,Delicious,orkut,Odnoklassniki,Vimeo,Other
0,2009,48.14,0.0,7.22,22.4,4.09,0.0,5.76,0.0,0.0,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
1,2010,59.58,0.0,6.35,18.7,6.62,0.0,3.26,0.0,0.0,...,1.25,0.24,0.59,0.49,0.09,0.26,0.25,0.0,0.11,0.45
2,2011,65.83,0.0,4.41,20.75,5.12,0.0,2.3,0.0,0.16,...,0.05,0.12,0.21,0.04,0.06,0.08,0.06,0.01,0.02,0.18
3,2012,65.33,7.18,6.55,8.8,7.54,0.0,3.21,0.0,0.35,...,0.01,0.1,0.17,0.01,0.07,0.03,0.02,0.04,0.03,0.12
4,2013,66.35,10.96,9.73,4.22,2.98,2.11,2.5,0.0,0.37,...,0.0,0.08,0.08,0.0,0.01,0.01,0.01,0.1,0.02,0.04
5,2014,73.28,9.38,6.83,1.85,1.06,5.22,1.77,0.0,0.19,...,0.0,0.03,0.0,0.0,0.01,0.01,0.0,0.05,0.01,0.01
6,2015,83.5,6.57,5.32,0.84,0.11,1.93,1.21,0.0,0.12,...,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
7,2016,86.16,6.17,4.2,0.42,0.27,1.43,0.7,0.09,0.23,...,0.0,0.04,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
8,2017,83.6,7.39,4.64,0.26,1.46,0.86,0.53,0.74,0.16,...,0.0,0.03,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
9,2018,68.61,13.0,7.15,0.16,6.92,0.86,0.5,1.95,0.26,...,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.03


## Mental Health Dataset

In [3]:
xls_file = pd.ExcelFile('Mental health Depression disorder Data.xlsx')
page_list = []

#### For each page on the excel sheet:
- Load to a dataframe
- Filter out rows with year different from 'yyyy' 
- Add the dataframe to a list

In [4]:
year_regex = re.compile(r'^\d{4}$')

for page_name in xls_file.sheet_names:

    df = pd.read_excel(xls_file, page_name, engine='openpyxl')
    valid_years_mask = df['Year'].astype(str).apply(lambda x: bool(year_regex.match(x)))
    filtered_df = df[valid_years_mask]

    page_list.append(filtered_df)

#### Building the general dataset
- Merging the different dataframes from each page by the columns 'Entity', 'Year', 'Code'
- Removing columns with all values empty
- removing duplicate columns

In [5]:
merged_mh = page_list[0]
for i in range(1, len(page_list)):
    merged_mh = pd.merge(
        merged_mh,
        page_list[i],
        on=['Entity', 'Year', 'Code'],
        how='outer',
        suffixes=('_left', '_right')
    )

merged_mh = merged_mh.rename(columns={'Entity': 'Country', 'Code': 'Country Code'})
merged_mh = merged_mh.loc[:, ~merged_mh.columns.duplicated()]
merged_mh = merged_mh.dropna(axis=1, how='all')
merged_mh.to_csv('merged_health.csv', index=False)
merged_mh.head()

  key_col = Index(lvals).where(~mask_left, rvals)


Unnamed: 0,Country,Country Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%),...,50-69 years old (%),Age-standardized (%),15-49 years old (%),Prevalence in males (%),Prevalence in females (%),Population_left,"Suicide rate (deaths per 100,000 individuals)","Depressive disorder rates (number suffering per 100,000)",Population_right,Prevalence - Depressive disorders - Sex: Both - Age: All Ages (Number) (people suffering from depression)
0,Afghanistan,AFG,1990.0,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404,...,5.917752,4.071831,4.939766,3.499982,4.647815,12412000.0,10.318504,4039.755763,12412000.0,318435.81367
1,Afghanistan,AFG,1991.0,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768,...,5.927093,4.079531,4.902682,3.503947,4.655772,13299000.0,10.32701,4046.256034,13299000.0,329044.773956
2,Afghanistan,AFG,1992.0,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644,...,5.945656,4.088358,4.837097,3.508912,4.662066,14486000.0,10.271411,4053.709902,14486000.0,382544.572895
3,Afghanistan,AFG,1993.0,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738,...,5.966915,4.09619,4.813657,3.513429,4.669012,15817000.0,10.376123,4060.203474,15817000.0,440381.507393
4,Afghanistan,AFG,1994.0,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926,...,5.975907,4.099582,4.83934,3.515578,4.67305,17076000.0,10.575915,4062.290365,17076000.0,456916.645489


- Convert the resulting dataset to a parquet file

In [6]:

# Use ParquetDataset to read in all of the files as a single dataset
merged_mh.to_parquet('my_data.parquet.gzip', compression='gzip')

parquet_mh_df = pd.read_parquet('my_data.parquet.gzip')
parquet_mh_df.head()
parquet_mh_df.equals(merged_mh)

False

## Inflation Dataset


In [7]:
xls_file = pd.ExcelFile('Inflation-data.xlsx')

In [8]:
year_regex = re.compile(r'^\d{4}$')
anual_regex = re.compile(r'.*_a.*')
page_list = []

for page_name in xls_file.sheet_names:
    if(anual_regex.match(page_name)):

        df = pd.read_excel(xls_file, page_name, engine='openpyxl')
        indicator = df['Series Name'][0]
        print(indicator)
        df = df.drop(columns=['IMF Country Code','Series Name','Indicator Type' ])
        df = df[df['Country Code'].str.len() <= 3]
        df = df[df['Country Code'].str.len() > 0]
        melted_df = df.melt(id_vars=['Country Code', 'Country'], var_name='Year', value_name='Inflation')
        melted_df = melted_df.rename(columns={'Inflation': indicator})
        #melted_df = melted_df.drop(columns=['Series Name'])
        
        page_list.append(melted_df)

page_list[0].head() 

Headline Consumer Price Inflation
Energy Consumer Price Inflation
Food Consumer Price Inflation
Official Core Consumer Price Inflation
Producer Price Inflation
GDP deflator Index growth rate
Estimated Core Inflation, Year-on-Year, Annual average


Unnamed: 0,Country Code,Country,Year,Headline Consumer Price Inflation
0,ABW,Aruba,1970,
1,AFG,Afghanistan,1970,25.51
2,AGO,Angola,1970,7.97
3,ALB,Albania,1970,
4,ARE,United Arab Emirates,1970,21.984699


In [9]:
merged_inflation = page_list[0]
for i in range(1, len(page_list)):
    merged_inflation = pd.merge(
        merged_inflation,
        page_list[i],
        on=['Country', 'Year', 'Country Code'],
        how='outer',
        suffixes=('_left', '_right')
    )

merged_inflation = merged_inflation.loc[:, ~merged_inflation.columns.duplicated()]
merged_inflation = merged_inflation.dropna(axis=1, how='all')
merged_inflation.to_csv('merged_inflation.csv', index=False)
merged_inflation.head()

Unnamed: 0,Country Code,Country,Year,Headline Consumer Price Inflation,Energy Consumer Price Inflation,Food Consumer Price Inflation,Official Core Consumer Price Inflation,Producer Price Inflation,GDP deflator Index growth rate,"Estimated Core Inflation, Year-on-Year, Annual average"
0,ABW,Aruba,1970,,,,,,6.47799,
1,AFG,Afghanistan,1970,25.51,,,,,,
2,AGO,Angola,1970,7.97,,,,,,
3,ALB,Albania,1970,,,,,,,
4,ARE,United Arab Emirates,1970,21.984699,,,,,,


### Merging all Datasets

In [14]:

merged_df = pd.merge(merged_mh, merged_inflation, on=['Country', 'Country Code','Year'])
merged_df = pd.merge(merged_df,df_sm, on=['Year'])
merged_df

Unnamed: 0,Country,Country Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%),...,MySpace,Fark,NowPublic,iWiW,news.ycombinator.com,Delicious,orkut,Odnoklassniki,Vimeo,Other
0,Afghanistan,AFG,2009.0,0.164932,0.704925,0.095166,4.861533,2.543884,4.129972,0.661185,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
1,Albania,ALB,2009.0,0.198905,0.702783,0.157738,3.391018,0.498324,2.206033,1.824955,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
2,Algeria,DZA,2009.0,0.196795,0.814617,0.202695,5.037067,1.654478,3.616783,0.661914,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
3,Andorra,AND,2009.0,0.264457,0.961373,0.647337,5.283986,0.901957,3.707962,1.250261,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
4,Angola,AGO,2009.0,0.169454,0.622038,0.152570,3.274588,0.502140,4.157654,1.387104,...,4.02,0.27,0.07,1.43,0.07,0.38,0.91,0.0,0.01,1.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,Uzbekistan,UZB,2019,,,,,,,,...,0.00,0.01,0.00,0.00,0.02,0.00,0.00,0.0,0.00,0.00
2042,Vanuatu,VUT,2019,,,,,,,,...,0.00,0.01,0.00,0.00,0.02,0.00,0.00,0.0,0.00,0.00
2043,Vietnam,VNM,2019,,,,,,,,...,0.00,0.01,0.00,0.00,0.02,0.00,0.00,0.0,0.00,0.00
2044,Zambia,ZMB,2019,,,,,,,,...,0.00,0.01,0.00,0.00,0.02,0.00,0.00,0.0,0.00,0.00
