# Dataset Mashup

In [None]:
import pandas as pd

In [None]:
# importing files
# Every processed CSV is read with ';' as the field separator.
# D1_1 -> Total mortality in prison
df_D1_1 = pd.read_csv('processed_data/D1.1.csv',sep=';')
# D1_2x -> Death with reason 1/2/3/4
df_D1_21 = pd.read_csv('processed_data/D1.21.csv',sep=';')
df_D1_22 = pd.read_csv('processed_data/D1.22.csv',sep=';')
df_D1_23 = pd.read_csv('processed_data/D1.23.csv',sep=';')
df_D1_24 = pd.read_csv('processed_data/D1.24.csv',sep=';')

# D2_1x -> People held in prison total/Male/Female
df_D2_10 = pd.read_csv('processed_data/D2.10.csv',sep=';')  #D2 is not structured correctly
df_D2_11 = pd.read_csv('processed_data/D2.11.csv',sep=';')
df_D2_12 = pd.read_csv('processed_data/D2.12.csv',sep=';')
# D2_2x -> People held unsentenced with timespan 1/2/3/4
df_D2_21 = pd.read_csv('processed_data/D2.21.csv',sep=';')
df_D2_22 = pd.read_csv('processed_data/D2.22.csv',sep=';')
df_D2_23 = pd.read_csv('processed_data/D2.23.csv',sep=';')
df_D2_24 = pd.read_csv('processed_data/D2.24.csv',sep=';')

# D3 -> Official prison capacity per 100 thousand inhabitants and the actual population of the prison
df_D3 = pd.read_csv('processed_data/D3.csv',sep=';')

# D4 -> Perceived independence of the justice system
df_D4 = pd.read_csv('processed_data/D4.csv',sep=';')

# D5 -> Corruption perception index (unit scale 0 - 100 where 0 being `highly corrupt`)
df_D5 = pd.read_csv('processed_data/D5.csv',sep=';')

# D7 ->  General Government Sector's annual expenditure on 
#        [GF03] Public order and safety 
#        [GF0301] Police services 
#        [GF0303] Law courts 
#        [GF0304] Prisons
#        [GF0305] R&D Public order and safety 
#        [GF0306] Public order and safety n.e.c.
df_D7 = pd.read_csv('processed_data/D7.csv',sep=';')

# quick visual check of one of the loaded frames
df_D1_23.head()

<hr>

## Helper Functions 

In [None]:
#defining some handy functions
def col_unique_val(df,colname):
    """Return the distinct values of one column, in order of first appearance.

    Parameters:
    df (pandas.DataFrame): The dataframe to inspect.
    colname: Label of the column whose distinct values are wanted.

    Returns:
    list: The distinct values of ``df[colname]``; an empty list when the
    dataframe has no column called ``colname``.
    """
    # A missing column yields an empty result instead of an error.
    if colname not in df.columns:
        return []
    # Only the requested column is read; drop_duplicates keeps the first
    # occurrence of each value, so the original ordering is preserved.
    return df[colname].drop_duplicates().tolist()

def row_values(df,row_index):
    """Return the values stored in the row labelled ``row_index``.

    The row is looked up by index label (``df.loc``) and its ``.values``
    attribute is returned unchanged.
    """
    return df.loc[row_index].values

In [None]:
# write the function to drop select columns from a df
def col_drop(df, years=None):
    """Return a copy of ``df`` without the columns of the unwanted years.

    Parameters:
    df (pandas.DataFrame): The dataframe to trim.
    years (iterable, optional): Year labels to remove. Defaults to
        2001-2015 plus 2021 and 2022, i.e. everything outside 2016-2020.

    Returns:
    pandas.DataFrame: A new dataframe holding only the remaining columns.
    """
    if years is None:
        years = [str(year) for year in range(2001, 2016)] + ['2021', '2022']
    unwanted = {str(year).strip() for year in years}
    # Labels are compared as stripped strings so that padded headers
    # (' 2015 ') and non-string labels (2015) are matched as well.
    keep_mask = [str(col).strip() not in unwanted for col in df.columns]
    # An explicit copy lets later in-place edits of the result run without
    # chained-assignment warnings.
    return df.loc[:, keep_mask].copy()

#function to remove space characters from your strings (useful to make correct string matches)
def spaceRemover(df):
    """Rename every column whose label contains a space character.

    Such a label is stripped of surrounding whitespace and has all inner
    spaces removed. Labels without a space are left alone, and the input
    dataframe is returned as-is when nothing needs renaming.
    """
    renames = {
        label: label.strip().replace(" ", "")
        for label in df.columns
        if ' ' in label
    }
    if not renames:
        return df
    return df.rename(columns=renames)

In [None]:
#function to replace values of a column in a df using a dictionary for new values
def replaceValue(df,col,mapping_dict):
    """Replace the values of one column using a lookup dictionary.

    Parameters:
    df (pandas.DataFrame): The source dataframe (left untouched).
    col: Label of the column whose values are translated.
    mapping_dict (dict): Maps old values to new ones; values that are not
        keys of the dictionary are kept unchanged.

    Returns:
    pandas.DataFrame: A copy of ``df`` with ``col`` translated.

    Raises:
    KeyError: If ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        raise KeyError(f"column {col!r} not found in dataframe")
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never modified behind its back.
    result = df.copy()
    # Only the requested column is touched; every other column keeps its
    # values even when they happen to equal a key of the mapping.
    result[col] = result[col].map(lambda value: mapping_dict.get(value, value))
    return result

#the function to remove unwanted countries from the dataset.
def countryRem(df,cntryList,col):
    """Keep only the rows whose ``col`` value is one of the wanted countries.

    Parameters:
    df (pandas.DataFrame): The dataframe to filter.
    cntryList (list): Country names to keep.
    col: Label of the column holding the country name.

    Returns:
    pandas.DataFrame: The matching rows, grouped by country in sorted
    order (original row order is kept inside each country), with a fresh
    0..n-1 index. When nothing matches, the result is empty but still has
    all the columns of ``df``.
    """
    wanted_rows = df[df[col].isin(cntryList)]
    # A stable sort reproduces the "one country after the other" layout
    # without concatenating one group at a time.
    ordered = wanted_rows.sort_values(col, kind='mergesort')
    return ordered.reset_index(drop=True)

<hr>

## Data Filtering and Cleaning

At this stage, we set the timespan we are going to be working with ->  `2016 - 2020` <br>
We also define the countries we are interested in for each sub-region of Europe; for the sake of simplicity, let's pick 5 countries from each sub-region. Please note that the allocation of a country to a sub-region is `purely on the basis of its geography`.

In [None]:
# dropping the non-required years for all the dfs
# D1: mortality frames
df_D1_1, df_D1_21, df_D1_22, df_D1_23, df_D1_24 = [
    col_drop(frame)
    for frame in (df_D1_1, df_D1_21, df_D1_22, df_D1_23, df_D1_24)
]

# D2: persons-held frames
df_D2_10, df_D2_11, df_D2_12, df_D2_21, df_D2_22, df_D2_23, df_D2_24 = [
    col_drop(frame)
    for frame in (df_D2_10, df_D2_11, df_D2_12,
                  df_D2_21, df_D2_22, df_D2_23, df_D2_24)
]

# D3, D4, D5 and D7
df_D3, df_D4, df_D5, df_D7 = [
    col_drop(frame) for frame in (df_D3, df_D4, df_D5, df_D7)
]

df_D2_21.head()

**Let's reduce the number of countries we are working with. Here is the list of 5 countries from each region of Europe which we have chosen to include in our project.**

> **North_E   &#8594;** The selected countries have data in all the datasets<br>
    - Estonia <br>
    - Sweden <br>
    - Finland <br>
    - Denmark <br>
    - Ireland <br>
    
> **East_E   &#8594;** The selected countries have data in all the datasets<br>
    - Bulgaria <br>
    - Hungary <br>
    - Poland <br>
    - Romania <br>
    - Slovakia <br>
    
> **South_E   &#8594;** The selected countries have data in all the datasets<br>
    - Slovenia <br>
    - Greece <br>
    - Italy <br>
    - Portugal <br>
    - Spain <br>
    
> **West_E   &#8594;**  The selected countries have data in all the datasets<br>
    - Austria <br>
    - France <br> 
    - Germany <br>
    - Netherlands <br>
    - Belgium <br>
    
### We will now normalise the code of the country in our datasets.

In [None]:
# Group by the plain column label (not a one-element list) so that
# get_group() can be called with the scalar key 'FR'; a list key combined
# with a scalar lookup is deprecated in recent pandas releases.
countrygrp = df_D4.groupby('geo')
onecountrydf = countrygrp.get_group('FR')
onecountrydf

In [None]:
countryCode = {'AT':'Austria','BE':'Belgium','BG':'Bulgaria','CY':'Cyprus','CZ':'Czechia','DE':'Germany','DK':'Denmark','EE':'Estonia','EL':'Greece',
               'ES':'Spain','FI':'Finland','FR':'France','HR':'Croatia','HU':'Hungary','IE':'Ireland','IT':'Italy','LT':'Lithuania','LU':'Luxembourg',
               'LV':'Latvia','MT':'Malta','NL':'Netherlands','PL':'Poland','PT':'Portugal','RO':'Romania','SE':'Sweden','SI':'Slovenia','SK':'Slovakia',
               'UK':'United Kingdom'}

slctCountry = ['Estonia','Sweden','Finland','Denmark','Ireland',
               'Bulgaria','Hungary','Poland','Romania','Slovakia',
               'Slovenia','Greece','Italy','Portugal','Spain',
               'Austria','France','Germany','Netherlands','Belgium']

In [None]:
# Before replacing the country codes, drop the rows whose 'geo' value is
# 'EU27_2020' or 'EU28'.
non_country_codes = ['EU27_2020', 'EU28']
df_D4 = df_D4[~df_D4['geo'].isin(non_country_codes)]
df_D3

In [None]:
# translate the country codes into country names so that every dataset
# uses the same naming
df_D3, df_D4, df_D5, df_D7 = [
    replaceValue(frame, 'geo', countryCode)
    for frame in (df_D3, df_D4, df_D5, df_D7)
]

df_D3

In [None]:
#removing the unwanted countries from the datasets
# Each dfteXXX groupby object is only used to check how many of the
# selected countries survive the filter in the matching frame.
df_D1_1 = countryRem(df_D1_1,slctCountry,'Country')
dfte11 = df_D1_1.groupby('Country')

df_D1_21 = countryRem(df_D1_21,slctCountry,'Country')
dfte121 = df_D1_21.groupby('Country')   #gets 13 countries  !!!CHECK SPELLINGS, but we get everything in the d1 total?
df_D1_22 = countryRem(df_D1_22,slctCountry,'Country')
dfte122 = df_D1_22.groupby('Country')  #gets 17 countries
df_D1_23 = countryRem(df_D1_23,slctCountry,'Country')
dfte123 = df_D1_23.groupby('Country')  #gets 16 countries
df_D1_24 = countryRem(df_D1_24,slctCountry,'Country')
dfte124 = df_D1_24.groupby('Country')  #gets 16 countries

df_D2_10 = countryRem(df_D2_10,slctCountry,'Country')
dfte210 = df_D2_10.groupby('Country') #gets 20 countries because of badly structured df
df_D2_11 = countryRem(df_D2_11,slctCountry,'Country')
# this check previously grouped df_D2_12 and overwrote dfte212 by mistake
dfte211 = df_D2_11.groupby('Country') #gets 20 countries because of badly structured df
df_D2_12 = countryRem(df_D2_12,slctCountry,'Country')
dfte212 = df_D2_12.groupby('Country') #gets 20 countries because of badly structured df
df_D2_21 = countryRem(df_D2_21,slctCountry,'Country')
dfte221 = df_D2_21.groupby('Country') #gets 8 countries
df_D2_22 = countryRem(df_D2_22,slctCountry,'Country')
dfte222 = df_D2_22.groupby('Country') #gets 8 countries
df_D2_23 = countryRem(df_D2_23,slctCountry,'Country')
dfte223 = df_D2_23.groupby('Country') #gets 8 countries
# df_D2_24 was the only frame left unfiltered; treat it like its siblings
df_D2_24 = countryRem(df_D2_24,slctCountry,'Country')
dfte224 = df_D2_24.groupby('Country')


df_D3 = countryRem(df_D3,slctCountry,'geo')
df_D3 = df_D3.rename(columns={'geo': 'Country'})
dfte3 = df_D3.groupby('Country')

df_D4 = countryRem(df_D4,slctCountry,'geo')
df_D4 = df_D4.rename(columns={'geo': 'Country'})
dfte4 = df_D4.groupby('Country')

df_D5 = countryRem(df_D5,slctCountry,'geo')
df_D5 = df_D5.rename(columns={'geo': 'Country'})
dfte5 = df_D5.groupby('Country')

df_D7 = countryRem(df_D7,slctCountry,'geo')
df_D7 = df_D7.rename(columns={'geo': 'Country'})
dfte7 = df_D7.groupby('Country')

#counting the number of countries in the resulting df after filter
count = len(dfte221.groups)
print(count)

In [None]:
# Split the frames that hold several categories of values into one frame
# per category, and give every frame a common `Category` column.

# FOR D3: one frame per 'indic_cr' code, filtered to the selected countries
D3grp = df_D3.groupby('indic_cr')
df_D3_1, df_D3_2 = [
    countryRem(D3grp.get_group(code), slctCountry, 'Country')
    .rename(columns={'indic_cr': 'Category'})
    for code in ('PRIS_OFF_CAP', 'PRIS_ACT_CAP')
]
df_D3_2

# FOR D4: one frame per 'lev_per' code.
# Codes present in the data: FBAD, FGOOD, UNK, VBAD, VB_FB, VGOOD, VG_FG
# (VB_FB and VG_FG are not extracted).
D4grp = df_D4.groupby('lev_per')
df_D4_1, df_D4_2, df_D4_3, df_D4_4, df_D4_5 = [
    D4grp.get_group(code).rename(columns={'lev_per':'Category'})
    for code in ('FBAD', 'FGOOD', 'UNK', 'VBAD', 'VGOOD')
]

# FOR D5
# D5 contains only 1 category of values (NR, the corruption index of the
# country), so the column holding it is simply renamed to `Category`.
df_D5 = df_D5.rename(columns={'unit':'Category'})

# FOR D7: one frame per 'cofog99' expenditure code
D7grp = df_D7.groupby('cofog99')
df_D7_1, df_D7_2, df_D7_3, df_D7_4, df_D7_5, df_D7_6 = [
    D7grp.get_group(code).rename(columns={'cofog99':'Category'})
    for code in ('GF03', 'GF0301', 'GF0303', 'GF0304', 'GF0305', 'GF0306')
]

# FOR D2: rename `Indicator` to `Category`
df_D2_10, df_D2_11, df_D2_12 = [
    frame.rename(columns={'Indicator':'Category'})
    for frame in (df_D2_10, df_D2_11, df_D2_12)
]




In [None]:
# replacing one value of a column with another
def replace_value(df, col_name, value_str, replacement_str):
    """Return a copy of ``df`` where one value of a column is replaced.

    Parameters:
    df (pandas.DataFrame): The source dataframe (left untouched).
    col_name: Label of the column to edit.
    value_str: The value to look for in that column.
    replacement_str: The value written in its place.

    Returns:
    pandas.DataFrame: A new dataframe with the replacement applied.
    """
    # Editing a copy keeps the caller's frame intact.
    result = df.copy()
    result[col_name] = result[col_name].replace(value_str, replacement_str)
    return result

df_D1_1 = replace_value(df_D1_1,'Category','Total','Total deaths')

df_D2_10 = replace_value(df_D2_10,'Category','Persons held','Total persons held')
df_D2_11 = replace_value(df_D2_11,'Category','Persons held','Male persons held')
df_D2_12 = replace_value(df_D2_12,'Category','Persons held','Female persons held')


In [None]:
df_D1_1.head(2)
#print(df_D4_1.loc[1,'Category'])

In [None]:
col_unique_val(df_D7_1,'unit')

In [None]:
# print every column label of df_D3_1; labels containing a space are
# additionally flagged with True and their type
for column_label in df_D3_1.columns:
    print(column_label)
    if ' ' not in column_label:
        continue
    print(True)

    print(type(column_label))

<hr>

## Final Dataset description

After cleaning and filtering the source data according to the needs of the project, the results can be fetched and mashed together using the helper functions, which generate the custom dataframe(s) required for specific analyses between different variables.

Given below are the names of different columns in different datasets and what kind of values and variable they hold.

> What are the values held in D1 datasets?

| DF Name | variable| value type |
|-------------|----------|----------|
|   df_D1_1   | Total deaths                                   |   int - count   |
|   df_D1_21  | Deaths due to external causes: by accident or other causes  |   int - count   |
|   df_D1_22  | Deaths due to external causes: by intentional homicide      |   int - count   |
|   df_D1_23  | Deaths due to external causes: by suicide                   |   int - count   |
|   df_D1_24  | Deaths due to natural causes                                |   int - count   |

> What are the values held in D2 datasets?

| DF Name | variable | value type |
|-------------|----------|----------|
|   df_D2_10  | Total persons held   |   int - count    |
|   df_D2_11  | Male persons held    |   int - count    |
|   df_D2_12  | Female persons held  |   int - count    |
|   df_D2_21  | Unsentenced for less than 12 months              |   int - count    |
|   df_D2_22  | Unsentenced for less than 6 months               |   int - count    |
|   df_D2_23  | Unsentenced for more than 12 months              |   int - count    |
|   df_D2_24  | Unsentenced: Total                               |   int - count    |

> What are the values held in D3 datasets?

| DF Name | variable | value type |
|-------------|----------|----------|
|   df_D3_1  | PRIS_OFF_CAP   |   int - count/100,000 inhabitants    |
|   df_D3_2  | PRIS_ACT_CAP   |   int - count/100,000 inhabitants    |

> What are the values held in D4 datasets? <br>
This dataset gives the perception of people towards the justice system, in percentages.

| DF Name | variable | value type |
|-------------|----------|----------|
|   df_D4_1  | FBAD   |   int - percentage   |
|   df_D4_2  | FGOOD  |   int - percentage   |
|   df_D4_3  | UNK    |   int - percentage   |
|   df_D4_4  | VBAD   |   int - percentage   |
|   df_D4_5  | VGOOD  |   int - percentage   |

> What are the values held in D5 datasets?<br>
<span style='color:green'>the name NR stands for national ranking in terms of corruption of the justice system.</span>

| DF Name | variable | value type |
|-------------|----------|----------|
|   df_D5    | NR  |   int - percentage   |

>  GEO <br> VALUE/YEAR <br> <span style='color:green'>_The indicator(value) is a composite index based on a combination of surveys and assessments of corruption from 13 different sources and scores and ranks countries based on how corrupt a country’s public sector is perceived to be, with a score of 0 representing a very high level of corruption and a score of 100 representing a very clean country._ </span>

<br>

**surprise surprise ! There is no D6** 

> What are the values held in D7 datasets? <br>
<span style='color:green'>This dataset provides the government expenditure in different areas. The expenditure values can be accessed using the expenditure type code provided in the table. To know the meaning of each code, see the documentation notebook.</span>

| DF Name | variable | value type |
|-------------|----------|----------|
|   df_D7_1  | GF03     |   int - percentage of GDP   |
|   df_D7_2  | GF0301   |   int - percentage of GDP   |
|   df_D7_3  | GF0303   |   int - percentage of GDP   |
|   df_D7_4  | GF0304   |   int - percentage of GDP   |
|   df_D7_5  | GF0305   |   int - percentage of GDP   |
|   df_D7_6  | GF0306   |   int - percentage of GDP   |

<hr>

In [None]:
#defining the function for making the multilevel index
def midx_creator(country_name,country_datatype):
    """Build a two-level index pairing one country with each data type.

    Parameters:
    country_name (str): The single value of the first index level.
    country_datatype (list of str): The values of the second index level,
        one per row, in the order the rows should appear.

    Returns:
    pandas.MultiIndex: One entry ``(country_name, datatype)`` per element
    of ``country_datatype``.
    """
    n_rows = len(country_datatype)
    # Every row points at the only first-level value (code 0); the
    # second-level code is simply the row position.
    return pd.MultiIndex(levels=[[country_name], country_datatype],
                         codes=[[0] * n_rows, list(range(n_rows))])

def data_creator(df_list,country_name,country_datatype,columns=None):
    """Collect the yearly values of one country from several dataframes.

    Parameters:
    df_list (list): Dataframes having a 'Country' column, a 'Category'
        column and one column per year.
    country_name (str): The country whose rows are extracted.
    country_datatype (list of str): Categories of interest; dataframes
        whose category is not listed are skipped.
    columns (list of str, optional): Year columns to extract. Defaults to
        2016-2020.

    Returns:
    pandas.DataFrame: One row per dataframe that holds data for the
    country, indexed by ``(country_name, category)``, with ``columns`` as
    columns. The rows follow the order of ``df_list``.
    """
    if columns is None:
        columns = ['2016','2017','2018','2019','2020']
    else:
        columns = list(columns)

    data = []
    # Built while looping so the labels always line up with the rows of `data`.
    return_datatype = []
    for df in df_list:
        df = df.reset_index(drop=True)
        if df.empty:
            continue
        # The category of a frame is read from its second row; frames with a
        # single row fall back to that row instead of raising a KeyError.
        # NOTE(review): the reason for preferring row 1 over row 0 is not
        # visible here - confirm before changing it.
        label_row = 1 if len(df) > 1 else 0
        category = df.loc[label_row, 'Category']
        if category not in country_datatype:
            continue
        # The country may simply be absent from this dataset.
        if country_name not in df['Country'].values:
            continue
        return_datatype.append(category)
        # Several rows may match; only the first one is used.
        country_rows = df.loc[df['Country'] == country_name, columns]
        data.append(country_rows.values.tolist()[0])

    new_df = pd.DataFrame(index=midx_creator(country_name,return_datatype),
                          columns=columns,
                          data = data)
    return new_df

In [None]:
#writing a function to take in multiple countries as input and the list of categories we are interested in from the user.

def masher(countries,categories,df_list):
    """Stack the per-country frames of several countries into one dataframe.

    Parameters:
    countries (list of str): Countries to include, in output order.
    categories (list of str): Categories of interest, forwarded to
        ``data_creator``.
    df_list (list): Source dataframes, forwarded to ``data_creator``.

    Returns:
    pandas.DataFrame: The frames produced by ``data_creator`` for each
    country, concatenated in order; an empty dataframe when ``countries``
    is empty.
    """
    country_frames = [
        data_creator(df_list, country_name, categories)
        for country_name in countries
    ]
    # pd.concat rejects an empty list, so keep returning an empty frame.
    if not country_frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the growing result for every
    # country and avoids seeding the result with an empty frame.
    return pd.concat(country_frames)

## <span style='color:silver;'>Part I</span>

We created the dataframe for individual countries which we plan on analysing which includes all the different types of data we have from different sources.

In [None]:
#making the df_list
df_D3_1 = spaceRemover(df_D3_1)
df_D3_2 = spaceRemover(df_D3_2)

df_D4_1 = spaceRemover(df_D4_1)
df_D4_2 = spaceRemover(df_D4_2)
df_D4_3 = spaceRemover(df_D4_3)
df_D4_4 = spaceRemover(df_D4_4)
df_D4_5 = spaceRemover(df_D4_5)

df_D5 = spaceRemover(df_D5)

df_D7_1 = spaceRemover(df_D7_1)
df_D7_2 = spaceRemover(df_D7_2)
df_D7_3 = spaceRemover(df_D7_3)
df_D7_4 = spaceRemover(df_D7_4)
df_D7_5 = spaceRemover(df_D7_5)
df_D7_6 = spaceRemover(df_D7_6)

dfList = [df_D1_1,
          df_D1_21,
          df_D1_22,
          df_D1_23,
          df_D1_24,
          df_D2_10,
          df_D2_11,
          df_D2_12,
          df_D2_21,
          df_D2_22,
          df_D2_23,
          df_D2_24,
          df_D3_1,
          df_D3_2,
          df_D4_1,
          df_D4_2,
          df_D4_3,
          df_D4_4,
          df_D4_5,
          df_D5,
          df_D7_1,
          df_D7_2,
          df_D7_3,
          df_D7_4,
          df_D7_5,
          df_D7_6,
          ]
# Per-dataset lists of cleaned frames; each list is handed to masher() to
# build one mashed dataframe per dataset.
dfD1clean=[df_D1_1,df_D1_21,df_D1_22,df_D1_23,df_D1_24]
dfD2clean = [df_D2_10,df_D2_11,df_D2_12,df_D2_21,df_D2_22,df_D2_23,df_D2_24]
dfD3clean = [df_D3_1,df_D3_2,]
# NOTE(review): only df_D4_1 (FBAD) and df_D4_2 (FGOOD) are listed, although
# df_D4_3..df_D4_5 (UNK, VBAD, VGOOD) are built as well and appear in
# country_datatype - confirm whether leaving them out is intentional.
dfD4clean = [df_D4_1,df_D4_2]
dfD5clean = [df_D5]
# NOTE: despite its name, this list holds the D7 (expenditure) frames.
dfD6clean = [df_D7_1,df_D7_2,df_D7_3,df_D7_4,df_D7_5,df_D7_6,]
country_datatype = ['Total deaths',
                    'Deaths due to external causes: by accident or other causes',
                    'Deaths due to external causes: by intentional homicide',
                    'Deaths due to external causes: by suicide',
                    'Deaths due to natural causes',
                    
                    'Total persons held',
                    'Male persons held',
                    'Female persons held',
                    'Unsentenced for less than 6 months',
                    'Unsentenced for less than 12 months',
                    'Unsentenced for more than 12 months',
                    'Unsentenced: Total',
                    
                    'PRIS_OFF_CAP',
                    'PRIS_ACT_CAP',
                    
                    'FBAD',
                    'FGOOD',
                    'UNK',
                    'VBAD',
                    'VGOOD',
                    
                    'NR',
                    
                    'GF03',
                    'GF0301',
                    'GF0303',
                    'GF0304',
                    'GF0305',
                    'GF0306']

In [None]:
country_lst = ['Estonia','Sweden','Finland','Denmark','Ireland','Bulgaria','Hungary','Poland','Romania','Slovakia','Slovenia','Greece','Italy','Portugal','Spain','Austria','France','Germany','Netherlands','Belgium']
df_1 = masher(country_lst,country_datatype,dfD1clean)
df_2 = masher(country_lst,country_datatype,dfD2clean)
df_3 = masher(country_lst,country_datatype,dfD3clean)
df_4 = masher(country_lst,country_datatype,dfD4clean)
df_5 = masher(country_lst,country_datatype,dfD5clean)
df_6 = masher(country_lst,country_datatype,dfD6clean)

In [None]:
#convert datatype
def convert_Dtype(Df):
    """Return ``Df`` with every column cast to ``int``."""
    return Df.astype(int)

# A sentinel value is used to replace all the ':' strings which denote missing values in our dataset. The sentinel is chosen based on the type of
# data we are working with: -999 is distinct and not a feasible value for our variables in realistic scenarios.
sentinal_value = -999
df_1.replace(":", sentinal_value, inplace=True)

df_1 = convert_Dtype(df_1)
print(df_1.dtypes)

In [None]:
# replace the ':' missing-value markers, then cast df_2 to int
df_2.replace(":", sentinal_value, inplace=True)
df_2 = convert_Dtype(df_2)
# report the dtypes of the frame that was just converted (was df_1)
print(df_2.dtypes)
df_2

In [None]:
def check_space_chars(df):
    """
    Tell whether any string value of a dataframe starts or ends with a space.

    Parameters:
    df (pandas.DataFrame): The dataframe to check.

    Returns:
    bool: True (after printing "this is true") as soon as one string value
    with a leading or trailing space character is found, False otherwise.
    """
    def _has_edge_space(value):
        # Non-string values can never qualify.
        if not isinstance(value, str):
            return False
        return value[:1] == ' ' or value[-1:] == ' '

    for col in df.columns:
        if any(_has_edge_space(value) for value in df[col]):
            print("this is true")
            return True
    return False

#check_space_chars(df_3)
df_3_copy = df_3

def remove_space_chars(df):
    """
    Strip leading and trailing whitespace from the string values of a dataframe.

    Only object-dtype columns are processed, and inside them only values
    that are strings are stripped; any other value (numbers, None, NaN) is
    kept as it is instead of being turned into NaN.

    The dataframe is modified in place, so every other name bound to the
    same dataframe sees the change too.

    Parameters:
    df (pandas.DataFrame): The dataframe to modify.

    Returns:
    pandas.DataFrame: The same dataframe object, with its string values stripped.
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            # The .str accessor would replace every non-string cell with NaN,
            # so strip value by value and pass non-strings through.
            df[col] = df[col].map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
    return df

df_3_copy = remove_space_chars(df_3_copy)
check_space_chars(df_3_copy)

# Missing values are filled with the sentinel through fillna(); the earlier
# np.nan based call failed because numpy is never imported in this notebook.
df_3.fillna(sentinal_value, inplace=True)

#df_2 = convert_Dtype(df_2)
#print(df_1.dtypes)
df_3

In [None]:
df_4.replace(":", sentinal_value, inplace=True)
df_4 = convert_Dtype(df_4)
print(df_4.dtypes)
df_4

In [None]:
df_5.replace(":", sentinal_value, inplace=True)
df_5 = convert_Dtype(df_5)
print(df_5.dtypes)
df_5

In [None]:
print(df_6.dtypes)
print(df_6)

#df_6 = df_6.astype(float)
#print(df_6.dtypes)

## <span style='color:silver;'>Part II</span>

We will now convert the mashup dataframe into a JSON file to be used in the visualisation with `AmCharts.js`, for deployment on the web.

## JSON converter

In [None]:
import json

def df_to_json(dataframe):
    """
    Turn a Pandas DataFrame into a JSON-compatible Python object.

    The dataframe is serialised with the `table` orientation (other
    orientations such as `records`, `split` or the default `index` exist)
    and the resulting text is parsed back into Python objects.
    """
    return json.loads(dataframe.to_json(orient='table'))

In [None]:
#test the above function
# NOTE(review): `dfFINE` is not defined anywhere in the visible notebook, so
# this cell raises a NameError unless it is created elsewhere - confirm which
# mashed dataframe (df_1 ... df_6) was meant.
# testJS is the JSON text (a str) of the converted dataframe.
testJS = json.dumps(df_to_json(dfFINE), indent=4)
print(testJS)

In [None]:
#exporting the json file
# testJS is already JSON text; dumping it again would write one big escaped
# string literal, so it is parsed first and re-serialised with the wanted
# options. The file is opened in write mode so that re-running the cell
# replaces the previous export instead of appending a second document.
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(json.loads(testJS), f, ensure_ascii=False, indent=4, sort_keys=True)

In [None]:
#exporting as CSV
df_1.to_csv('D1_deaths_in_prison_clean.csv',sep=',',encoding='utf-8')
df_2.to_csv('D2_sentance_status_clean.csv',sep=',',encoding='utf-8')
df_3.to_csv('D3_prison_capacity_clean.csv',sep=',',encoding='utf-8')
df_4.to_csv('D4_perceived_independence_of_the_justice_system_clean.csv',sep=',',encoding='utf-8')
df_5.to_csv('D5_CPI_from_eurostat_clean.csv',sep=',',encoding='utf-8')
df_6.to_csv('D6_general_governments_expenditure_clean.csv',sep=',',encoding='utf-8')

## ROUGH WORK
<hr>

In [None]:
# create a sample dataframe with a multi-index
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                   'B': [10, 20, 30, 40, 50, 60, 70, 80],
                   'C': [100, 200, 300, 400, 500, 600, 700, 800]},
                  index=index)

# print the original dataframe
print(df)

# select data using levels
'''
print(df.loc[('bar', 'one')])  # select data for bar and one
print(df.loc['foo'])  # select all data for foo
print(df.loc[:, 'B'])  # select all data for B column
print(df.loc[('baz', 'two'), 'C'])  # select data for baz, two and C column
'''

midx = pd.MultiIndex(levels=[['first level values','frist_level_2'],
                            ['second level v1', 'second level v2']],
                     codes=[[0,0,1,1],  #this is first level codes 
                           [0,1,0,1]])  #this is second level codes 
df = pd.DataFrame(index=midx,
                  columns=['year1','year2','year3'],
                  data = [[1,2,3],
                          [4,5,6],
                          [1,2,3],
                          [4,5,6]])
df.loc[('first level values')]
df

df.loc[('first level values','second level v2')]
for col,value in df.loc[('first level values','second level v2')].items():
    print(col + 'has value ' + str(value))

# labels of each index level, one entry per row
index_levels = [
    ['Group A', 'Group A', 'Group B', 'Group B'],
    ['Feature 1', 'Feature 2', 'Feature 1', 'Feature 2']
]

# codes of each index level: the position of every row's label inside the
# level's list of distinct labels
index_codes = [
    [0, 0, 1, 1],
    [0, 1, 0, 1]
]

# create the multi-level index from the labels, then apply the codes.
# `MultiIndex.codes` is read-only, so set_codes() (which returns a new
# index) is used instead of assigning to the attribute.
index = pd.MultiIndex.from_tuples(list(zip(*index_levels)), names=['Group', 'Feature'])
index = index.set_codes(index_codes)

# create a dataframe with the multi-level index
df = pd.DataFrame({'Values': [1, 2, 3, 4]}, index=index)

print(df)

In [None]:

# create a sample dataframe
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c'], 'col3': [True, False, True]})
print(df)

# select specific columns from the row where col3 is True
row_values = df.loc[df['col3'] == True, ['col1', 'col2']].values.tolist()[0]
'''
we get a new DataFrame that contains only the rows where the value of col3 is True. Since there may be more than one row that satisfies the condition, 
the result is still a DataFrame, not a single row. So we specify '[0]' to get only the first row
'''

# print the row values
print(row_values)


In [None]:
# test: concatenating two frames that share the same multi-level index layout
midx = pd.MultiIndex(levels=[slctCountry,
                            ['second level v1', 'second level v2','second level v3']],
                     codes=[[0,0,0,1,1,1],
                           [0,1,2,0,1,2]])
df = pd.DataFrame(index=midx,
                  columns=['2016','2017','2018','2019','2020'],
                  data = [[1,2,3,4,5],
                          [1,22,3,4,5],
                          [1,23,3,4,5],
                          [1,2,3,4,5],
                          [1,22,3,4,5],
                          [1,23,3,4,5]])

midx2 = pd.MultiIndex(levels=[slctCountry,
                            ['second level v1', 'second level v2','second level v3']],
                     codes=[[0,0,0,1,1,1],
                           [0,1,2,0,1,2]])
# the second frame now uses its own index (midx2), which was built but unused
df2 = pd.DataFrame(index=midx2,
                  columns=['2016','2017','2018','2019','2020'],
                  data = [[1,2,3,4,5],
                          [1,22,3,4,5],
                          [1,23,3,4,5],
                          [1,2,3,4,5],
                          [1,22,3,4,5],
                          [1,23,3,4,5]])

dfmergeTest = pd.concat([df,df2])
dfmergeTest

In [None]:
# We will we merging the required values from the D7/D4 & D5 datasets.
