# Dataset Mashup

In [317]:
import pandas as pd

In [322]:
# importing files
# D1_1 -> Total mortality in prison
df_D1_1 = pd.read_csv('processed_data/D1.1.csv',sep=';')
# D1_2x -> Death with reason 1/2/3/4
df_D1_21 = pd.read_csv('processed_data/D1.21.csv',sep=';')
df_D1_22 = pd.read_csv('processed_data/D1.22.csv',sep=';')
df_D1_23 = pd.read_csv('processed_data/D1.23.csv',sep=';')
df_D1_24 = pd.read_csv('processed_data/D1.24.csv',sep=';')

# D2_1x -> People held in prison total/Male/Female
df_D2_10 = pd.read_csv('processed_data/D2.10.csv',sep=';')
df_D2_11 = pd.read_csv('processed_data/D2.11.csv',sep=';')
df_D2_12 = pd.read_csv('processed_data/D2.12.csv',sep=';')
# D2_2x -> People held unsentenced with timespan 1/2/3/4
df_D2_21 = pd.read_csv('processed_data/D2.21.csv',sep=';')
df_D2_22 = pd.read_csv('processed_data/D2.22.csv',sep=';')
df_D2_23 = pd.read_csv('processed_data/D2.23.csv',sep=';')
df_D2_24 = pd.read_csv('processed_data/D2.24.csv',sep=';')

# D3 -> Official prison capacity per 100 thousand inhabitants and the actual population of the prison
df_D3 = pd.read_csv('processed_data/D3.csv',sep=';')

# D4 -> Percieved perception of the justice system 
df_D4 = pd.read_csv('processed_data/D4.csv',sep=';')

# D5 -> Corruption perception index (unit scale 0 - 100 where 0 being `highly corrupt`)
df_D5 = pd.read_csv('processed_data/D5.csv',sep=';')

# D7 ->  General Government Sector's annual expenditure on 
#        [GF03] Public order and safety 
#        [GF0301] Police services 
#        [GF0303] Law courts 
#        [GF0304] Prisons
#        [GF0305] R&D Public order and safety 
#        [GF0306] Public order and safety n.e.c.
df_D7 = pd.read_csv('processed_data/D7.csv',sep=';')

df_D4.tail()

Unnamed: 0,lev_per,geo,2016,2017,2018,2019,2020,2021,2022
205,UNK,UK,12,13,12,11,8,:,:
206,VBAD,UK,6,5,7,9,9,:,:
207,VB_FB,UK,18,16,20,19,21,:,:
208,VGOOD,UK,18,26,20,21,17,:,:
209,VG_FG,UK,70,71,68,68,71,:,:


In [271]:
#defining some handy functions
def col_unique_val(df,colname):
    uniq_vals = []
    for index,row in df.iterrows():
        for col,value in row.items():
            if col == colname and value not in uniq_vals:
                uniq_vals.append(value)
    return uniq_vals

def row_values(df,row_index):
    values = df.loc[row_index].values
    return values

At this stage, we set the timespan we are going to be working with ->  `2016 - 2021` <br>
We also define the countries we are interested in for each sub-region of europe,for sake of simplicity, lets pick 5 countries from each sub-region. Please note that these allocation of any country to a subregions is `purely on the basis of their geography`.

In [272]:
# write the function to drop select columns from a df
def col_drop(df):
    years = ['2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2022']
    to_keep = [col for col in df.columns if col.strip() not in years] #we use the strip to remove blank spaces around the value.
    df = df[to_keep]
    return df

In [321]:
#droping the non-required years for all the dfs
df_D1_1 = col_drop(df_D1_1)
df_D1_21 = col_drop(df_D1_21)
df_D1_22 = col_drop(df_D1_22)
df_D1_23 = col_drop(df_D1_23)
df_D1_23 = col_drop(df_D1_24)

df_D2_10 = col_drop(df_D2_10)
df_D2_11 = col_drop(df_D2_11)
df_D2_12 = col_drop(df_D2_12)
df_D2_21 = col_drop(df_D2_21)
df_D2_22 = col_drop(df_D2_22)
df_D2_23 = col_drop(df_D2_23)
df_D2_24 = col_drop(df_D2_24)

df_D3 = col_drop(df_D3)
df_D4 = col_drop(df_D4)
df_D5 = col_drop(df_D5)
df_D7 = col_drop(df_D7)

df_D3.head()

Unnamed: 0,indic_cr,geo,2016,2017,2018,2019,2020
0,PRIS_ACT_CAP,AL,208.97,197.25,185.21,176.25,162.12
1,PRIS_ACT_CAP,AT,99.18,100.90,103.86,102.41,95.36
2,PRIS_ACT_CAP,BA,:,:,:,:,:
3,PRIS_ACT_CAP,BE,:,92.24,90.02,92.17,90.09
4,PRIS_ACT_CAP,BG,102.67,98.40,94.34,92.11,89.92


Let's reduce the number of countries we are working with. Here is the list of 5 countries from each region of europe which we have choosen to include in our project.

> **North_E   &#8594;** The selected countries have data in all the datasets<br>
    - Estonia <br>
    - Sweden <br>
    - Finland <br>
    - Denmark <br>
    - UK _(combined -> UK + Scotland + Wales + Ireland)_ <br>
    
> **East_E   &#8594;** The selected countries have data in all the datasets<br>
    - Bulgaria <br>
    - Hungary <br>
    - Poland <br>
    - Romania <br>
    - Slovakia <br>
    
> **South_E   &#8594;** The selected countries have data in all the datasets<br>
    - Slovenia <br>
    - Greece <br>
    - Italy <br>
    - Portugal <br>
    - Spain <br>
    
> **West_E   &#8594;**  The selected countries have data in all the datasets<br>
    - Austria <br>
    - France <br> 
    - Germany <br>
    - Netherlands <br>
    - Belgium <br>
    
We will now normalise the code of the country in our datasets.

In [274]:
countrygrp = df_D4.groupby(['geo'])
onecountrydf = countrygrp.get_group('FR')
onecountrydf

Unnamed: 0,lev_per,geo,2016,2017,2018,2019,2020,2021
91,FBAD,FR,29,26,23,23,22,21
92,FGOOD,FR,49,50,52,53,51,52
93,UNK,FR,7,10,11,10,13,14
94,VBAD,FR,10,11,8,8,9,8
95,VB_FB,FR,39,37,31,31,31,29
96,VGOOD,FR,5,3,6,6,5,5
97,VG_FG,FR,54,53,58,59,56,57


In [275]:
countryCode = {'AT':'Austria','BE':'Belgium','BG':'Bulgaria','CY':'Cyprus','CZ':'Czechia','DE':'Germany','DK':'Denmark','EE':'Estonia','EL':'Greece',
               'ES':'Spain','FI':'Finland','FR':'France','HR':'Croatia','HU':'Hungary','IE':'Ireland','IT':'Italy','LT':'Lithuania','LU':'Luxembourg',
               'LV':'Latvia','MT':'Malta','NL':'Netherlands','PL':'Poland','PT':'Portugal','RO':'Romania','SE':'Sweden','SI':'Slovenia','SK':'Slovakia',
               'UK':'United Kingdom'}

In [305]:
#before replacement, we need to drop the EU27 and EU28
# Delete rows with 'EU27_2020' and `EU28` in 'geo' column
df_D4 = df_D4[df_D4['geo'] != 'EU27_2020']
df_D4 = df_D4[df_D4['geo'] != 'EU28']
df_D4

Unnamed: 0,lev_per,geo,2016,2017,2018,2019,2020,2021
0,FBAD,AT,13,14,7,8,7,7
1,FGOOD,AT,59,58,58,65,62,62
2,UNK,AT,7,6,10,7,5,7
3,VBAD,AT,3,2,2,2,2,2
4,VB_FB,AT,16,16,9,10,9,9
...,...,...,...,...,...,...,...,...
205,UNK,UK,12,13,12,11,8,:
206,VBAD,UK,6,5,7,9,9,:
207,VB_FB,UK,18,16,20,19,21,:
208,VGOOD,UK,18,26,20,21,17,:


In [323]:
countrygrp = df_D4.groupby(['geo'])
for item in countrygrp.groups:
    print(item)
    
countrygrp.get_group('FR')

AT
BE
BG
CY
CZ
DE
DK
EE
EL
ES
EU27_2020
EU28
FI
FR
HR
HU
IE
IT
LT
LU
LV
MT
NL
PL
PT
RO
SE
SI
SK
UK


Unnamed: 0,lev_per,geo,2016,2017,2018,2019,2020,2021,2022
91,FBAD,FR,29,26,23,23,22,21,21
92,FGOOD,FR,49,50,52,53,51,52,52
93,UNK,FR,7,10,11,10,13,14,15
94,VBAD,FR,10,11,8,8,9,8,9
95,VB_FB,FR,39,37,31,31,31,29,29
96,VGOOD,FR,5,3,6,6,5,5,4
97,VG_FG,FR,54,53,58,59,56,57,56


## <span style='color:silver;'>Part I</span>

`REASON` : <br>We created the following mashup of the datasets processed earlier to analyse the **Dynamics of goverenment's expenditure on maintaining prisons**.

What are the columns we need from D4?
>  LEV_PER <br> GEO <br> VALUE/YEAR

What are the columns we need from D5?
>  GEO <br> VALUE/YEAR <br> <span style='color:green'>_The indicator(value) is a composite index based on a combination of surveys and assessments of corruption from 13 different sources and scores and ranks countries based on how corrupt a country’s public sector is perceived to be, with a score of 0 representing a very high level of corruption and a score of 100 representing a very clean country._ </span>

What are the columns we need from D7?
>  COFOG99 <br> GEO <br> UNIT <br> VALUE/YEAR

In [25]:
# We will we merging the required values from the D7/D4 & D5 datasets.
df_D5.tail()

Unnamed: 0,unit,geo,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
36,NR,SI,61,57,58,60,61,61,60,60,60,57,56
37,NR,SK,46,47,50,51,51,50,50,50,49,52,53
38,NR,TR,49,50,45,42,41,40,41,39,40,38,36
39,NR,UK,74,76,78,81,81,82,80,77,77,78,73
40,NR,US,73,73,74,76,74,75,71,69,67,67,69


## <span style='color:silver;'>Part II</span>

`REASON` : <br>We created the following mashup of the datasets processed earlier to analyse the **Dynamics of prison conditions and perception of justice**.

In [None]:
# We will we merging the required values from the D1/D2/D3/D4 & D5 datasets.








Learning how to use the multi-index dfs

In [28]:
# create a sample dataframe with a multi-index
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                   'B': [10, 20, 30, 40, 50, 60, 70, 80],
                   'C': [100, 200, 300, 400, 500, 600, 700, 800]},
                  index=index)

# print the original dataframe
print(df)

# select data using levels
'''
print(df.loc[('bar', 'one')])  # select data for bar and one
print(df.loc['foo'])  # select all data for foo
print(df.loc[:, 'B'])  # select all data for B column
print(df.loc[('baz', 'two'), 'C'])  # select data for baz, two and C column
'''

              A   B    C
first second            
bar   one     1  10  100
      two     2  20  200
baz   one     3  30  300
      two     4  40  400
foo   one     5  50  500
      two     6  60  600
qux   one     7  70  700
      two     8  80  800


"\nprint(df.loc[('bar', 'one')])  # select data for bar and one\nprint(df.loc['foo'])  # select all data for foo\nprint(df.loc[:, 'B'])  # select all data for B column\nprint(df.loc[('baz', 'two'), 'C'])  # select data for baz, two and C column\n"

In [39]:
midx = pd.MultiIndex(levels=[['first level values'],
                            ['second level v1', 'second level v2']],
                     codes=[[0,0],
                           [0,1]])
df = pd.DataFrame(index=midx,
                  columns=['year1','year2','year3'],
                  data = [[1,2,3],[4,5,6]])
df.loc[('first level values')]

Unnamed: 0,year1,year2,year3
second level v1,1,2,3
second level v2,4,5,6


In [46]:
df.loc[('first level values','second level v2')]
for col,value in df.loc[('first level values','second level v2')].items():
    print(col + 'has value ' + str(value))

year1has value 4
year2has value 5
year3has value 6
