# More Data Wrangling 3

In this notebook I demonstrate how to import data that is recorded in two separate CSV files and combine it row-wise into a single pandas DataFrame.

In [1]:
import pandas as pd

import numpy as np

In [2]:
# Build a data frame from multiple files (row-wise)

# The dissertation data covers two years, each stored in its own csv file.
# Preview the 2020-21 file first; the frame is displayed only, not kept.

pd.read_csv(filepath_or_buffer='Dissertation Data 2020_21.csv')

Unnamed: 0,Participant Public ID,Sex,Sex-quantised,Age,Postcode,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,1634,Female,2,33,ig1 2bh,6300.0,14579.0,3,3,3,...,3,3,3,3.000000,5,5,5,5,5,5.0
1,3838,Female,2,23,N127NL,16989.0,10309.0,4,4,3,...,3,3,3,2.583333,1,9,1,1,9,4.2
2,3961,Male,1,21,rg41 1hr,32540.0,29044.0,3,3,3,...,3,3,3,3.000000,2,0,0,1,6,1.8
3,4041,Male,1,22,EN5 2PA,6697.0,6270.0,5,5,3,...,3,3,3,3.000000,1,1,2,3,0,1.4
4,3830,Female,2,34,gl22ey,19017.0,13872.0,5,5,4,...,2,2,4,2.750000,1,5,5,5,5,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,3990,Female,2,19,lu1,,,4,4,4,...,2,2,4,2.833333,10,2,10,10,10,8.4
76,3960,Female,2,19,IG1 4HS,12806.0,5345.0,4,4,4,...,5,5,1,3.750000,6,6,4,7,6,5.8
77,4000,Female,2,20,UB8 3EY,18114.0,7866.0,4,4,3,...,3,3,3,3.000000,6,7,8,9,10,8.0
78,3955,Female,2,18,DN16 2AQ,14174.0,6476.0,3,3,3,...,3,3,3,2.583333,7,9,10,10,7,8.6


In [3]:
# Preview the 2021-22 file in the same way; again the frame is displayed only.

pd.read_csv(filepath_or_buffer='Dissertation Data 2021_22.csv')

Unnamed: 0,Participant Public ID,Sex,Sex-quantised,Age,Postcode,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,2917,Male,1,45,YO15 1BA,17551.0,19572.0,5,5,5,...,2,2,4,4.750000,5,1,1,0,0,1.4
1,4095,Male,1,18,HP136TB,13718.0,10275.0,4,4,4,...,4,4,2,3.250000,10,3,10,10,10,8.6
2,4128,Female,2,19,NW9 4DW,16181.0,22880.0,3,3,2,...,3,3,3,3.500000,5,8,6,8,10,7.4
3,4136,Female,2,22,NW4 4XG,14717.0,15563.0,5,5,5,...,2,2,4,3.750000,0,1,2,2,3,1.6
4,4231,Male,1,35,en1 4nb,15675.0,6379.0,4,4,3,...,3,3,3,3.666667,10,10,8,10,8,9.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,4093,Female,2,21,NW10 0ST,2954.0,3617.0,5,5,4,...,5,5,1,2.833333,10,2,5,10,7,6.8
62,4248,Female,2,23,EN3 6SG,9085.0,14362.0,4,4,2,...,5,5,1,2.916667,8,9,10,10,8,9.0
63,4103,Female,2,18,ub5 4jy,15589.0,17009.0,4,4,3,...,4,4,2,2.750000,5,6,10,5,10,7.2
64,4240,Female,2,19,EC1R 3AY,19372.0,23163.0,5,5,5,...,1,1,5,4.000000,5,5,5,3,6,4.8


In [4]:
# To combine these row-wise we first collect the file names with the built-in glob module.
# The glob module finds all the pathnames matching a specified pattern.

from glob import glob

In [5]:
# Collect the names of every csv file matching 'Dissertation*.csv' (relative to
# the current working directory), then sort the list in place so the files are
# always processed in the same order.
diss_data = glob('Dissertation*.csv')
diss_data.sort()

diss_data

['Dissertation Data 2020_21.csv', 'Dissertation Data 2021_22.csv']

In [6]:
# Read each file named in diss_data and stack the resulting frames row-wise.
# Every frame keeps its own row labels, so the labels restart for each file.

pd.concat(map(pd.read_csv, diss_data))

Unnamed: 0,Participant Public ID,Sex,Sex-quantised,Age,Postcode,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,1634,Female,2,33,ig1 2bh,6300.0,14579.0,3,3,3,...,3,3,3,3.000000,5,5,5,5,5,5.0
1,3838,Female,2,23,N127NL,16989.0,10309.0,4,4,3,...,3,3,3,2.583333,1,9,1,1,9,4.2
2,3961,Male,1,21,rg41 1hr,32540.0,29044.0,3,3,3,...,3,3,3,3.000000,2,0,0,1,6,1.8
3,4041,Male,1,22,EN5 2PA,6697.0,6270.0,5,5,3,...,3,3,3,3.000000,1,1,2,3,0,1.4
4,3830,Female,2,34,gl22ey,19017.0,13872.0,5,5,4,...,2,2,4,2.750000,1,5,5,5,5,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,4093,Female,2,21,NW10 0ST,2954.0,3617.0,5,5,4,...,5,5,1,2.833333,10,2,5,10,7,6.8
62,4248,Female,2,23,EN3 6SG,9085.0,14362.0,4,4,2,...,5,5,1,2.916667,8,9,10,10,8,9.0
63,4103,Female,2,18,ub5 4jy,15589.0,17009.0,4,4,3,...,4,4,2,2.750000,5,6,10,5,10,7.2
64,4240,Female,2,19,EC1R 3AY,19372.0,23163.0,5,5,5,...,1,1,5,4.000000,5,5,5,3,6,4.8


In [7]:
# The above method combines the files but keeps the original index for each file,
# so row labels repeat across files.
# Passing ignore_index=True discards those labels and renumbers the combined rows
# from 0 upwards.
# The result is bound to a name so the combined DataFrame can be reused by later
# cells instead of only being displayed.

diss_combined = pd.concat(
    (pd.read_csv(file) for file in diss_data),
    ignore_index=True,
)

diss_combined

Unnamed: 0,Participant Public ID,Sex,Sex-quantised,Age,Postcode,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,1634,Female,2,33,ig1 2bh,6300.0,14579.0,3,3,3,...,3,3,3,3.000000,5,5,5,5,5,5.0
1,3838,Female,2,23,N127NL,16989.0,10309.0,4,4,3,...,3,3,3,2.583333,1,9,1,1,9,4.2
2,3961,Male,1,21,rg41 1hr,32540.0,29044.0,3,3,3,...,3,3,3,3.000000,2,0,0,1,6,1.8
3,4041,Male,1,22,EN5 2PA,6697.0,6270.0,5,5,3,...,3,3,3,3.000000,1,1,2,3,0,1.4
4,3830,Female,2,34,gl22ey,19017.0,13872.0,5,5,4,...,2,2,4,2.750000,1,5,5,5,5,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,4093,Female,2,21,NW10 0ST,2954.0,3617.0,5,5,4,...,5,5,1,2.833333,10,2,5,10,7,6.8
142,4248,Female,2,23,EN3 6SG,9085.0,14362.0,4,4,2,...,5,5,1,2.916667,8,9,10,10,8,9.0
143,4103,Female,2,18,ub5 4jy,15589.0,17009.0,4,4,3,...,4,4,2,2.750000,5,6,10,5,10,7.2
144,4240,Female,2,19,EC1R 3AY,19372.0,23163.0,5,5,5,...,1,1,5,4.000000,5,5,5,3,6,4.8
