# More Data Wrangling 2

In this notebook I demonstrate how to select columns of particular data types from a dataframe using the `include` and `exclude` arguments of `select_dtypes`, and how to import only the variables of interest directly from a CSV file. I also show how converting a text column to the category data type reduces memory usage, and give examples of selecting slices of rows and columns from the output of the `describe` method.

In [1]:
import pandas as pd

import numpy as np

In [2]:
# Load the full combined 2021 & 2022 data set from the working directory.
pgg = pd.read_csv(filepath_or_buffer='PGG Combined Data 2021 & 2022.csv')

In [3]:
# Preview the first five rows to check that the data loaded as expected.
pgg.head(n=5)

Unnamed: 0,Year,Year Quant,Participant Public ID,Sex,Sex-quantised,Age,Postcode,IMD Score,Crime Rank,CFC Q1,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,2021,0,1634,Female,2,33,ig1 2bh,6300.0,14579.0,3,...,3,3,3,3.0,5,5,5,5,5,5.0
1,2021,0,3838,Female,2,23,N127NL,16989.0,10309.0,4,...,3,3,3,2.583333,1,9,1,1,9,4.2
2,2021,0,3961,Male,1,21,rg41 1hr,32540.0,29044.0,3,...,3,3,3,3.0,2,0,0,1,6,1.8
3,2021,0,4041,Male,1,22,EN5 2PA,6697.0,6270.0,5,...,3,3,3,3.0,1,1,2,3,0,1.4
4,2021,0,3830,Female,2,34,gl22ey,19017.0,13872.0,5,...,2,2,4,2.75,1,5,5,5,5,4.2


In [4]:
# List every column together with its non-null count and data type.
pgg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   146 non-null    int64  
 1   Year Quant             146 non-null    int64  
 2   Participant Public ID  146 non-null    int64  
 3   Sex                    146 non-null    object 
 4   Sex-quantised          146 non-null    int64  
 5   Age                    146 non-null    int64  
 6   Postcode               146 non-null    object 
 7   IMD Score              134 non-null    float64
 8   Crime Rank             134 non-null    float64
 9   CFC Q1                 146 non-null    int64  
 10  CFC Q1-quantised       146 non-null    int64  
 11  CFC Q2                 146 non-null    int64  
 12  CFC Q2-quantised       146 non-null    int64  
 13  CFC Q3                 146 non-null    int64  
 14  CFC Q3-quantised       146 non-null    int64  
 15  Q3 Rev

In [5]:
# Keep only the object (text) columns and preview the first rows.
pgg.select_dtypes(include='object').head()

Unnamed: 0,Sex,Postcode
0,Female,ig1 2bh
1,Female,N127NL
2,Male,rg41 1hr
3,Male,EN5 2PA
4,Female,gl22ey


In [6]:
# Keep only the numeric columns (integers and floats) and preview the first rows.
pgg.select_dtypes(include='number').head()

Unnamed: 0,Year,Year Quant,Participant Public ID,Sex-quantised,Age,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
0,2021,0,1634,2,33,6300.0,14579.0,3,3,3,...,3,3,3,3.0,5,5,5,5,5,5.0
1,2021,0,3838,2,23,16989.0,10309.0,4,4,3,...,3,3,3,2.583333,1,9,1,1,9,4.2
2,2021,0,3961,1,21,32540.0,29044.0,3,3,3,...,3,3,3,3.0,2,0,0,1,6,1.8
3,2021,0,4041,1,22,6697.0,6270.0,5,5,3,...,3,3,3,3.0,1,1,2,3,0,1.4
4,2021,0,3830,2,34,19017.0,13872.0,5,5,4,...,2,2,4,2.75,1,5,5,5,5,4.2


In [7]:
# exclude works the other way round: drop the numeric columns and keep the rest.
pgg.select_dtypes(exclude='number').head()

Unnamed: 0,Sex,Postcode
0,Female,ig1 2bh
1,Female,N127NL
2,Male,rg41 1hr
3,Male,EN5 2PA
4,Female,gl22ey


In [8]:
# Reducing the size of the data frame
#
# Start by looking at the full pgg data frame. memory_usage='deep' asks pandas
# to inspect the object columns and report the real memory they use.
pgg.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 47 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   146 non-null    int64  
 1   Year Quant             146 non-null    int64  
 2   Participant Public ID  146 non-null    int64  
 3   Sex                    146 non-null    object 
 4   Sex-quantised          146 non-null    int64  
 5   Age                    146 non-null    int64  
 6   Postcode               146 non-null    object 
 7   IMD Score              134 non-null    float64
 8   Crime Rank             134 non-null    float64
 9   CFC Q1                 146 non-null    int64  
 10  CFC Q1-quantised       146 non-null    int64  
 11  CFC Q2                 146 non-null    int64  
 12  CFC Q2-quantised       146 non-null    int64  
 13  CFC Q3                 146 non-null    int64  
 14  CFC Q3-quantised       146 non-null    int64  
 15  Q3 Rev

In [9]:
# Importing only the columns of interest keeps the data frame small.
# List the wanted column names, then hand the list to read_csv through usecols.
colspgg = [
    'PGG Mean Contribution',
    'CFC Mean Score',
    'Sex',
    'IMD Score',
]

# usecols only filters: the columns keep the order they have in the file,
# not the order of the list above.
small_pgg = pd.read_csv('PGG Combined Data 2021 & 2022.csv', usecols=colspgg)

# Fewer columns are held than in the full data frame, so memory usage drops too.
small_pgg.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Sex                    146 non-null    object 
 1   IMD Score              134 non-null    float64
 2   CFC Mean Score         146 non-null    float64
 3   PGG Mean Contribution  146 non-null    float64
dtypes: float64(3), object(1)
memory usage: 12.5 KB


In [10]:
small_pgg.head()

Unnamed: 0,Sex,IMD Score,CFC Mean Score,PGG Mean Contribution
0,Female,6300.0,3.0,5.0
1,Female,16989.0,2.583333,4.2
2,Male,32540.0,3.0,1.8
3,Male,6697.0,3.0,1.4
4,Female,19017.0,2.75,4.2


In [11]:
# Memory usage can be cut further by storing object columns as categories.
# Here the Sex column is read in as a category.
dtypes = {'Sex': 'category'}

# One call does three things: reads the file, keeps only the columns listed in
# colspgg (usecols), and converts Sex to the category data type (dtype).
smaller_pgg = pd.read_csv(
    'PGG Combined Data 2021 & 2022.csv',
    usecols=colspgg,
    dtype=dtypes,
)
smaller_pgg.info(memory_usage='deep')
# Memory usage has gone down from 12.5 KB to 3.8 KB.
# The category data type only saves memory when the number of distinct
# categories is small relative to the number of rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   Sex                    146 non-null    category
 1   IMD Score              134 non-null    float64 
 2   CFC Mean Score         146 non-null    float64 
 3   PGG Mean Contribution  146 non-null    float64 
dtypes: category(1), float64(3)
memory usage: 3.8 KB


In [12]:
# describe() returns summary statistics for the numeric columns, but the
# result might display more information than you need:

pgg.describe()

Unnamed: 0,Year,Year Quant,Participant Public ID,Sex-quantised,Age,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
count,146.0,146.0,146.0,146.0,146.0,134.0,134.0,146.0,146.0,146.0,...,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,2021.452055,0.452055,4010.054795,1.869863,22.513699,13682.141791,12477.69403,3.726027,3.726027,3.226027,...,2.972603,2.972603,3.027397,3.074201,5.856164,4.883562,6.582192,7.287671,6.842466,6.290411
std,0.499409,0.499409,272.313556,0.337612,6.696614,7449.516356,7228.431482,0.867184,0.867184,0.884897,...,0.870554,0.870554,0.870554,0.455608,3.046739,3.061463,3.139859,3.09161,3.137511,2.294711
min,2021.0,0.0,1634.0,1.0,18.0,6.0,91.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.166667,0.0,0.0,0.0,0.0,0.0,0.0
25%,2021.0,0.0,3898.25,2.0,19.0,7775.75,6338.5,3.0,3.0,3.0,...,2.0,2.0,3.0,2.770833,4.0,2.0,5.0,5.0,5.0,4.8
50%,2021.0,0.0,4017.0,2.0,20.0,12752.0,12074.0,4.0,4.0,3.0,...,3.0,3.0,3.0,3.0,5.0,5.0,7.0,9.0,7.0,6.6
75%,2022.0,1.0,4174.75,2.0,23.0,18836.0,17093.75,4.0,4.0,4.0,...,3.0,3.0,4.0,3.333333,9.75,7.0,10.0,10.0,10.0,8.0
max,2022.0,1.0,4364.0,2.0,57.0,32540.0,32190.0,5.0,5.0,5.0,...,5.0,5.0,5.0,4.75,10.0,10.0,10.0,10.0,10.0,10.0


In [13]:
# To show only the five-number summary, index the result with the loc indexer,
# slicing the row labels from 'min' through 'max' and keeping every column (:).

pgg.describe().loc['min':'max', :]


Unnamed: 0,Year,Year Quant,Participant Public ID,Sex-quantised,Age,IMD Score,Crime Rank,CFC Q1,CFC Q1-quantised,CFC Q2,...,CFC Q12,CFC Q12-quantised,Q12 Reversed,CFC Mean Score,PGG Round 1,PGG Round 2,PGG Round 3,PGG Round 4,PGG Round 5,PGG Mean Contribution
min,2021.0,0.0,1634.0,1.0,18.0,6.0,91.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.166667,0.0,0.0,0.0,0.0,0.0,0.0
25%,2021.0,0.0,3898.25,2.0,19.0,7775.75,6338.5,3.0,3.0,3.0,...,2.0,2.0,3.0,2.770833,4.0,2.0,5.0,5.0,5.0,4.8
50%,2021.0,0.0,4017.0,2.0,20.0,12752.0,12074.0,4.0,4.0,3.0,...,3.0,3.0,3.0,3.0,5.0,5.0,7.0,9.0,7.0,6.6
75%,2022.0,1.0,4174.75,2.0,23.0,18836.0,17093.75,4.0,4.0,4.0,...,3.0,3.0,4.0,3.333333,9.75,7.0,10.0,10.0,10.0,8.0
max,2022.0,1.0,4364.0,2.0,57.0,32540.0,32190.0,5.0,5.0,5.0,...,5.0,5.0,5.0,4.75,10.0,10.0,10.0,10.0,10.0,10.0


In [14]:
# Above we passed the loc indexer a slice of row labels. We can also pass it a
# slice of column labels. Label slices include both end points, so the 'max'
# row and the 'CFC Q1' column are part of the result.

pgg.describe().loc['min':'max', 'Age':'CFC Q1']

Unnamed: 0,Age,IMD Score,Crime Rank,CFC Q1
min,18.0,6.0,91.0,1.0
25%,19.0,7775.75,6338.5,3.0
50%,20.0,12752.0,12074.0,4.0
75%,23.0,18836.0,17093.75,4.0
max,57.0,32540.0,32190.0,5.0
