# Recap: pandas Basics

Insert one-liners to solve all 10 tasks

In [1]:
import pandas as pd

### read the CSV file `gapminder_total_fertility.csv`

In [2]:
# Load the fertility table from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='gapminder_total_fertility.csv')

### write the file to an Excel spreadsheet

In [3]:
# Export the frame to an .xlsx workbook. index=True is the pandas default, spelled
# out here because it means the row index is stored as the first sheet column.
df.to_excel(excel_writer='gapminder_total_fertility.xlsx', index=True)

### read the spreadsheet again

In [4]:
# Read the Excel workbook back in (the task asks for the spreadsheet, not the CSV).
# index_col=0 turns the first sheet column -- the row index that to_excel wrote --
# back into the index instead of loading it as an extra "Unnamed: 0" data column.
df = pd.read_excel('gapminder_total_fertility.xlsx', index_col=0)

### select all data for the year 2010

In [6]:
# All rows, restricted to the country-name column and the 2010 column.
df.loc[:, ['Total fertility rate', '2010']]

Unnamed: 0,Total fertility rate,2010
0,Abkhazia,
1,Afghanistan,5.66
2,Akrotiri and Dhekelia,
3,Albania,1.74
4,Algeria,2.82
...,...,...
255,Yugoslavia,
256,Zambia,5.81
257,Zimbabwe,3.72
258,Åland,


### select the 10 most fertile countries in 2010

In [7]:
# Drop rows with missing values, sort ascending by the 2010 value and keep the
# last ten rows, i.e. the ten highest rates with the largest one at the bottom.
(
    df[['Total fertility rate', '2010']]
    .dropna()
    .sort_values(by='2010')
    .tail(10)
)

Unnamed: 0,Total fertility rate,2010
164,Nigeria,6.02
236,Uganda,6.16
7,Angola,6.22
225,Timor-Leste,6.24
49,"Congo, Dem. Rep.",6.25
34,Burundi,6.3
41,Chad,6.6
137,Mali,6.84
208,Somalia,6.87
163,Niger,7.58


### select countries with a total fertility rate below 2.0 in 2010

In [8]:
# Filter rows with a boolean mask instead of where(...).dropna(): the comparison is
# False for NaN, so countries without a 2010 value are excluded by the mask itself,
# and no intermediate NaN-filled frame has to be built and cleaned up again.
df.loc[df['2010'] < 2, ['Total fertility rate', '2010']]

Unnamed: 0,Total fertility rate,2010
3,Albania,1.74
11,Armenia,1.55
12,Aruba,1.70
13,Australia,1.89
14,Austria,1.44
...,...,...
237,Ukraine,1.44
238,United Arab Emirates,1.87
239,United Kingdom,1.90
240,United States,1.93


### calculate the mean and standard deviation for each year

In [9]:
# Aggregate the numeric (year) columns only. The country-name column holds strings,
# and pandas >= 2.0 raises a TypeError on 'mean'/'std' for it instead of silently
# dropping it, so the non-numeric columns are excluded explicitly.
df.select_dtypes(include='number').agg(['mean', 'std'])

Unnamed: 0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
mean,6.08408,6.081244,6.084328,6.083383,6.083682,6.083433,6.079353,6.078159,6.070647,6.059701,...,2.999403,2.971642,2.943881,2.915771,2.884925,2.853433,2.823284,2.794179,2.775176,2.744523
std,0.777465,0.780857,0.775431,0.774806,0.774999,0.771939,0.776931,0.777962,0.791615,0.817953,...,1.591173,1.561309,1.531775,1.502406,1.475264,1.447739,1.418326,1.389848,1.362167,1.332827


### extract the first character as a separate column

In [10]:
# Store the first character of every country name in a new column
# (.str.get(0) is the method spelling of .str[0]), then display the frame.
df['first character'] = df['Total fertility rate'].str.get(0)
df

Unnamed: 0,Total fertility rate,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,first character
0,Abkhazia,,,,,,,,,,...,,,,,,,,,,A
1,Afghanistan,7.00,7.00,7.00,7.00,7.00,7.00,7.00,7.00,7.00,...,6.46,6.20,5.93,5.66,5.40,5.14,4.90,4.68,4.47,A
2,Akrotiri and Dhekelia,,,,,,,,,,...,,,,,,,,,,A
3,Albania,4.60,4.60,4.60,4.60,4.60,4.60,4.60,4.60,4.60,...,1.80,1.76,1.74,1.74,1.75,1.76,1.77,1.78,1.78,A
4,Algeria,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,...,2.66,2.73,2.78,2.82,2.83,2.82,2.80,2.76,2.71,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,Yugoslavia,,,,,,,,,,...,,,,,,,,,,Y
256,Zambia,6.71,6.71,6.71,6.71,6.71,6.71,6.71,6.71,6.71,...,5.91,5.88,5.85,5.81,5.77,5.73,5.69,5.64,5.59,Z
257,Zimbabwe,6.75,6.75,6.75,6.75,6.75,6.75,6.75,6.75,6.75,...,3.90,3.85,3.79,3.72,3.64,3.56,3.49,3.41,3.35,Z
258,Åland,,,,,,,,,,...,,,,,,,,,,Å


### count the number of countries for each first character

In [16]:
# Group the rows by initial letter and count the non-null country names per group.
df.groupby('first character')[['Total fertility rate']].count()

Unnamed: 0_level_0,Total fertility rate
first character,Unnamed: 1_level_1
A,16
B,19
C,24
D,4
E,9
F,7
G,16
H,5
I,9
J,4
