Pandas - DataFrames

In [87]:
import pandas as pd 
import numpy as np 

In [88]:
# One tuple per country, listed in column order. Rows get the default
# integer labels here; country names are attached as the index in a later cell.
df = pd.DataFrame(
    [
        (35.467, 1785387, 9984670, 0.913, 'America'),
        (63.951, 2833687, 640679, 0.888, 'Europe'),
        (80.940, 3874437, 357114, 0.916, 'Europe'),
        (60.665, 2167744, 301336, 0.873, 'Europe'),
        (127.061, 4602367, 377930, 0.891, 'Asia'),
        (64.511, 2950039, 242495, 0.907, 'Europe'),
        (318.523, 17348075, 9525067, 0.915, 'America'),
    ],
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [89]:
# Display the new frame; no index was supplied, so rows are labelled 0-6.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [90]:
# Replace the default 0-6 row labels with country names, one per row in order.
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [91]:
# Re-display: rows are now labelled by country name instead of 0-6.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [92]:
# Column labels, returned as an Index.
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [93]:
# Row labels (the country names), also an Index.
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [94]:
# Printed summary: non-null count and dtype per column, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [95]:
# Total number of cells: 7 rows x 5 columns = 35.
df.size

35

In [96]:
# (number of rows, number of columns).
df.shape

(7, 5)

In [97]:
# Summary statistics; only the numeric columns appear (Continent is left out).
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [98]:
# dtype of every column, as a Series indexed by column name.
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [99]:
# How many columns share each dtype.
df.dtypes.value_counts()

int64      2
float64    2
object     1
dtype: int64

Indexing, Selecting and Slicing

In [100]:
# Single-bracket access with one column label returns that column as a Series.
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [101]:
# .to_frame() wraps the Series back into a one-column DataFrame.
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [102]:
# .loc selects by label; a single row label returns that row as a Series.
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [103]:
# .iloc selects by position; -1 is the last row (United States).
df.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [104]:
# Label slices with .loc include BOTH endpoints: Italy is part of the result.
df.loc['France': 'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [105]:
# A label slice inside plain [] selects rows (not columns) and returns the same
# rows as the .loc form. Prefer .loc so it is explicit that rows are selected.
df['France': 'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [106]:
# Row slice plus a single column label -> Series.
df.loc['France': 'Italy', 'Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [107]:
# Row slice plus a list of column labels -> DataFrame.
df.loc['France': 'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [108]:
# Positional slices exclude the stop: rows 1-2 (France, Germany) of column 3 (HDI).
df.iloc[1:3, 3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [109]:
# Rows 1-2 and columns 1-2 (GDP, Surface Area), all by position.
df.iloc[1:3, 1:3]

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


Conditional Selection (Boolean Arrays)

In [110]:
# Comparing a column with a scalar yields a boolean Series, one flag per row.
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [111]:
# Use the boolean Series as a row mask in .loc; keep only the Population column.
df.loc[df['Population'] > 70, 'Population']

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

In [112]:
# Same row mask, this time keeping two columns.
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


In [113]:
# Dropping rows by label: returns a new frame without Canada and Italy.
# df itself is left untouched because the result is not assigned.
df.drop(['Canada', 'Italy']) 

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [114]:
# Dropping a column: columns= removes 'HDI' from the returned frame; df keeps it.
df.drop(columns=['HDI'])

Unnamed: 0,Population,GDP,Surface Area,Continent
Canada,35.467,1785387,9984670,America
France,63.951,2833687,640679,Europe
Germany,80.94,3874437,357114,Europe
Italy,60.665,2167744,301336,Europe
Japan,127.061,4602367,377930,Asia
United Kingdom,64.511,2950039,242495,Europe
United States,318.523,17348075,9525067,America


Operations with Series

In [115]:
# Per-column adjustment, keyed by the column label it applies to.
crisis = pd.Series({'Population': 1_000_000, 'GDP': -1_000_000})
crisis

Population    1000000
GDP          -1000000
dtype: int64

In [116]:
# The Series index is matched against the column labels, so every row gets
# +1,000,000 on Population and -1,000,000 on GDP. The result is a new frame.
df[['Population', 'GDP']] + crisis

Unnamed: 0,Population,GDP
Canada,1000035.467,785387
France,1000063.951,1833687
Germany,1000080.94,2874437
Italy,1000060.665,1167744
Japan,1000127.061,3602367
United Kingdom,1000064.511,1950039
United States,1000318.523,16348075


In [117]:
# df still holds the original values; the addition did not modify it.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


Modifying DataFrames

In [118]:

# Language per country, keyed by country name; only three countries are listed.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [119]:
# Only France, Germany and Italy have an entry.
langs

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [120]:
# Assigning a Series aligns on the index: rows with no matching label
# (Canada, Japan, United Kingdom, United States) are left without a value.
df['Language'] = langs

In [121]:
# Inspect the new Language column; the unmatched rows are missing.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [124]:
# Assigning a scalar to a column broadcasts the value to every row.
# NOTE(review): the key is lowercase 'language', so this ADDS a second column
# next to 'Language' instead of replacing its values; confirm whether
# df['Language'] was intended.
df['language'] = 'English'

In [125]:
# The frame now carries both a 'Language' and a 'language' column.
df


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,language
Canada,35.467,1785387,9984670,0.913,America,,English
France,63.951,2833687,640679,0.888,Europe,French,English
Germany,80.94,3874437,357114,0.916,Europe,German,English
Italy,60.665,2167744,301336,0.873,Europe,Italian,English
Japan,127.061,4602367,377930,0.891,Asia,,English
United Kingdom,64.511,2950039,242495,0.907,Europe,,English
United States,318.523,17348075,9525067,0.915,America,,English


In [128]:
# Renaming one column and two row labels in a single call. rename() returns
# a new frame; df is not modified because the result is not assigned.
df.rename(
    index={'United Kingdom': 'UK', 'United States': 'USA'},
    columns={'HDI': 'Human Development Index'},
)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language,language
Canada,35.467,1785387,9984670,0.913,America,,English
France,63.951,2833687,640679,0.888,Europe,French,English
Germany,80.94,3874437,357114,0.916,Europe,German,English
Italy,60.665,2167744,301336,0.873,Europe,Italian,English
Japan,127.061,4602367,377930,0.891,Asia,,English
UK,64.511,2950039,242495,0.907,Europe,,English
USA,318.523,17348075,9525067,0.915,America,,English


In [131]:
# Creating a column as a combination of other columns (element-wise GDP / Population).
df['GDP per capita'] = df['GDP'].div(df['Population'])

In [132]:
# Final frame, including the derived 'GDP per capita' column.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,language,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,German,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,Italian,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,,English,36221.712406
United Kingdom,64.511,2950039,242495,0.907,Europe,,English,45729.239975
United States,318.523,17348075,9525067,0.915,America,,English,54464.12033
