# Pandas Dataframe

* Probably the most important data structure in pandas is the DataFrame.

* It's a tabular structure tightly integrated with Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the example frame from a dict of column name -> values, one value per
# country. Rows keep the default positional index (0..6) at this point.
# NOTE(review): France's 'Surface Area' (23487325) is far larger than every
# other entry — presumably illustrative data; verify before reusing the numbers.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [2837032, 23487325, 3651438, 2514490, 1972476, 2429036, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    # Pin the column order explicitly.
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)


In [3]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,2837032,0.913,America
1,63.951,2833687,23487325,0.888,Europe
2,80.94,3874437,3651438,0.916,Europe
3,60.665,2167744,2514490,0.873,Europe
4,127.061,4602367,1972476,0.891,Asia
5,64.511,2950039,2429036,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [4]:
# Swap the default positional index for country labels, one per row in order.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'US']

In [5]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,2837032,0.913,America
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [6]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [7]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'US'], dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to US
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [9]:
df.size

35

In [10]:
df.shape

(7, 5)

In [11]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,6630981.0,0.900429
std,97.24997,5494020.0,7874932.0,0.016592
min,35.467,1785387.0,1972476.0,0.873
25%,62.308,2500716.0,2471763.0,0.8895
50%,64.511,2950039.0,2837032.0,0.907
75%,104.0005,4238402.0,6588252.0,0.914
max,318.523,17348080.0,23487320.0,0.916


In [12]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [13]:
df.dtypes.value_counts()

float64    2
int64      2
object     1
Name: count, dtype: int64

## Indexing, Selection and Slicing

* Individual columns in the DataFrame can be selected with regular indexing.
* Each column is represented as a Series.

In [14]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,2837032,0.913,America
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [15]:
df.loc['Canada'] # selecting a row by its index label

Population       35.467
GDP             1785387
Surface Area    2837032
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [16]:
df.iloc[-1] # selecting a row by sequential (integer) position; -1 is the last row

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: US, dtype: object

In [17]:
df['Population'] # accessing entire column

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
US         318.523
Name: Population, dtype: float64

## Conditional Selection (Boolean Arrays)

* It works the same way for a DataFrame as it does for a Series.
* A DataFrame is a collection of Series.

In [18]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,2837032,0.913,America
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [19]:
df['Population'] > 50

Canada     False
France      True
Germany     True
Italy       True
Japan       True
UK          True
US          True
Name: Population, dtype: bool

In [20]:
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,3651438,0.916,Europe
Japan,127.061,4602367,1972476,0.891,Asia
US,318.523,17348075,9525067,0.915,America


In [21]:
df.loc[df['Population'] > 70 , 'Population']

Germany     80.940
Japan      127.061
US         318.523
Name: Population, dtype: float64

In [22]:
df.loc[df['Population'] > 70 , ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
US,318.523,17348075


## Dropping Stuff
* Dropping is the opposite of selection.
* Instead of pointing out which values you'd like to select, you point out which ones you'd like to drop.

In [23]:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [24]:
df.drop(columns=['Population', "GDP"])

Unnamed: 0,Surface Area,HDI,Continent
Canada,2837032,0.913,America
France,23487325,0.888,Europe
Germany,3651438,0.916,Europe
Italy,2514490,0.873,Europe
Japan,1972476,0.891,Asia
UK,2429036,0.907,Europe
US,9525067,0.915,America


## Operations

In [25]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
UK,64.511,2950039
US,318.523,17348075


In [26]:
df[['Population', 'GDP']]/100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
UK,0.64511,29500.39
US,3.18523,173480.75


In [27]:
# Per-column adjustment, keyed by the column label each value applies to.
crisis = pd.Series({'GDP': -100000, 'HDI': -0.3})

In [28]:
df[['GDP', 'HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,1685387.0,0.613
France,2733687.0,0.588
Germany,3774437.0,0.616
Italy,2067744.0,0.573
Japan,4502367.0,0.591
UK,2850039.0,0.607
US,17248075.0,0.615


## Modifying Dataframe
* simple and intuitive
* can add columns or replace values for columns without issues.

### Adding a new Column

In [29]:
# Language per country, keyed by country label; covers only three of the rows.
langs = pd.Series(
    {'France': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

In [30]:
# Assignment aligns on the index: countries missing from langs get NaN.
df['Language'] = langs

In [31]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,2837032,0.913,America,
France,63.951,2833687,23487325,0.888,Europe,French
Germany,80.94,3874437,3651438,0.916,Europe,German
Italy,60.665,2167744,2514490,0.873,Europe,Italian
Japan,127.061,4602367,1972476,0.891,Asia,
UK,64.511,2950039,2429036,0.907,Europe,
US,318.523,17348075,9525067,0.915,America,


### Replacing values per column

In [32]:
# A scalar is broadcast to every row, overwriting the whole column.
df['Language'] = 'English'

In [33]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,2837032,0.913,America,English
France,63.951,2833687,23487325,0.888,Europe,English
Germany,80.94,3874437,3651438,0.916,Europe,English
Italy,60.665,2167744,2514490,0.873,Europe,English
Japan,127.061,4602367,1972476,0.891,Asia,English
UK,64.511,2950039,2429036,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


### Renaming Columns

In [34]:
# rename() returns a new DataFrame; df itself is left unchanged.
# Mapping keys that match no existing label are ignored without raising an error.
df.rename(
    columns={
        'HDI' : 'Human Development Index',
        # No column is named 'Anual Population', so this entry has no effect.
        'Anual Population' : 'Annual Population'
    },
    index={
        # Each pair is {old label: new label}. The index already holds 'UK' and
        # 'US' rather than the long names, so neither entry matches anything.
        # NOTE(review): if the long names were the intended result, these pairs
        # look reversed — confirm.
        'United Kingdom' : 'UK',
        'United States' : 'US'
    }
)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,2837032,0.913,America,English
France,63.951,2833687,23487325,0.888,Europe,English
Germany,80.94,3874437,3651438,0.916,Europe,English
Italy,60.665,2167744,2514490,0.873,Europe,English
Japan,127.061,4602367,1972476,0.891,Asia,English
UK,64.511,2950039,2429036,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


In [35]:
df.rename(index=str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,2837032,0.913,America,English
FRANCE,63.951,2833687,23487325,0.888,Europe,English
GERMANY,80.94,3874437,3651438,0.916,Europe,English
ITALY,60.665,2167744,2514490,0.873,Europe,English
JAPAN,127.061,4602367,1972476,0.891,Asia,English
UK,64.511,2950039,2429036,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


In [36]:
# Any callable works as a mapper: it is applied to every index label.
df.rename(index=lambda label: label.lower())

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,2837032,0.913,America,English
france,63.951,2833687,23487325,0.888,Europe,English
germany,80.94,3874437,3651438,0.916,Europe,English
italy,60.665,2167744,2514490,0.873,Europe,English
japan,127.061,4602367,1972476,0.891,Asia,English
uk,64.511,2950039,2429036,0.907,Europe,English
us,318.523,17348075,9525067,0.915,America,English


## Dropping Columns

In [37]:
# Remove the 'Language' column by rebinding df to the returned frame instead of
# using inplace=True. errors='ignore' keeps the cell safe to re-run: a second
# execution (column already gone) is a no-op rather than a KeyError.
df = df.drop(columns='Language', errors='ignore')

In [38]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,2837032,0.913,America
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America


## Adding Values

In [39]:
# Adding a row: df.append(pd.Series({...}, name='China')) was the older way to
# do this, but DataFrame.append was removed in pandas 2.0 and no longer works.
# Assigning through .loc with a new label adds the row to df in place.
# The values follow the column order:
# Population, GDP, Surface Area, HDI, Continent.
df.loc['China'] = [135.0, 1234567, 12345678, 0.9, 'Asia']

In [40]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,2837032,0.913,America
France,63.951,2833687,23487325,0.888,Europe
Germany,80.94,3874437,3651438,0.916,Europe
Italy,60.665,2167744,2514490,0.873,Europe
Japan,127.061,4602367,1972476,0.891,Asia
UK,64.511,2950039,2429036,0.907,Europe
US,318.523,17348075,9525067,0.915,America
China,135.0,1234567,12345678,0.9,Asia


### More Radical index changes

In [41]:
df.reset_index()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387,2837032,0.913,America
1,France,63.951,2833687,23487325,0.888,Europe
2,Germany,80.94,3874437,3651438,0.916,Europe
3,Italy,60.665,2167744,2514490,0.873,Europe
4,Japan,127.061,4602367,1972476,0.891,Asia
5,UK,64.511,2950039,2429036,0.907,Europe
6,US,318.523,17348075,9525067,0.915,America
7,China,135.0,1234567,12345678,0.9,Asia


In [42]:
df.set_index('Population')

Unnamed: 0_level_0,GDP,Surface Area,HDI,Continent
Population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35.467,1785387,2837032,0.913,America
63.951,2833687,23487325,0.888,Europe
80.94,3874437,3651438,0.916,Europe
60.665,2167744,2514490,0.873,Europe
127.061,4602367,1972476,0.891,Asia
64.511,2950039,2429036,0.907,Europe
318.523,17348075,9525067,0.915,America
135.0,1234567,12345678,0.9,Asia


## Creating columns from other columns

In [43]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
UK,64.511,2950039
US,318.523,17348075
China,135.0,1234567


In [44]:
# Element-wise division of two columns yields a new Series aligned on the index.
# This is Population / GDP, the reciprocal of the GDP / Population ratio that
# is stored as 'GDP per Capita' in the next cell.
df['Population'] / df['GDP']

Canada     0.000020
France     0.000023
Germany    0.000021
Italy      0.000028
Japan      0.000028
UK         0.000022
US         0.000018
China      0.000109
dtype: float64

In [45]:
df['GDP per Capita'] = df['GDP'] / df['Population']

In [46]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per Capita
Canada,35.467,1785387,2837032,0.913,America,50339.385908
France,63.951,2833687,23487325,0.888,Europe,44310.284437
Germany,80.94,3874437,3651438,0.916,Europe,47868.013343
Italy,60.665,2167744,2514490,0.873,Europe,35733.025633
Japan,127.061,4602367,1972476,0.891,Asia,36221.712406
UK,64.511,2950039,2429036,0.907,Europe,45729.239975
US,318.523,17348075,9525067,0.915,America,54464.12033
China,135.0,1234567,12345678,0.9,Asia,9144.940741


## Statistical Information

In [47]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP per Capita
Canada,35.467,1785387,2837032,0.913,America,50339.385908
France,63.951,2833687,23487325,0.888,Europe,44310.284437
Germany,80.94,3874437,3651438,0.916,Europe,47868.013343
Italy,60.665,2167744,2514490,0.873,Europe,35733.025633
Japan,127.061,4602367,1972476,0.891,Asia,36221.712406


In [48]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP per Capita
count,8.0,8.0,8.0,8.0,8.0
mean,110.76475,4599538.0,7345318.0,0.900375,40476.340346
std,90.56694,5265062.0,7565550.0,0.015362,14203.239383
min,35.467,1234567.0,1972476.0,0.873,9144.940741
25%,63.1295,2072155.0,2493126.0,0.89025,36099.540713
50%,72.7255,2891863.0,3244235.0,0.9035,45019.762206
75%,129.04575,4056420.0,10230220.0,0.9135,48485.856484
max,318.523,17348080.0,23487320.0,0.916,54464.12033


In [49]:
population = df['Population']

In [50]:
population.min() , population.max()

(np.float64(35.467), np.float64(318.523))

In [51]:
population.sum()

np.float64(886.118)

In [52]:
# Mean computed by hand; the result equals population.mean().
population.sum() / len(population)

np.float64(110.76475)

In [53]:
population.mean()

np.float64(110.76475)

In [54]:
population.std()

np.float64(90.56693968354489)

In [55]:
population.median()

np.float64(72.7255)

In [56]:
population.describe()

count      8.00000
mean     110.76475
std       90.56694
min       35.46700
25%       63.12950
50%       72.72550
75%      129.04575
max      318.52300
Name: Population, dtype: float64

In [57]:
population.quantile(0.25)

np.float64(63.1295)

In [58]:
population.quantile(0.2)

np.float64(61.9794)