In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build the example frame: one row per country, one column per indicator.
# No index is given, so the rows get the default integer index.
# `columns=` pins the column order explicitly.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [3]:
# Display the frame; at this point the rows carry the default integer index 0-6.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [4]:
# Replace the default integer index with country names, given in row order,
# then display the relabeled frame.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan',
            'United Kingdom', 'United States']
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# Row labels of the frame: the country names assigned to df.index.
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [None]:
# (number of rows, number of columns)
df.shape

(7, 5)

In [None]:
# Prints a summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 656.0+ bytes


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text column 'Continent' is left out of the result.
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [None]:
# dtype of each column, returned as a Series indexed by column name.
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

int64      2
float64    2
object     1
dtype: int64

# Indexing

In [None]:
# Show the labeled frame again as a reference for the indexing examples that follow.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
df.loc['Italy']  # label-based lookup: selects one row ("horizontally") and returns it as a Series

Population       60.665
GDP             2167744
Surface Area     301336
HDI               0.873
Continent        Europe
Name: Italy, dtype: object

In [None]:
df.iloc[3]  # position-based lookup: position 3 is the same 'Italy' row that df.loc['Italy'] selects by label

Population       60.665
GDP             2167744
Surface Area     301336
HDI               0.873
Continent        Europe
Name: Italy, dtype: object

In [None]:
df['Population']  # column lookup: selects one column ("vertically") and returns it as a Series

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [None]:
# Label-based row slice; the end label 'Italy' is included in the result.
df.loc['France':'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [None]:
# Label row slice (end label included) restricted to a single column -> Series.
df.loc['France':'Japan','Population']

France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
Name: Population, dtype: float64

In [None]:
# Same row slice with a list of column labels -> DataFrame with just those columns.
df.loc['France':'Japan',['Population','HDI']]

Unnamed: 0,Population,HDI
France,63.951,0.888
Germany,80.94,0.916
Italy,60.665,0.873
Japan,127.061,0.891


In [None]:
# Positional row slice; the end position 3 is excluded, so only rows 1 and 2 are returned.
df.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [None]:
# Rows at positions 1-2, column at position 2 ('Surface Area') -> Series.
df.iloc[1:3,2]

France     640679
Germany    357114
Name: Surface Area, dtype: int64

In [None]:
# Rows at positions 0-1, columns at positions 2 and 4 ('Surface Area', 'Continent').
df.iloc[0:2,[2,4]]

Unnamed: 0,Surface Area,Continent
Canada,9984670,America
France,640679,Europe


In [None]:
# Rows at positions 0-1, columns at positions 2-3 ('Surface Area', 'HDI'); position 4 is excluded.
df.iloc[0:2,2:4]

Unnamed: 0,Surface Area,HDI
Canada,9984670,0.913
France,640679,0.888


# Conditional Selection (Boolean Arrays for DataFrames)

In [None]:
# Element-wise comparison -> boolean Series with one True/False per country.
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [None]:
# Passing the boolean Series to .loc keeps only the rows where it is True.
df.loc[df['Population']>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [None]:
df.loc[df['Population']>70 , 'Population'] # after the comma: return only the 'Population' column of the matching rows

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

In [None]:
# Boolean row mask plus a list of column labels -> DataFrame with just those columns.
df.loc[df['Population']>70, ['Population','Surface Area']]

Unnamed: 0,Population,Surface Area
Germany,80.94,357114
Japan,127.061,377930
United States,318.523,9525067


# Dropping Stuff

In [None]:
# Returns a new frame without the 'Italy' row; df itself is not modified.
df.drop('Italy')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# A list of labels drops several rows at once; again the result is a new frame.
df.drop(['Canada','France'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# columns= drops a column instead of a row; the result is a new frame and df keeps the column.
df.drop(columns='Continent')

Unnamed: 0,Population,GDP,Surface Area,HDI
Canada,35.467,1785387,9984670,0.913
France,63.951,2833687,640679,0.888
Germany,80.94,3874437,357114,0.916
Italy,60.665,2167744,301336,0.873
Japan,127.061,4602367,377930,0.891
United Kingdom,64.511,2950039,242495,0.907
United States,318.523,17348075,9525067,0.915


In [None]:
df = df.drop(columns=['Continent','HDI',])

#Operations

In [None]:
df

Unnamed: 0,Population,GDP,Surface Area
Canada,35.467,1785387,9984670
France,63.951,2833687,640679
Germany,80.94,3874437,357114
Italy,60.665,2167744,301336
Japan,127.061,4602367,377930
United Kingdom,64.511,2950039,242495
United States,318.523,17348075,9525067


In [None]:
crisis = pd.Series([-50,-1000000,-100000], index=['Population','GDP','Surface Area'])

In [None]:
df = df + crisis

In [None]:
df

Unnamed: 0,Population,GDP,Surface Area
Canada,-14.533,785387,9884670
France,13.951,1833687,540679
Germany,30.94,2874437,257114
Italy,10.665,1167744,201336
Japan,77.061,3602367,277930
United Kingdom,14.511,1950039,142495
United States,268.523,16348075,9425067


# Modifying DataFrames

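The `drop` calls shown earlier did not change `df`: each one returns a new DataFrame and leaves the original untouched. Unless the result is assigned back to a name, evaluating `df` in a new cell still shows the original frame. The operations in this section, by contrast, modify or reassign `df` itself.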

In [5]:
# Adding a new column: start from a Series keyed by country name.
# Only three of the frame's countries are present here.
lang = pd.Series(
    {'France': 'French', 'United States': 'English', 'Japan': 'Japanese'},
    name='Language',
)

In [6]:
# Adds the Series as a new column, matched to rows by index label; countries
# missing from `lang` end up with no value. The column label is the key used
# here ('Languages'), not the Series name ('Language').
df['Languages'] = lang

In [7]:
# Display the frame with the new 'Languages' column.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,
Italy,60.665,2167744,301336,0.873,Europe,
Japan,127.061,4602367,377930,0.891,Asia,Japanese
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,English


In [9]:
# Changing the values: assigning a scalar sets every row of the column to that
# value, overwriting the per-country languages assigned earlier.
df['Languages'] = 'English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


# Renaming Columns and Index Labels

In [12]:
# Rename one column and shorten two row labels. rename() returns a new frame,
# so the result is assigned back to df to keep the change.
df = (
    df
    .rename(columns={'HDI': 'Human Development Index'})
    .rename(index={'United Kingdom': 'UK', 'United States': 'US'})
)
df

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Languages
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


In [13]:
# A function can be passed instead of a mapping; it is applied to every row label.
# The result is not assigned, so df keeps its original labels.
df.rename(index= str.upper)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Languages
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


# Creating Columns from Other Columns

In [15]:
# A list of column labels selects several columns at once -> DataFrame.
df[['Population','Continent']]

Unnamed: 0,Population,Continent
Canada,35.467,America
France,63.951,Europe
Germany,80.94,Europe
Italy,60.665,Europe
Japan,127.061,Asia
UK,64.511,Europe
US,318.523,America


In [16]:
# Element-wise division of two columns -> Series with one value per country.
df['GDP']/df['Population']

Canada     50339.385908
France     44310.284437
Germany    47868.013343
Italy      35733.025633
Japan      36221.712406
UK         45729.239975
US         54464.120330
dtype: float64

In [18]:
# Store the ratio as a new column.
# NOTE(review): the label 'GDP per capital' is probably meant to be
# 'GDP per capita'; left unchanged because later cells and outputs use it.
df['GDP per capital'] = df['GDP']/df['Population']

In [19]:
# Display the frame with the derived 'GDP per capital' column.
df

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Languages,GDP per capital
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406
UK,64.511,2950039,242495,0.907,Europe,English,45729.239975
US,318.523,17348075,9525067,0.915,America,English,54464.12033


# Statistical Information

In [20]:
# First rows of the frame (five are returned here).
df.head()

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Languages,GDP per capital
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406


In [21]:
# Summary statistics again; the derived 'GDP per capital' column is now included,
# while the text columns ('Continent', 'Languages') are still left out.
df.describe()

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,GDP per capital
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [22]:
# Column-wise maximum. For the text columns this is the largest string in sort
# order (e.g. 'Europe'), and the mixed column types make the result dtype object.
df.max()

Population                  318.523
GDP                        17348075
Surface Area                9984670
Human Development Index       0.916
Continent                    Europe
Languages                   English
GDP per capital             54464.1
dtype: object

In [23]:
# Keep the 'Population' column as a Series for the single-column statistics that follow.
pop = df['Population']
pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
US         318.523
Name: Population, dtype: float64

In [24]:
# Arithmetic mean of the column.
pop.mean()

107.30257142857144

In [25]:
# Standard deviation (pandas uses the sample form, ddof=1, by default).
pop.std()

97.24996987121581

In [26]:
# Variance; this is the square of pop.std().
pop.var()

9457.556639952383

In [27]:
# Mean computed by hand (sum divided by count); gives the same value as pop.mean().
np.sum(pop)/len(pop)

107.30257142857144

In [28]:
# First quartile (25th percentile); the same value as the 25% row of df.describe().
pop.quantile(.25)

62.308