In [1]:
#Pandas Begins
import numpy as np
import pandas as pd

In [None]:
# Populations of the G7 countries, expressed in millions of people.
g7_pop = pd.Series(
    data=[
        35.467,
        63.951,
        80.940,
        60.665,
        127.061,
        64.511,
        318.523,
    ],
)
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

In [None]:
# A Series can carry a descriptive name, which is shown in its repr.
g7_pop.name = "Population of G7 Nations"

In [None]:
# Display the Series again: the repr now includes its name.
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
# Element type of the data held by the Series.
g7_pop.dtype

dtype('float64')

In [None]:
# The raw values, without index or name.
g7_pop.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

In [None]:
# The values are stored as a NumPy array.
type(g7_pop.values)

numpy.ndarray

In [None]:
# With the default integer index, [1] selects the element labelled 1.
g7_pop[1]

63.951

In [None]:
# No index was supplied at creation, so the Series has a default RangeIndex.
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

In [None]:
# Unlike a plain list, a Series lets us choose the index labels explicitly:
# here each population is labelled with its country.
g7_pop.index = [
    'Canada', 'France', 'Germany', 'Italy',
    'Japan', 'United Kingdom', 'United States',
]

In [None]:
# Elements can now be looked up by their label.
g7_pop["France"]

63.951

In [None]:
g7_pop
#A pandas Series is ordered: its elements keep the order shown

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
# A Series behaves much like an ordered dictionary and can be built directly from one:
# the keys become the index labels and the values become the data.
pd.Series(
    data={
        'Canada': 35.467, 'France': 63.951, 'Germany': 80.94, 'Italy': 60.665,
        'Japan': 127.061, 'United Kingdom': 64.511, 'United States': 318.523,
    },
    name='G7 Population in millions',
)

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [None]:
# The same Series built from a list of values plus an explicit list of index labels.
pd.Series(
    data=[35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
    index=[
        'Canada', 'France', 'Germany', 'Italy',
        'Japan', 'United Kingdom', 'United States',
    ],
    name='G7 Population in millions',
)

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [None]:
# Building a Series from another Series with an explicit index selects by label;
# labels that are missing from the source (here 'Spain') get NaN.
pd.Series(g7_pop, index=['France', 'Germany', 'Italy', 'Spain'])

France     63.951
Germany    80.940
Italy      60.665
Spain         NaN
Name: Population of G7 Nations, dtype: float64

In [None]:
#Indexing by integer position (just like indexing in lists); -1 is the last element
g7_pop.iloc[-1]

318.523

In [None]:
#Multiple indexing: a list of labels returns a Series with just those elements
g7_pop[["France", "Italy"]]

France    63.951
Italy     60.665
Name: Population of G7 Nations, dtype: float64

In [None]:
# Positional counterpart: a list of integer positions passed to .iloc.
g7_pop.iloc[[0,1]]

Canada    35.467
France    63.951
Name: Population of G7 Nations, dtype: float64

In [None]:
#population > 70 mil
# The comparison is applied element-wise and yields a boolean Series.
g7_pop>70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population of G7 Nations, dtype: bool

In [None]:
# Boolean-mask selection: keeps only the elements where the mask is True.
g7_pop[g7_pop>70]

Germany           80.940
Japan            127.061
United States    318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
# Arithmetic mean of all values.
g7_pop.mean()

107.30257142857144

In [None]:
# Keep only the countries whose population is above the mean.
g7_pop[g7_pop>g7_pop.mean()]

Japan            127.061
United States    318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
#IMP - slicing by label in pandas includes the end label as well,
#e.g. here Italy is included, whereas a positional Python slice would exclude its end
g7_pop["France":"Italy"].mean()

68.51866666666666

In [None]:
# Two boolean masks combined element-wise with | (OR); each comparison is parenthesised.
# NOTE(review): any value above mean + std/2 is also above mean - std/2, so the second
# mask adds no rows here -- possibly `<` was intended in the first comparison; confirm.
g7_pop[(g7_pop > g7_pop.mean() - g7_pop.std() / 2) | (g7_pop > g7_pop.mean() + g7_pop.std() / 2)]


France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
# Arithmetic is vectorised: every element is multiplied, index and name are kept.
# 1_000_000 is the same integer literal as 1000000 (the underscores only aid readability),
# so a single expression is enough -- a cell renders only its last expression anyway.
g7_pop * 1_000_000

Canada             35467000.0
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: Population of G7 Nations, dtype: float64

In [None]:
# NumPy universal functions accept a Series directly: the function is applied
# element-wise and the result keeps the index and name.
# Requires `import numpy as np` in the notebook's import cell.
np.log(g7_pop)

Canada            3.568603
France            4.158117
Germany           4.393708
Italy             4.105367
Japan             4.844667
United Kingdom    4.166836
United States     5.763695
Name: Population of G7 Nations, dtype: float64

In [None]:
# Boolean masks also work for assignment: every value below 70 is overwritten.
# NOTE: this mutates g7_pop, so cells run afterwards see the modified values.
g7_pop[g7_pop < 70] = 99.99
g7_pop

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: Population of G7 Nations, dtype: float64

In [None]:
# ---- pandas DataFrames begin here ----
# Each dictionary key becomes a column; `columns=` states the column order explicitly.
df = pd.DataFrame(
    data={
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)


In [None]:
# No index was given, so the rows are labelled 0..6.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [None]:
# Label every row with its country instead of the default integer position.
df.index = [
    'Canada', 'France', 'Germany', 'Italy',
    'Japan', 'United Kingdom', 'United States',
]

In [None]:
# The rows are now labelled by country.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# Column labels, returned as an Index object.
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [None]:
# Structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [None]:
# Total number of cells (rows * columns).
df.size

35

In [None]:
# (number of rows, number of columns).
df.shape

(7, 5)

In [None]:
# Summary statistics; only the numeric columns are included (Continent is left out).
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

float64    2
int64      2
object     1
dtype: int64

In [None]:
#To select a row using the index labels (set by us); the row comes back as a Series
df.loc["Canada"]

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [None]:
#To select a row by integer position, like indexing a Python list
df.iloc[1]

Population       63.951
GDP             2833687
Surface Area     640679
HDI               0.888
Continent        Europe
Name: France, dtype: object

In [None]:
#Just like a 2D matrix: pick a column, then a single cell
print(df["Population"])
# Address the cell with one .loc[row, column] lookup rather than chained indexing
# (df[col][row]): chained indexing performs two separate lookups and is unreliable
# when used for assignment.
df.loc["Canada", "Population"]

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64


35.467

In [None]:
# .loc takes row labels and a column label; the label slice includes its end ('Italy').
df.loc["France":"Italy", "Population"]

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [None]:
#Multiple indexing by position: first, second and last rows
df.iloc[[0,1,-1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# Rows at positions 1 and 2 (a positional slice excludes its end), column at position 3 (HDI).
df.iloc[1:3,3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [None]:
# Boolean mask on the rows: countries whose Population value is above 70.
df.loc[df["Population"]>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [None]:
# Same row filter, additionally selecting only the GDP column.
df.loc[df["Population"]>70, "GDP"]

Germany           3874437
Japan             4602367
United States    17348075
Name: GDP, dtype: int64

In [None]:
# drop returns a new frame without the given row labels; df itself is not modified.
df.drop(["Canada"])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# columns= drops by column label instead of row label.
df.drop(columns=["Population", "HDI"])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [None]:
# axis="rows" states explicitly that the labels refer to rows.
df.drop(["Canada"], axis="rows")

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# axis=0 is the numeric way of saying the labels refer to rows.
df.drop(["Canada"], axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [None]:
# Arithmetic on a subset of columns is applied element-wise.
df[['Population', 'GDP']] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [None]:
# Per-column adjustments, labelled with the names of the columns they apply to.
crisis = pd.Series({'GDP': -1_000_000, 'HDI': -0.3})
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [None]:
# The two columns that `crisis` has values for.
df[['GDP', 'HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [None]:
# Adding a Series to a DataFrame matches the Series index against the column labels,
# so each value is added to every row of the matching column.
df[['GDP', 'HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


In [None]:
# Main language for a subset of the countries, labelled by country so that it can be
# aligned with df's row index when it is added as a column.
lang = pd.Series(["French", "German", "Italian"],
                index=["France","Germany","Italy"],
                name = "L")

In [None]:
# Display the new Series.
lang

In [None]:
# Assigning a Series as a column aligns it on the index labels; countries that are
# absent from `lang` are expected to get NaN.
df["Languages"] = lang

In [None]:
# Show the frame with the new 'Languages' column.
df

In [None]:
# rename returns a relabelled copy; it is not assigned here, so df keeps its labels.
# Keys that match nothing are skipped silently instead of raising an error: neither the
# 'Anual Popcorn Consumption' column nor the 'Argentina' row exists in df.
df.rename(
    index={'United States': 'USA', 'United Kingdom': 'UK', 'Argentina': 'AR'},
    columns={'HDI': 'Human Development Index', 'Anual Popcorn Consumption': 'APC'},
)

In [None]:
# index= also accepts a function, applied to every row label (here: upper-casing).
df.rename(index=str.upper)

In [None]:
# The same with a lambda: lower-case every row label.
df.rename(index=lambda x: x.lower())

In [None]:
# Remove the 'Languages' column; inplace=True modifies df itself instead of returning a copy.
# NOTE(review): not idempotent -- re-running this cell fails once the column is gone.
df.drop(columns='Languages', inplace=True)


In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported way to add a row. The Series is turned into a one-row frame whose row label
# is the Series name ('China'); columns it does not provide are filled with NaN.
# Like append, this returns a new frame and leaves df itself untouched.
pd.concat([
    df,
    pd.Series({'Population': 3, 'GDP': 5}, name='China').to_frame().T,
])

In [None]:
# df is unchanged: the previous result was displayed but never assigned back.
df

In [None]:
# Assigning to a new label with .loc adds a row to df itself; columns missing from the
# Series (GDP, Surface Area, HDI) are left as NaN.
# Population is stored in millions for every other country in this frame, so China's
# roughly 1.4 billion inhabitants are written as 1_400 rather than 1_400_000_000.
df.loc['China'] = pd.Series({'Population': 1_400, 'Continent': 'Asia'})

In [None]:
# Show the frame including the newly added 'China' row.
df

In [None]:
# Remove the 'China' row again, modifying df in place.
df.drop('China', inplace=True)

In [None]:
# Returns a new frame with the row labels moved into a column and a default integer
# index; the result is not assigned, so df keeps its country index.
df.reset_index()


In [None]:
# Returns a new frame that uses the 'Population' column as its index; df is not modified.
df.set_index('Population')

In [None]:
# Derive a new column from two existing ones (element-wise division), then show the frame.
df['GDP Per Capita'] = df['GDP'].div(df['Population'])
df

In [None]:
# Work with the Population column as a Series; show its smallest and largest value as a tuple.
population = df["Population"]
population.min(), population.max()

In [None]:
# Sum of all values.
population.sum()

In [None]:
# Mean computed by hand: total divided by the number of elements.
population.sum() / len(population)

In [None]:
# Built-in mean.
population.mean()

In [None]:
# A cell renders only its last expression, so the standard deviation and the median are
# printed explicitly instead of being computed and silently discarded.
print("std:", population.std())
print("median:", population.median())
# describe() bundles count, mean, std, min, quartiles (50% is the median) and max.
population.describe()

In [None]:
# First quartile (the 0.25 quantile).
population.quantile(.25)

In [None]:
# Several quantiles at once: pass a list of fractions between 0 and 1.
population.quantile([.2, .4, .6, .8, 1])

In [None]:
#Now we will see how to use readymade datasets and matplotlib
import matplotlib.pyplot as plt

In [None]:
# Load the BTC price data from a CSV file.
# NOTE: this rebinds `df`, replacing the G7 frame used in the cells above.
df = pd.read_csv("../input/btc-10-years-price-data-20122021/BTC 2012-2021.csv")
#header= (together with names=) can be used to supply column names when the file has no header row

In [None]:
# Peek at the first few rows instead of rendering the whole frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Peek at the last few rows.
df.tail()

In [None]:
# dtype of every column as read from the CSV.
df.dtypes

In [None]:
# Rename the 'Date' column to 'Timestamp', modifying df in place, then display the frame.
df.rename(columns = {"Date":"Timestamp"}, inplace=True)
df

In [None]:
# Parse the 'Timestamp' column into pandas datetime values.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [None]:
# Show the frame after the datetime conversion.
df

In [None]:
#IMP - read_csv accepts many parameters (e.g. header, names, index_col, parse_dates), so
#all of the processing done above can be achieved in a single call of 5-6 lines in one cell

#isna, notna, fillna, replace, duplicated, dropna, drop_duplicates, str.contains,
#str.split etc. are very useful functions in data cleaning