In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [3]:
drinks.index

RangeIndex(start=0, stop=193, step=1)

In [4]:
pd.read_table('http://bit.ly/movieusers', header=None, sep='|').head()

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Why Index?

- Identification
- Selection
- Alignment

In [5]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [6]:
drinks[drinks.continent == 'South America']

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,193,25,221,8.3,South America
20,Bolivia,167,41,8,3.8,South America
23,Brazil,245,145,16,7.2,South America
35,Chile,130,124,172,7.6,South America
37,Colombia,159,76,3,4.2,South America
52,Ecuador,162,74,3,4.2,South America
72,Guyana,93,302,1,7.1,South America
132,Paraguay,213,117,74,7.3,South America
133,Peru,163,160,21,6.1,South America
163,Suriname,128,178,7,5.6,South America


Here we find that indexes are retained. They are not re-numbered

So index is for ** Identification **

In [None]:
# drinks.loc[23]['beer_servings']

This gives beer_servings for Brazil. But we do not know that Brazil is row no 23

So we need an index on Brazil to look it up better

In [15]:
# drinks.loc[23]['beer_servings']

# drinks.set_index('country', inplace=True)

drinks.head()

print(drinks.loc['Brazil', 'beer_servings'])

245


#### Changing the index

#### Ensure ur index has a name before resetting

In [16]:
drinks.reset_index(inplace=True)

In [17]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


#### Indexes of Series come from DF

In [19]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [20]:
drinks.set_index('country', inplace=True)

In [21]:
drinks.continent.head()

country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: object

#### Now the index for this Series is the country

In [27]:
print(drinks.continent.value_counts())

print(drinks.continent.value_counts().index)

print(drinks.continent.value_counts().values)

# Similar working as .loc()

print(drinks.continent.value_counts()['Africa'])

Africa           53
Europe           45
Asia             44
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64
Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',
       'South America'],
      dtype='object')
[53 45 44 23 16 12]
53


#### Sorting

In [29]:
print(drinks.continent.value_counts().sort_values())

print(drinks.continent.value_counts().sort_index())

South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
Name: continent, dtype: int64
Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64


### Alignment

Creating a dataset

In [31]:
people = pd.Series([3000000, 85000], index=['Albamia', 'Andorra'], name='population')

people

Albamia    3000000
Andorra      85000
Name: population, dtype: int64

#### Get the total beer servings for each country

In [32]:
drinks.beer_servings * people

Afghanistan                    NaN
Albamia                        NaN
Albania                        NaN
Algeria                        NaN
Andorra                 20825000.0
Angola                         NaN
Antigua & Barbuda              NaN
Argentina                      NaN
Armenia                        NaN
Australia                      NaN
Austria                        NaN
Azerbaijan                     NaN
Bahamas                        NaN
Bahrain                        NaN
Bangladesh                     NaN
Barbados                       NaN
Belarus                        NaN
Belgium                        NaN
Belize                         NaN
Benin                          NaN
Bhutan                         NaN
Bolivia                        NaN
Bosnia-Herzegovina             NaN
Botswana                       NaN
Brazil                         NaN
Brunei                         NaN
Bulgaria                       NaN
Burkina Faso                   NaN
Burundi             

#### These 2 datas had a shared index.So they could automatically align the data and give the correct output

#### We can also concatenate these data

In [None]:
pd.concat([drinks, people])