# Pandas
Pandas is built on top of NumPy, and the two are normally used together. Its data structures extend the NumPy ndarray object with labelled indices.
Its two key objects are the Series and the DataFrame.

## Pandas Series object
A 1-D array of indexed data

In [1]:
# import the packages
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain Python list (an array works the same way)
readings = [0.1, 6.35, 7.0, 13.7, 8.0]
data = pd.Series(readings)

# Whole Series: index labels on the left, values on the right
print("data:\n", data)

# Only the underlying values
print("values: ", data.values)

# Only the index (a RangeIndex when none is supplied)
print("indices: ", data.index)

# Look up a single element by its index label
lookup_label = 3
print("data at index 3: ", data[lookup_label])

data:
 0     0.10
1     6.35
2     7.00
3    13.70
4     8.00
dtype: float64
values:  [ 0.1   6.35  7.   13.7   8.  ]
indices:  RangeIndex(start=0, stop=5, step=1)
data at index 3:  13.7


In [3]:
# A Series whose index is a list of string labels instead of 0..n-1
labels = ['a', 'b', 'c', 'd', 'e']
values = [1, 2, 3, 4, 5]
custom_data = pd.Series(values, index=labels)
custom_data

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# Passing a dict to pd.Series uses the dict keys as the index
# and the dict values as the data.
ages = dict(kasee=22, Ken=27, Kashee=19, Miro=21, Phoebe=20)

age_data = pd.Series(ages)
age_data

kasee     22
Ken       27
Kashee    19
Miro      21
Phoebe    20
dtype: int64

## The DataFrame Object
A DataFrame can be created from an array or a dict.
Format: `pd.DataFrame(data, columns=..., index=...)`

In [5]:
# Row labels shared by both lookup tables below
state_names = ['Seattle', 'Atlanta', 'Kansas', 'Connecticut', 'Viginia']

# Population figure for each label
population = dict(zip(state_names, [124765, 7459843, 124765, 124765, 124765]))

# Area figure for each label
area = dict(zip(state_names, [475398475, 7589347, 985904384, 759874, 509438]))

# Each outer key becomes a column; the inner dict keys become the row index
states = pd.DataFrame({'population': population, 'area': area})

states

Unnamed: 0,population,area
Atlanta,7459843,7589347
Connecticut,124765,759874
Kansas,124765,985904384
Seattle,124765,475398475
Viginia,124765,509438


In [6]:
# Pull a single column out of the frame as a Series.
# Attribute access is used here; states['population'] is the equivalent bracket form.
popn = states.population
popn

Atlanta        7459843
Connecticut     124765
Kansas          124765
Seattle         124765
Viginia         124765
Name: population, dtype: int64

## Data Indexing and Selection
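Plain indexing and slicing can be confusing when the index is made of integers: `data[1]` looks up a label, while `data[1:3]` slices by position. The `iloc` and `loc` attributes remove the ambiguity: `iloc` always uses implicit (normal Python list-style, positional) indexing and `loc` always uses explicit (label-based) indexing.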

In [7]:
# A Series whose integer labels start at 1, so labels and positions differ
letters = ['a', 'b', 'c']
data = pd.Series(letters, index=[1, 2, 3])
data

1    a
2    b
3    c
dtype: object

In [8]:
# Accessing the data directly with [] is ambiguous on an integer index
by_label = data[1]        # a single integer is looked up as an explicit index label
by_position = data[1:3]   # a slice is applied to the implicit positions

print(by_label)
print(by_position)

a
2    b
3    c
dtype: object


In [9]:
# loc and iloc remove the ambiguity of plain [] access

# loc -> explicit (label-based) indexing; the end label of a slice is included
label_scalar = data.loc[1]
label_slice = data.loc[1:3]
print(label_scalar)
print(label_slice)

# iloc -> implicit (position-based) indexing; the end position of a slice is excluded
position_scalar = data.iloc[1]
position_slice = data.iloc[1:3]
print(position_scalar)
print(position_slice)

a
1    a
2    b
3    c
dtype: object
b
2    b
3    c
dtype: object


# An example with a CSV file

In [10]:
# Load the births dataset from CSV and preview the first five rows
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere
births_csv = 'C:/Users/user/PycharmProjects/Python4ds_cohort-1/data/births.csv'
births = pd.read_csv(births_csv)
births.head()

Unnamed: 0,year,month,day,gender,births
0,1969,1,1.0,F,4046
1,1969,1,1.0,M,4440
2,1969,1,2.0,F,4454
3,1969,1,2.0,M,4548
4,1969,1,3.0,F,4548


## Pivot table
We create a pivot table that shows male and female births by decade.

In [11]:
# Truncate each year to the first year of its decade (e.g. 1969 -> 1960)
decade_start = (births['year'] // 10) * 10
births['decade'] = decade_start
births.head(10)

Unnamed: 0,year,month,day,gender,births,decade
0,1969,1,1.0,F,4046,1960
1,1969,1,1.0,M,4440,1960
2,1969,1,2.0,F,4454,1960
3,1969,1,2.0,M,4548,1960
4,1969,1,3.0,F,4548,1960
5,1969,1,3.0,M,4994,1960
6,1969,1,4.0,F,4440,1960
7,1969,1,4.0,M,4520,1960
8,1969,1,5.0,F,4192,1960
9,1969,1,5.0,M,4198,1960


In [12]:
# Total births per decade (rows), split by gender (columns)
pd.pivot_table(
    births,
    values='births',
    index='decade',
    columns='gender',
    aggfunc='sum',
)

gender,F,M
decade,Unnamed: 1_level_1,Unnamed: 2_level_1
1960,1753634,1846572
1970,16263075,17121550
1980,18310351,19243452
1990,19479454,20420553
2000,18229309,19106428


# PANDAS DEEP EXAMPLE
## Using the births data

In [13]:
# Load the births CSV into its own DataFrame for this section and preview ten rows
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere
births_path = 'C:/Users/user/PycharmProjects/Python4ds_cohort-1/data/births.csv'
births_df = pd.read_csv(births_path)
births_df.head(10)

Unnamed: 0,year,month,day,gender,births
0,1969,1,1.0,F,4046
1,1969,1,1.0,M,4440
2,1969,1,2.0,F,4454
3,1969,1,2.0,M,4548
4,1969,1,3.0,F,4548
5,1969,1,3.0,M,4994
6,1969,1,4.0,F,4440
7,1969,1,4.0,M,4520
8,1969,1,5.0,F,4192
9,1969,1,5.0,M,4198


In [14]:
# Keep only the rows for the first 10 days of January 1980.
# Each condition is built as its own boolean mask, then the masks are AND-ed together.
in_1980 = births_df['year'].eq(1980)
in_january = births_df['month'].eq(1)
in_first_ten_days = births_df['day'].le(10)

births_1980 = births_df[in_1980 & in_january & in_first_ten_days]
births_1980

Unnamed: 0,year,month,day,gender,births
8386,1980,1,1.0,F,4005
8387,1980,1,1.0,M,4227
8388,1980,1,2.0,F,4371
8389,1980,1,2.0,M,4640
8390,1980,1,3.0,F,4815
8391,1980,1,3.0,M,5087
8392,1980,1,4.0,F,4758
8393,1980,1,4.0,M,5181
8394,1980,1,5.0,F,4265
8395,1980,1,5.0,M,4426


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = births_1980.describe()
summary_stats

Unnamed: 0,year,month,day,births
count,20.0,20.0,20.0,20.0
mean,1980.0,1.0,5.5,4648.3
std,0.0,0.0,2.946898,362.23213
min,1980.0,1.0,1.0,4005.0
25%,1980.0,1.0,3.0,4344.5
50%,1980.0,1.0,5.5,4760.5
75%,1980.0,1.0,8.0,4848.5
max,1980.0,1.0,10.0,5181.0


In [16]:
# Pivot table with the total and the mean number of births for each gender.
# It reads births_df (the frame loaded in this section) rather than `births`,
# which belongs to the previous section and carries the extra 'decade' column.
# The aggregations are given by name: passing the np.sum / np.mean callables
# raises a FutureWarning in recent pandas, while the strings give the same
# 'sum' / 'mean' column labels.
gender_table = pd.pivot_table(
    births_df,
    index=['gender'],
    values='births',
    aggfunc=['sum', 'mean'],
)
gender_table

Unnamed: 0_level_0,sum,mean
Unnamed: 0_level_1,births,births
gender,Unnamed: 1_level_2,Unnamed: 2_level_2
F,74035823,9521.067773
M,77738555,10003.674559
