# Pandas Fundamentals

## Creating Pandas Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# the information (data) array: one value per entry
effective = [True, True, False, False]
# the index array: one label per value, here plain integers
patients = [0, 1, 2, 3]

# a Series pairs every value with the label at the same position
effective_series = pd.Series(data=effective, index=patients)

In [3]:
# head() shows the first rows (5 by default); this Series has only 4, so all are shown
effective_series.head()

0     True
1     True
2    False
3    False
dtype: bool

In [4]:
# the information (data) array is unchanged
effective = [True, True, False, False]
# index labels are not limited to integers: strings, floats and, in general,
# any hashable object can be used as a label
patients = ['a', 'b', 'c', 'd']

effective_series = pd.Series(data=effective, index=patients)

In [5]:
# Same values as before, now labelled with the string index
effective_series.head()

a     True
b     True
c    False
d    False
dtype: bool

## Creating Pandas DataFrame

In [6]:
# row labels (the index)
patients = ["a", "b", "c", "d"]
# column name -> list of values, one value per row label
columns = dict(
    sys_initial=[120, 126, 130, 115],
    dia_initial=[75, 85, 90, 87],
    sys_final=[115, 123, 130, 118],
    dia_final=[70, 82, 92, 87],
)
# assemble the table: every dict key becomes a column, `patients` labels the rows
df = pd.DataFrame(data=columns, index=patients)

In [7]:
# head() shows the first rows (5 by default); the frame has 4 rows, so all are shown
df.head()

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final
a,120,75,115,70
b,126,85,123,82
c,130,90,130,92
d,115,87,118,87


In [8]:
# building the same dataframe from pd.Series objects
patients = ["a", "b", "c", "d"]
# wrap every list of values in a Series that carries the patient labels
columns = {
    name: pd.Series(values, index=patients)
    for name, values in {
        "sys_initial": [120, 126, 130, 115],
        "dia_initial": [75, 85, 90, 87],
        "sys_final": [115, 123, 130, 118],
        "dia_final": [70, 82, 92, 87],
    }.items()
}
# no index argument is passed: the row labels come from the Series themselves
df = pd.DataFrame(data=columns)

In [9]:
# The result matches the frame built from plain lists above
df.head()

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final
a,120,75,115,70
b,126,85,123,82
c,130,90,130,92
d,115,87,118,87


## Indexing Series and DataFrame objects

In [10]:
# Retrieving data from a pd.Series by key: .loc looks the value up by its index label
effective_series.loc["a"]

True

In [11]:
# Retrieving data from a pd.Series by position: .iloc takes a 0-based integer
# position, regardless of what the index labels are
effective_series.iloc[0]

True

In [12]:
# Retrieving a row from a pd.DataFrame by key (its index label)
df.loc["a"]
# The row comes back as a pd.Series whose index is the column names

sys_initial    120
dia_initial     75
sys_final      115
dia_final       70
Name: a, dtype: int64

In [13]:
# Retrieving a row from a pd.DataFrame by its 0-based integer position
df.iloc[0]

sys_initial    120
dia_initial     75
sys_final      115
dia_final       70
Name: a, dtype: int64

In [14]:
# Retrieving a specific cell by keys: .loc[row_label, column_label]
df.loc["a", "sys_initial"]

120

In [15]:
# Retrieving a specific cell by keys, second method: select the row first
# (an intermediate Series), then select the column label from that Series
df.loc["a"].loc["sys_initial"]

120

In [16]:
# Retrieving a specific cell by position: .iloc[row_position, column_position]
# (row 0, column 1 is the "dia_initial" value of patient "a")
df.iloc[0, 1]

75

In [17]:
# Retrieving a specific cell by position, second method: take row 0 as a Series,
# then take the element at position 1 of that Series
df.iloc[0].iloc[1]

75

In [18]:
# Retrieving a column through attribute access
# (only possible when the column name is a valid Python identifier that does
# not clash with an existing DataFrame attribute or method)
df.sys_initial

a    120
b    126
c    130
d    115
Name: sys_initial, dtype: int64

In [19]:
# or with square brackets and the column name, which works for any column name
df["sys_initial"]

a    120
b    126
c    130
d    115
Name: sys_initial, dtype: int64

In [20]:
# or by looking the name up in df.columns first
# (position 2 is "sys_final", so this selects a different column than above)
df[df.columns[2]]

a    115
b    123
c    130
d    118
Name: sys_final, dtype: int64

In [21]:
# or purely by position: all rows (:) of the column at position 2 ("sys_final")
df.iloc[:, 2]

a    115
b    123
c    130
d    118
Name: sys_final, dtype: int64

In [22]:
# Increasing look-up performance by sorting the index

# Build an index in which every label 0..999 appears twice
index = list(range(1000)) * 2

# With a non-unique, unsorted index a label look-up has to scan the index,
# which is an O(N) operation
series = pd.Series(data=range(2000), index=index)

In [23]:
# Sorting the index lets pandas find (duplicate) labels by binary search,
# improving the look-up scaling from O(N) to O(log(N)).
# Rebind the name to the sorted copy instead of using inplace=True: the cell
# no longer mutates an object created by an earlier cell, and the result can
# be chained.
series = series.sort_index()

## Database-style operations with pandas

### Mapping

In [24]:
# Natural logarithm of a series: the NumPy function is applied element-wise
# and the result is a float Series that keeps the original index
np.log(df.sys_initial)

a    4.787492
b    4.836282
c    4.867534
d    4.744932
Name: sys_initial, dtype: float64

In [25]:
# Square of a series: arithmetic operators also act element-wise
df.sys_initial ** 2

a    14400
b    15876
c    16900
d    13225
Name: sys_initial, dtype: int64

In [26]:
# Natural logarithm of a dataframe: applied to every cell, keeping index and columns
np.log(df)

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final
a,4.787492,4.317488,4.744932,4.248495
b,4.836282,4.442651,4.812184,4.406719
c,4.867534,4.49981,4.867534,4.521789
d,4.744932,4.465908,4.770685,4.465908


In [27]:
# Square of a dataframe: every cell is squared, keeping index and columns
df ** 2

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final
a,14400,5625,13225,4900
b,15876,7225,15129,6724
c,16900,8100,16900,8464
d,13225,7569,13924,7569


In [28]:
# Element-wise operation on two pd.Series: values are paired by index label
a = pd.Series({"a": 1, "b": 2, "c": 3})
b = pd.Series({"a": 4, "b": 5, "c": 6})

a + b

a    5
b    7
c    9
dtype: int64

In [29]:
# When the two indexes do not match, the result covers the union of the labels;
# a label present in only one operand ("c" and "d" here) yields NaN, and the
# result becomes float because NaN is a floating-point value
c = pd.Series({"a": 4, "b": 5, "d": 6})

a + c

a    5.0
b    7.0
c    NaN
d    NaN
dtype: float64

In [30]:
# Applying mapping methods on series
def superstar(x):
    """Return the string form of ``x`` with a ``*`` on each side."""
    return f"*{x!s}*"

# Series.map calls the function once per element and returns a new Series
a.map(superstar)

a    *1*
b    *2*
c    *3*
dtype: object

In [31]:
# Applying a function to every element of a dataframe.
# Series.map works element-wise on a Series; its DataFrame counterpart is
# DataFrame.map (pandas >= 2.1), which replaces the deprecated
# DataFrame.applymap. Checking the class (not the instance) avoids being
# fooled by a column that happens to be named "map"; older pandas versions
# fall back to applymap.
(df.map if hasattr(pd.DataFrame, "map") else df.applymap)(superstar)

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final
a,*120*,*75*,*115*,*70*
b,*126*,*85*,*123*,*82*
c,*130*,*90*,*130*,*92*
d,*115*,*87*,*118*,*87*


In [32]:
# Applying a function to whole columns or whole rows of a dataframe
# axis=0: the function is called once per column (it receives each column as a Series)
# axis=1: the function is called once per row (it receives each row as a Series)
# The result here is a Series with one entry per column
df.apply(superstar, axis=0)

sys_initial    *a    120\nb    126\nc    130\nd    115\nName:...
dia_initial    *a    75\nb    85\nc    90\nd    87\nName: dia...
sys_final      *a    115\nb    123\nc    130\nd    118\nName:...
dia_final      *a    70\nb    82\nc    92\nd    87\nName: dia...
dtype: object

In [33]:
# axis=1: superstar receives each row as a Series, so the result has one entry per row label
df.apply(superstar, axis=1)

a    *sys_initial    120\ndia_initial     75\nsys_f...
b    *sys_initial    126\ndia_initial     85\nsys_f...
c    *sys_initial    130\ndia_initial     90\nsys_f...
d    *sys_initial    115\ndia_initial     87\nsys_f...
dtype: object

In [34]:
# Using the eval method to apply numexpr-style operations:
# column names can be used as variables inside the expression string,
# and the result of the expression is returned as a Series
df.eval("sys_final - sys_initial")

a   -5
b   -3
c    0
d    3
dtype: int64

In [35]:
# Creating a new column with the eval method: "name = expression" adds column `name`
# With inplace=False a new dataframe containing the extra column is returned and
# df itself is left unchanged; with inplace=True the column is added to df directly
df.eval("sys_delta = sys_final - sys_initial", inplace=False)

Unnamed: 0,sys_initial,dia_initial,sys_final,dia_final,sys_delta
a,120,75,115,70,-5
b,126,85,123,82,-3
c,130,90,130,92,0
d,115,87,118,87,3
