# Pandas

https://pandas.pydata.org/pandas-docs/stable/index.html

Based on NumPy  
Used for:
- **Cleansing**, **Wrangling** and quick **Analysis** & **Visualization**
- High **performance** and high **data-compatibility**

### Content:  
1. Series & Data-Frames
1. Missing Data
1. Aggregation  
  - group by
  - merging, joining, concatenating
1. Operations  
1. I/O


### Remarks
- Use **inplace=True** to modify a DF in place (otherwise methods such as `drop` return a new DF) !!
- DF logic: combine conditions with **&** or **|** (not `and` / `or`), each condition in parentheses


# Series

### Create pd.Series

In [4]:
import numpy as np
import pandas as pd

from numpy.random import randn

In [12]:
# Shared sample data for the Series examples: labels, plain lists,
# a NumPy array built from liste1, and a label -> value dictionary.
labels1 = list("abc")
liste = [17, 15, 13]
liste1 = [10, 20, 30]
arr = np.asarray(liste1)
dic = dict(zip(labels1, liste1))

In [18]:
# pd from List 

print(pd.Series(liste))  # same as: print(pd.Series(data=liste))

# with labels
print(pd.Series(data=liste1, index=labels1))
# pd.Series(liste1, labels1)   # short


0    17
1    15
2    13
dtype: int64
a    10
b    20
c    30
dtype: int64


In [12]:
# pd from Array
print(pd.Series(arr))

# with labels
print(pd.Series(arr, labels1))

0    10
1    20
2    30
dtype: int32
a    10
b    20
c    30
dtype: int32


In [13]:
# pd from dictionary
print(pd.Series(dic))

a    10
b    20
c    30
dtype: int64


In [14]:
# pd with functions 
pd.Series([sum, print, len])

# Remark: Pandas is flexible!

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

### Retrieve data from pd.Series

In [20]:
# Two Series whose country labels only partially overlap
ser1 = pd.Series(data=[1, 2, 3, 4], index=["USA", "Germany", "Russia", "Japan"])
ser2 = pd.Series(data=[5, 6, 7, 8], index=["USA", "Italy", "Germany", "France"])
print(ser1, ser2, sep="\n")
# Label-based lookup of a single value
ser1["Russia"]

USA        1
Germany    2
Russia     3
Japan      4
dtype: int64
USA        5
Italy      6
Germany    7
France     8
dtype: int64


3

In [19]:
ser1["Russia"]

3

In [21]:
# Adding Series
# ... labels that are not present in both Series yield "NaN"
# ... the result index is sorted lexicographically

ser1 + ser2

France     NaN
Germany    9.0
Italy      NaN
Japan      NaN
Russia     NaN
USA        6.0
dtype: float64

# Data Frames

- Create and Drop
- Indexing  
  - Rows / Columns
  - Subsets
- Conditional Output
- Reset Index
- Multi-Index

In [24]:
np.random.seed(101)

In [57]:
# 5x4 frame of randn samples; rows labelled A..E, columns labelled W..Z
df = pd.DataFrame(data=randn(5, 4), index=list("ABCDE"), columns=list("WXYZ"))
df

Unnamed: 0,W,X,Y,Z
A,1.60678,-1.11571,-1.385379,-1.32966
B,0.04146,-0.411055,-0.771329,0.110477
C,-0.804652,0.253548,0.649148,0.358941
D,-1.080471,0.902398,0.161781,0.833029
E,0.97572,-0.388239,0.783316,-0.708954


In [58]:
df[["W", "Z"]]

Unnamed: 0,W,Z
A,1.60678,-1.32966
B,0.04146,0.110477
C,-0.804652,0.358941
D,-1.080471,0.833029
E,0.97572,-0.708954


In [59]:
df["W"]
print(type(df))
print(type(df["W"]))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


## Create / Drop (columns, rows)

In [60]:
# Create new Series in Data-Frame

df["new"] = df["W"] + df["Z"]

# Drop column (axis=1) / row (axis=0): returns a new DataFrame, df itself is unchanged
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,1.60678,-1.11571,-1.385379,-1.32966
B,0.04146,-0.411055,-0.771329,0.110477
C,-0.804652,0.253548,0.649148,0.358941
D,-1.080471,0.902398,0.161781,0.833029
E,0.97572,-0.388239,0.783316,-0.708954


In [61]:
df

Unnamed: 0,W,X,Y,Z,new
A,1.60678,-1.11571,-1.385379,-1.32966,0.27712
B,0.04146,-0.411055,-0.771329,0.110477,0.151937
C,-0.804652,0.253548,0.649148,0.358941,-0.445711
D,-1.080471,0.902398,0.161781,0.833029,-0.247442
E,0.97572,-0.388239,0.783316,-0.708954,0.266766


In [63]:
# Drop column / row permanently (inplace=True modifies df itself):
df.drop('E', axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z,new
A,1.60678,-1.11571,-1.385379,-1.32966,0.27712
B,0.04146,-0.411055,-0.771329,0.110477,0.151937
C,-0.804652,0.253548,0.649148,0.358941,-0.445711
D,-1.080471,0.902398,0.161781,0.833029,-0.247442


## Indexing Rows

In [71]:
df.loc['A']

W      1.606780
X     -1.115710
Y     -1.385379
Z     -1.329660
new    0.277120
Name: A, dtype: float64

In [72]:
df.iloc[2]

W     -0.804652
X      0.253548
Y      0.649148
Z      0.358941
new   -0.445711
Name: C, dtype: float64

## Indexing Subsets (col + row)

In [79]:
df.loc['A', 'X']

# analogous, with integer positions (row 'A' is position 0, column 'X' is position 1)
# df.iloc[0, 1]

-1.1157099674628352

In [80]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,1.60678,-1.385379
B,0.04146,-0.771329


## Conditional Output

In [86]:
df>0

Unnamed: 0,W,X,Y,Z,new
A,True,False,False,False,True
B,True,False,False,True,True
C,False,True,True,True,False
D,False,True,True,True,False


In [85]:
df[df["W"]>0]

Unnamed: 0,W,X,Y,Z,new
A,1.60678,-1.11571,-1.385379,-1.32966,0.27712
B,0.04146,-0.411055,-0.771329,0.110477,0.151937


In [88]:
# Condition on one column, then select a subset of columns
df[df["W"]>0][['Y', 'Z']]

Unnamed: 0,Y,Z
A,-1.385379,-1.32966
B,-0.771329,0.110477


In [100]:
# Condition on 2+ columns (combine with & / |, each condition in parentheses)
df[(df["W"]>0) & (df["Y"]<0)]

Unnamed: 0,W,X,Y,Z,new
A,1.60678,-1.11571,-1.385379,-1.32966,0.27712
B,0.04146,-0.411055,-0.771329,0.110477,0.151937


## Reset Index (default / custom)

In [105]:
# index becomes a column
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,new
0,A,1.60678,-1.11571,-1.385379,-1.32966,0.27712
1,B,0.04146,-0.411055,-0.771329,0.110477,0.151937
2,C,-0.804652,0.253548,0.649148,0.358941,-0.445711
3,D,-1.080471,0.902398,0.161781,0.833029,-0.247442


In [108]:
newInd = "AA BB CC DD".split()

df["Staaten"] = newInd
df

Unnamed: 0,W,X,Y,Z,new,Staaten
A,1.60678,-1.11571,-1.385379,-1.32966,0.27712,AA
B,0.04146,-0.411055,-0.771329,0.110477,0.151937,BB
C,-0.804652,0.253548,0.649148,0.358941,-0.445711,CC
D,-1.080471,0.902398,0.161781,0.833029,-0.247442,DD


In [109]:
df.set_index("Staaten")

Unnamed: 0_level_0,W,X,Y,Z,new
Staaten,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1.60678,-1.11571,-1.385379,-1.32966,0.27712
BB,0.04146,-0.411055,-0.771329,0.110477,0.151937
CC,-0.804652,0.253548,0.649148,0.358941,-0.445711
DD,-1.080471,0.902398,0.161781,0.833029,-0.247442


## Multi-Index

In [140]:
# Note the extra pair of brackets: outer2 is a list that holds one inner list
outer2 = [["G1"] * 3 + ["G2"] * 3]
print(outer2)
print(f"Type: {type(outer2)}")

[['G1', 'G1', 'G1', 'G2', 'G2', 'G2']]
Type: <class 'list'>


In [148]:
# Build a two-level index: pair each outer group label with an inner number
outer = ["G1"] * 3 + ["G2"] * 3
innter = [1, 2, 3] * 2
hier_index = [*zip(outer, innter)]

print(f"hier_index: {hier_index}")

# Turn the list of (outer, inner) tuples into a pandas MultiIndex
hier_index2 = pd.MultiIndex.from_tuples(hier_index)
print(hier_index2)

hier_index: [('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )


In [150]:
df = pd.DataFrame(np.random.randn(6,2), index=hier_index2, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.305632,0.243178
G1,2,0.864165,-1.560931
G1,3,-0.251897,-0.57812
G2,1,0.236996,0.20078
G2,2,0.327845,0.674485
G2,3,-0.174057,0.78014


In [163]:
df.loc["G1"].iloc[2]

A   -0.251897
B   -0.578120
Name: 3, dtype: float64

In [169]:
df.loc["G1"].loc[2]

A    0.864165
B   -1.560931
Name: 2, dtype: float64

In [170]:
df.index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [178]:
df.index.names = ["Group", "Num"]

In [179]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.305632,0.243178
G1,2,0.864165,-1.560931
G1,3,-0.251897,-0.57812
G2,1,0.236996,0.20078
G2,2,0.327845,0.674485
G2,3,-0.174057,0.78014


### Multi-Index **Cross-Section** indexing

In [182]:
# similar to loc / iloc BUT easier to use in MultiIndex scenarios !
df.xs(1, level=1)

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.305632,0.243178
G2,0.236996,0.20078
