# Pandas

https://pandas.pydata.org/pandas-docs/stable/index.html

Based on NumPy  
Used for:
- **Cleansing**, **Wrangling** and quick **Analysis** & **Visualization**
- High **performance** and high **data-compatibility**

### Content:  

1. Series & Data-Frames
1. Missing Data
1. Aggregation
    - group by
    - merging, joining, concatenating
1. Operations  
1. I/O


### Remarks
- Pass **inplace=True** to modify the DataFrame itself instead of getting a modified copy!
- Combine DataFrame conditions with **&** (and) or **|** (or), wrapping each condition in parentheses


# Series

### Create pd.Series

In [1]:
import numpy as np
import pandas as pd

from numpy.random import randn

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Sample inputs reused by the Series examples: index labels, two plain lists,
# a NumPy array built from the second list, and a label -> value dictionary.
labels1 = list("abc")
liste = [17, 15, 13]
liste1 = [10, 20, 30]
arr = np.array(liste1)
dic = dict(zip(labels1, liste1))

arr
dic

array([10, 20, 30])

{'a': 10, 'b': 20, 'c': 30}

In [4]:
# Series from a list (gets a default integer index)
print(pd.Series(liste))  # same as: print(pd.Series(data=liste))

# with labels
print(pd.Series(data=liste1, index=labels1))
# pd.Series(liste1, labels1)   # short positional form


0    17
1    15
2    13
dtype: int64
a    10
b    20
c    30
dtype: int64


In [5]:
# Series from a NumPy array
print(pd.Series(arr))

# with labels
print(pd.Series(arr, labels1))

0    10
1    20
2    30
dtype: int64
a    10
b    20
c    30
dtype: int64


In [6]:
# Series from a dictionary (the keys become the index labels)
print(pd.Series(dic))

a    10
b    20
c    30
dtype: int64


In [7]:
# Series holding function objects
pd.Series([sum, print, len])

# Remark: Pandas is flexible! A Series can hold arbitrary Python objects (dtype: object).

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

### Retrieve data from pd.Series

In [8]:
ser1 = pd.Series([1, 2, 3, 4], ["USA", "Germany", "Russia", "Japan"])
ser2 = pd.Series([5, 6, 7, 8], index=["USA", "Italy", "Germany", "France"])
print(ser1)
print()

print(ser2)
print()

ser1["Russia"]

USA        1
Germany    2
Russia     3
Japan      4
dtype: int64

USA        5
Italy      6
Germany    7
France     8
dtype: int64



3

In [9]:
ser1["Russia"]

3

In [10]:
# Adding Series
# ... values are matched by index label; labels missing from either Series give NaN
# ... the resulting index is ordered lexicographically

ser1 + ser2

France     NaN
Germany    9.0
Italy      NaN
Japan      NaN
Russia     NaN
USA        6.0
dtype: float64

# Data Frames

- Create and Drop
- Indexing  
  - Rows / Columns
  - Subsets
- Conditional Output
- Reset Index
- Multi-Index

In [11]:
np.random.seed(101)

In [12]:
# 5x4 frame of standard-normal samples: rows labelled A..E, columns W..Z.
df = pd.DataFrame(
    randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [13]:
df[["W", "Z"]]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [14]:
df["W"]
print(type(df))
print(type(df["W"]))

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


## Create / Drop (columns, rows)

In [15]:
# Create a new column in the DataFrame

df["new"] = df["W"] + df["Z"]
df

# Drop a column (axis=1): returns a new DataFrame, df itself still keeps 'new'
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
# Drop a row (axis=0) permanently: inplace=True modifies df itself
df.drop('E', axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752


## Indexing Rows

In [17]:
# Get a row by its index label
df.loc['B']

W      0.651118
X     -0.319318
Y     -0.848077
Z      0.605965
new    1.257083
Name: B, dtype: float64

In [18]:
# Get a row by its integer position (0-based, so 2 is the third row)
df.iloc[2]

W     -2.018168
X      0.740122
Y      0.528813
Z     -0.589001
new   -2.607169
Name: C, dtype: float64

## Indexing Subsets (col + row)

In [19]:
df.loc['A', 'X']

# equivalent with integer positions (row 0, column 1)
# df.iloc[0, 1]

0.6281327087844596

In [20]:
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


## Conditional Output

In [21]:
df>0

Unnamed: 0,W,X,Y,Z,new
A,True,True,True,True,True
B,True,False,False,True,True
C,False,True,True,False,False
D,True,False,False,True,True


In [22]:
df[df["W"]>0]

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
D,0.188695,-0.758872,-0.933237,0.955057,1.143752


In [23]:
# Filter rows by a condition on one column and keep only columns Y and Z.
# A single .loc call selects rows and columns together instead of chaining
# two [] lookups (which builds an intermediate DataFrame first).
df.loc[df["W"] > 0, ['Y', 'Z']]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
D,-0.933237,0.955057


In [24]:
# Condition on 2+ columns: combine with & and parenthesize each comparison
df[(df["W"]>0) & (df["Y"]<0)]

Unnamed: 0,W,X,Y,Z,new
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
D,0.188695,-0.758872,-0.933237,0.955057,1.143752


## Reset Index (default / custom)

In [25]:
# The old index becomes a regular column named 'index' and a default integer
# index is added; this returns a new DataFrame, df itself is not modified.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z,new
0,A,2.70685,0.628133,0.907969,0.503826,3.210676
1,B,0.651118,-0.319318,-0.848077,0.605965,1.257083
2,C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
3,D,0.188695,-0.758872,-0.933237,0.955057,1.143752


In [26]:
newInd = "AA BB CC DD".split()

df["Staaten"] = newInd
df

Unnamed: 0,W,X,Y,Z,new,Staaten
A,2.70685,0.628133,0.907969,0.503826,3.210676,AA
B,0.651118,-0.319318,-0.848077,0.605965,1.257083,BB
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169,CC
D,0.188695,-0.758872,-0.933237,0.955057,1.143752,DD


In [27]:
df.set_index("Staaten")

Unnamed: 0_level_0,W,X,Y,Z,new
Staaten,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,2.70685,0.628133,0.907969,0.503826,3.210676
BB,0.651118,-0.319318,-0.848077,0.605965,1.257083
CC,-2.018168,0.740122,0.528813,-0.589001,-2.607169
DD,0.188695,-0.758872,-0.933237,0.955057,1.143752


## Multi-Index

In [28]:
# Wrapping split() in [...] yields a nested list: one outer list holding the list of labels.
outer2 = ['G1 G1 G1 G2 G2 G2'.split()]
print(outer2)
print(f'Type: {type(outer2)}')

[['G1', 'G1', 'G1', 'G2', 'G2', 'G2']]
Type: <class 'list'>


In [29]:
# Build a two-level (hierarchical) index from two parallel label lists:
# pair them up as (outer, inner) tuples, then convert to a MultiIndex.
outer = ['G1'] * 3 + ['G2'] * 3
innter = [1, 2, 3] * 2
hier_index = [(group, num) for group, num in zip(outer, innter)]

print(f'hier_index: {hier_index}')

hier_index2 = pd.MultiIndex.from_tuples(hier_index)
print(hier_index2)

hier_index: [('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )


In [30]:
df = pd.DataFrame(np.random.randn(6,2), index=hier_index2, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [31]:
# Select group G1 by label, then take its third row by integer position (inner label 3)
df.loc["G1"].iloc[2]

A   -0.134841
B    0.390528
Name: 3, dtype: float64

In [32]:
# Select group G1 by label, then the row whose inner index label is 2
df.loc["G1"].loc[2]

A   -1.706086
B   -1.159119
Name: 2, dtype: float64

In [33]:
df.index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [34]:
df.index.names = ["Group", "Num"]

In [35]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


### Multi-Index **Cross-Section** indexing

In [36]:
# Cross-section: similar to loc / iloc BUT easier to use in MultiIndex scenarios!
# Here: all rows whose second index level (level=1, "Num") equals 1
df.xs(1, level=1)

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
