In [25]:
import pandas as pd
import numpy as np

## Pandas Series

### What is a `pd.Series` object?

In [4]:
data = pd.Series([0.25, 0.5, 0.75, 1])
display(data)
print(data.index)
print(data.values)
print(data.dtype)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

RangeIndex(start=0, stop=4, step=1)
[0.25 0.5  0.75 1.  ]
float64


### Using `.items()`

In [5]:
for idx, item in data.items():
    print(f"{idx} -- {item}")

0 -- 0.25
1 -- 0.5
2 -- 0.75
3 -- 1.0


### Alternative indexing

In [6]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                    index = ['a', 'b', 'c', 'd'])

print(data['a'])

0.25


### Series from a dictionary

In [8]:
population_dict = {
                        'California' : 38332521,
                        'Texas' : 26448193,
                        'New York' : 19651127,
                        'Florida' : 19552860,
                        'Illinois' : 12882135
                    }
population = pd.Series(population_dict)
population['California' : 'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

## Pandas DataFrames

### Creating Pandas DataFrames

#### From Pandas Series

In [11]:
population_dict = {
                        'California' : 38332521,
                        'Texas' : 26448193,
                        'New York' : 19651127,
                        'Florida' : 19552860,
                        'Illinois' : 12882135
                    }
population = pd.Series(population_dict)

area_dict = {
                    'California' : 423967,
                    'Texas' : 695662,
                    'New York' : 141297,
                    'Florida' : 170312,
                    'Illinois' : 149995
                }
area = pd.Series(area_dict)

df_states = pd.DataFrame({
                            'population' : population,
                            'area' : area
                        })

display(df_states)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [12]:
# Renaming a column
df_states_units = df_states.rename({'area' : 'area (sq km)'}, axis = 'columns') # Or axis = 1

### Inspecting your dataframe

In [14]:
print(df_states.index)
print('=========================')
print(df_states.columns)
print('=========================')
print(df_states.values)
print('=========================')
print(df_states['population'])
print('=========================')
print(df_states.loc['Florida'])

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')
Index(['population', 'area'], dtype='object')
[[38332521   423967]
 [26448193   695662]
 [19651127   141297]
 [19552860   170312]
 [12882135   149995]]
California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64
population    19552860
area            170312
Name: Florida, dtype: int64


### DataFrame as list of rows

In [15]:
rows = [{'population' : 38332521, 'area' : 423967},
            {'population' : 26448193, 'area' : 695662},
            {'population' : 19651127, 'area' : 141297},
            {'population' : 19552860, 'area' : 170312},
            {'population' : 12882135, 'area' : 149995}]

row_index = ['California', 
                'Texas', 
                'New York', 
                'Florida', 
                'Illinois']

df_states = pd.DataFrame(rows, index = row_index)
display(df_states)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [16]:
# Missing values filled by np.nan

rows = [{'population' : 38332521},
            {'population' : 26448193, 'area' : 695662},
            {'area' : 141297},
            {'population' : 19552860, 'area' : 170312},
            {'population' : 12882135, 'area' : 149995}]

row_index = ['California', 
                'Texas', 
                'New York', 
                'Florida', 
                'Illinois']

df_states = pd.DataFrame(rows, index = row_index)
display(df_states)

Unnamed: 0,population,area
California,38332521.0,
Texas,26448193.0,695662.0
New York,,141297.0
Florida,19552860.0,170312.0
Illinois,12882135.0,149995.0


In [55]:
rows = [{'population' : 38332521, 'area' : 423967},
            {'population' : 26448193, 'area' : 695662},
            {'population' : 19651127, 'area' : 141297},
            {'population' : 19552860, 'area' : 170312},
            {'population' : 12882135, 'area' : 149995}]

row_index = ['California', 
                'Texas', 
                'New York', 
                'Florida', 
                'Illinois']

df_states = pd.DataFrame(rows, index = row_index)
display(df_states)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


### Op basis van een 2D array

In [56]:
row_names = df_states.index
column_names = df_states.columns
values = df_states.values
df_states = pd.DataFrame(values,
                            columns = column_names,
                            index = row_names)
display(df_states)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


### Indexen zijn verzamelingen met volgorde

In [57]:
indxa = pd.Index([2, 3, 5, 7, 11])
indxb = pd.Index([5, 7, 11, 13, 17])

print(indxa.intersection( indxb))
print(indxa.union( indxb))
print(indxa.difference( indxb))

Int64Index([5, 7, 11], dtype='int64')
Int64Index([2, 3, 5, 7, 11, 13, 17], dtype='int64')
Int64Index([2, 3], dtype='int64')


### Oefening

![image.png](attachment:image.png)

In [58]:
n_rows = 4
n_cols = 5

a = np.arange(n_rows * n_cols)
a = np.reshape(a, (n_rows, n_cols))
df = pd.DataFrame(a,
                index = [f"index{i}" for i in range(1, n_rows + 1)],
                columns = [f"column{j}" for j in range(1, n_cols + 1)])
display(df)

Unnamed: 0,column1,column2,column3,column4,column5
index1,0,1,2,3,4
index2,5,6,7,8,9
index3,10,11,12,13,14
index4,15,16,17,18,19


### Oefening
Maak een dataframe met 5 kolommen voor 5 verschillende verdelingen (bvb Normale verdeling met verscheidene parameters, uniforme verdeling, …) en als waarden 10 000 trekkingen van de verdeling.

In [59]:
sample_size = 10**4
dict_values = {
                'Standard Normal' : np.random.normal(0, 1, sample_size),
                'Uniform on [0, 1]' : np.random.rand(sample_size),
                'Weibull(2)' : np.random.weibull(1, size = sample_size),
                'Beta(1, 2)' : np.random.beta(1, 2, size = sample_size)
            }

df_samples = pd.DataFrame(dict_values)
display(df_samples)

Unnamed: 0,Standard Normal,"Uniform on [0, 1]",Weibull(2),"Beta(1, 2)"
0,3.144862,0.895505,0.115637,0.884397
1,0.509908,0.477440,0.299966,0.351161
2,0.999785,0.947551,0.670087,0.157183
3,0.155257,0.991079,1.566466,0.344372
4,1.033663,0.354902,2.219799,0.624214
...,...,...,...,...
9995,0.296077,0.586808,2.087912,0.201792
9996,-0.392251,0.073720,1.053799,0.418432
9997,0.594125,0.530986,1.550448,0.365748
9998,0.411184,0.921760,0.501438,0.356669


## pd.Series indexeren

In [60]:
data = pd.Series(np.arange(0.25, 1.3, 0.25),
                    index = ['a', 'b', 'c', 'd', 'e'])

print(data['e'])
print(data['a':'c'])
print(data[0:2])
print(data[(data > 0.3) & (data < 0.8)])
print(data[['a', 'e']])

1.25
a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.25
e    1.25
dtype: float64


In [61]:
data = pd.Series(['a', 'b', 'c'], index = [1, 3, 5])
print(data[1])
print(data[1:3])
print(data.loc[1])
print(data.loc[1:3])
print(data.iloc[1])
print(data.iloc[1:3])

a
3    b
5    c
dtype: object
a
1    a
3    b
dtype: object
b
3    b
5    c
dtype: object


## Making new columns

In [62]:
df_states['density'] = df_states['population'] / df_states['area']
display(df_states)

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [63]:
df_states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


## Pandas DataFrame indexeren

In [64]:
display(df_states.iloc[1:, :2])
display(df_states.loc['Texas':'Illinois', 'population':'area'])
display(df_states.loc[:, 'population' : 'area'])


Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


Unnamed: 0,population,area
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


## Conditioneren van DataFrames

In [65]:
df_states.population >= 26448193

California     True
Texas          True
New York      False
Florida       False
Illinois      False
Name: population, dtype: bool

In [66]:
df_states.loc[df_states.population >= 26448193, :]

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874


In [67]:
df_states_T = df_states.T
df_states_T.loc[:, df_states_T.loc['population', :] >= 26448193]

Unnamed: 0,California,Texas
population,38332520.0,26448190.0
area,423967.0,695662.0
density,90.41393,38.01874


## SettingCopyWithWarning

In [68]:
df_states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [73]:
# Creating a subset of the DataFrame using chained indexing
subset = df_states[df_states['population'] > 19552870]

# Modifying the subset
subset.loc[:, 'area'] = 100

display(df_states)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset.loc[:, 'area'] = 100


Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [70]:
# Creating a subset of the DataFrame using chained indexing
subset = df_states[df_states['population'] > 19552870].copy()

# Modifying the subset
subset.loc['California', 'area'] = 100

## Rename

In [53]:
display(df_states.rename({'area' : 'area (sq km)'}, axis = 1))
display(df_states.rename({'Texas' : 'Wisconsin'}, axis = 0))

Unnamed: 0,population,area (sq km),density
California,38332521.0,,
Texas,26448193.0,695662.0,38.01874
New York,,141297.0,
Florida,19552860.0,170312.0,114.806121
Illinois,12882135.0,149995.0,85.883763


Unnamed: 0,population,area,density
California,38332521.0,,
Wisconsin,26448193.0,695662.0,38.01874
New York,,141297.0,
Florida,19552860.0,170312.0,114.806121
Illinois,12882135.0,149995.0,85.883763


## Universele functies

In [75]:
df_states['area (in sq mtr)'] = df_states['area'] * 1000 ** 2
df_states['population'] = df_states['population'].astype(int)
df_states['density'] = df_states['population'] / df_states['area']

## Index alignment

In [89]:
df_states1 = df_states.iloc[:-1, :].copy()
df_states2 = df_states.iloc[1:, :].copy()
df_states2.loc[:, 'population'] = (df_states2.loc[:, 'population'] +
                                        np.random.randint(-10**4, 10**4, size = df_states2['population'].shape))

df_states1.population + df_states2.population

California           NaN
Florida       39109478.0
Illinois             NaN
New York      39311885.0
Texas         52896432.0
Name: population, dtype: float64

In [91]:
df_states1.population.add(-df_states2.population, fill_value=0)

California    38332521.0
Florida          -3758.0
Illinois     -12876149.0
New York         -9631.0
Texas              -46.0
Name: population, dtype: float64

In [92]:
df_states1.population.add(-df_states2.population).fillna(0)

California       0.0
Florida      -3758.0
Illinois         0.0
New York     -9631.0
Texas          -46.0
Name: population, dtype: float64

In [78]:
np.random.randint(-10**4, 10**4, size = df_states2['population'].shape)

array([-7631, -7572, -7283,   -51])

## Bewerkingen tussen DataFrame & Series

In [94]:
display(df_states)
multiplier = pd.Series([2, 1, 2, 1], index = ['population', 'area', 'density', 'area (in sq mtr)'])
df_states *= multiplier
display(df_states)


Unnamed: 0,population,area,density,area (in sq mtr)
California,38332521,423967,90.413926,423967000000
Texas,26448193,695662,38.01874,695662000000
New York,19651127,141297,139.076746,141297000000
Florida,19552860,170312,114.806121,170312000000
Illinois,12882135,149995,85.883763,149995000000


Unnamed: 0,population,area,density,area (in sq mtr)
California,76665042,423967,180.827852,423967000000
Texas,52896386,695662,76.037481,695662000000
New York,39302254,141297,278.153492,141297000000
Florida,39105720,170312,229.612241,170312000000
Illinois,25764270,149995,171.767526,149995000000


## Aggregeren

In [98]:
display(df_states.mean(axis = 0))
display(df_states.mean(axis = 1))

population          4.674673e+07
area                3.162466e+05
density             1.872797e+02
area (in sq mtr)    3.162466e+11
dtype: float64

California    1.060110e+11
Texas         1.739289e+11
New York      3.533411e+10
Florida       4.258782e+10
Illinois      3.750523e+10
dtype: float64

## Functie uitvoeren: apply

In [99]:
population_in_millions1 = df_states.population.apply(lambda x : x / 10**6)

def fun(x):
    return x / 10**6

population_in_millions2 = df_states.population.apply(fun)

density = df_states.apply(lambda x : x['population'] / x['area'], axis = 1)

## Omgaan met missing values

In [119]:
df_states_mask = df_states.copy()
df_states_mask.iloc[:, :] = np.random.randint(0, 2, df_states_mask.shape).astype(bool)
display(df_states.where(df_states_mask, '#MV'))
display(df_states.where(df_states_mask, '-9999'))
display(df_states.where(df_states_mask, np.nan))

Unnamed: 0,population,area,density,area (in sq mtr)
California,#MV,#MV,#MV,423967000000
Texas,#MV,695662,#MV,695662000000
New York,39302254,#MV,278.153492,#MV
Florida,#MV,170312,229.612241,170312000000
Illinois,#MV,#MV,#MV,149995000000


Unnamed: 0,population,area,density,area (in sq mtr)
California,-9999,-9999,-9999.0,423967000000
Texas,-9999,695662,-9999.0,695662000000
New York,39302254,-9999,278.153492,-9999
Florida,-9999,170312,229.612241,170312000000
Illinois,-9999,-9999,-9999.0,149995000000


Unnamed: 0,population,area,density,area (in sq mtr)
California,,,,423967000000.0
Texas,,695662.0,,695662000000.0
New York,39302254.0,,278.153492,
Florida,,170312.0,229.612241,170312000000.0
Illinois,,,,149995000000.0


In [121]:
vals2 = np.array([1, np.nan, 3, 4])
print(vals2.dtype)

vals3 = pd.Series([None, 1, 'a'])
print(vals3)

float64
0    None
1       1
2       a
dtype: object


In [122]:
type(np.nan)

float

In [123]:
1 + np.nan

nan

In [124]:
0 * np.nan

nan

In [128]:
df_states_nan = df_states.where(df_states_mask, np.nan)
display(df_states_nan.mean(axis = 0, skipna = True))
display(df_states_nan.mean(axis = 0, skipna = False))

population          3.930225e+07
area                4.329870e+05
density             2.538829e+02
area (in sq mtr)    3.599840e+11
dtype: float64

population         NaN
area               NaN
density            NaN
area (in sq mtr)   NaN
dtype: float64

In [129]:
df_states_nan.isnull()

Unnamed: 0,population,area,density,area (in sq mtr)
California,True,True,True,False
Texas,True,False,True,False
New York,False,True,False,True
Florida,True,False,False,False
Illinois,True,True,True,False


In [130]:
df_states_nan.notnull()

Unnamed: 0,population,area,density,area (in sq mtr)
California,False,False,False,True
Texas,False,True,False,True
New York,True,False,True,False
Florida,False,True,True,True
Illinois,False,False,False,True


In [131]:
df_states_nan.fillna(25)

Unnamed: 0,population,area,density,area (in sq mtr)
California,25.0,25.0,25.0,423967000000.0
Texas,25.0,695662.0,25.0,695662000000.0
New York,39302254.0,25.0,278.153492,25.0
Florida,25.0,170312.0,229.612241,170312000000.0
Illinois,25.0,25.0,25.0,149995000000.0


In [132]:
df_states_nan.dropna(axis = 0)

Unnamed: 0,population,area,density,area (in sq mtr)


### Forward and backward fill

In [136]:
df_states_nan.dropna?

[1;31mSignature:[0m
[0mdf_states_nan[0m[1;33m.[0m[0mdropna[0m[1;33m([0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mhow[0m[1;33m:[0m [1;34m'str'[0m [1;33m=[0m [1;34m'any'[0m[1;33m,[0m[1;33m
[0m    [0mthresh[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msubset[0m[1;33m:[0m [1;34m'IndexLabel'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Remove missing values.

See the :ref:`User Guide <missing_data>` for more on which values are
considered missing, and how to work with missing data.

Parameters
----------
axis : {0 or 'index', 1 or 'columns'}, default 0
    Determine if rows or columns which contain missing values are
    removed.

    * 0, or 'index' : Drop rows which contain missing values.
    * 1, o