In [1]:
import pandas as pd
import numpy as np

# Render floats with a thousands separator and one decimal place.
# The earlier spec '{:..1f}' is not a valid format string: it raised
# ValueError as soon as pandas tried to display a float value.
pd.options.display.float_format = '{:,.1f}'.format
# Show up to 100 rows before pandas truncates a displayed frame.
pd.options.display.max_rows = 100

from IPython.display import display

In [2]:
# Read the employee list from its Parquet file (path is relative to the
# notebook's working directory) and preview the first five rows.
path = 'datasets/employee_list.parquet'
df = pd.read_parquet(path)
df.head(n=5)

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


Find number of rows

In [3]:
# Two equivalent ways to get the number of rows: the first entry of the
# (rows, columns) shape tuple, and the size of the index. One per line.
print(df.shape[0], df.index.size, sep='\n')

100
100


## Append
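For stacking vertically. Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat` is the supported replacement.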

In [4]:
# Two disjoint two-row slices with identical columns, keyed by employee_number.
df1 = df.iloc[:2, :].set_index('employee_number')
df2 = df.iloc[2:4, :].set_index('employee_number')

# Stack df2 underneath df1. `DataFrame.append` was deprecated in pandas 1.4
# and removed in 2.0; `pd.concat` produces the same vertically stacked frame
# and works on every pandas version.
pd.concat([df1, df2])

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False


Appending a dataframe that is missing a column results in NaNs in that column for the appended rows.

In [5]:
# First two rows with every column, keyed by employee_number.
df1 = df.iloc[:2, :].set_index('employee_number')

# df2 will miss column 'has_parking_space' (the last column is sliced off)
df2 = df.iloc[2:4, :-1].set_index('employee_number')

# Stack the frames; rows coming from df2 get NaN in the column they lack.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`, which
# aligns columns the same way.
pd.concat([df1, df2])

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,


## Concat
- For stacking many horizontally or vertically
- Simple inner/outer joins on Indexes

#### Concat along the rows

In [6]:
# Number of columns in df1: second entry of the (rows, columns) shape tuple.
df1.shape[1]

8

In [7]:
# df1 is missing column 'country': keep column positions 0-2 and 4-8,
# skipping position 3.
df1 = df.iloc[:2, np.r_[0:3, 4:9]].set_index('employee_number')

# df2 is missing column 'has_parking_space': drop the last column position.
df2 = df.iloc[2:4, :-1].set_index('employee_number')

# Stack along the row axis; columns absent from one frame are filled with NaN.
pd.concat([df1, df2], axis='index')

Unnamed: 0_level_0,name,company,dob,age,department,salary,has_parking_space,country
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
897028,Kenneth Jensen,Wilson and Sons,1983-07-03,38,Management,124790,False,
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",1980-08-09,41,Consulting,103122,True,
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",1987-07-29,34,Consulting,119072,,Cayman Islands
267447,Seth Smith,Spears-Brown,1969-03-04,52,System Architect,115653,,Germany


#### Concat along the columns

In [8]:
# Work on an independent copy keyed by employee_number, leaving `df` untouched.
df_copy = df.copy(deep=True).set_index('employee_number')

# Split the first five rows into two halves by column position:
# df1 holds the first 4 columns, df2 holds everything from position 4 onward.
df1 = df_copy.iloc[:5, :4]
df2 = df_copy.iloc[:5, 4:]

# Glue the halves back together side by side, aligned on the shared index.
pd.concat([df1, df2], axis=1).head()

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


### Concat series

In [9]:
# Two series sharing the labels A, B, C.
ser1 = pd.Series({'A': 10, 'B': 20, 'C': 30})
ser2 = pd.Series({'A': 40, 'B': 50, 'C': 60})

# Stacking keeps the original labels, so A/B/C each appear twice in the result.
pd.concat((ser1, ser2))

A    10
B    20
C    30
A    40
B    50
C    60
dtype: int64

In [10]:
# Same two series as before, labelled A, B, C.
ser1 = pd.Series({'A': 10, 'B': 20, 'C': 30})
ser2 = pd.Series({'A': 40, 'B': 50, 'C': 60})

# ignore_index=True discards the labels and renumbers the result from 0.
pd.concat((ser1, ser2), ignore_index=True)

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

## Join
- inner/outer/left/right joins on Indexes

## Merge
- Many joins on multiple columns

### Example 2

In [87]:
# employee dataframe: three randomly sampled employees, identifying columns only
use_cols = ['employee_number', 'name', 'company', 'department']
employees = df[use_cols].sample(3)

# employee system logins dataframe
from faker import Faker
fake = Faker()

# Collect one record per login and build the frame once at the end.
# Growing a DataFrame row by row is quadratic, and `DataFrame.append` was
# removed in pandas 2.0.
login_records = []
for number in employees.employee_number.tolist():
    # randint's upper bound is exclusive: each employee gets 2, 3 or 4 logins.
    for _ in range(np.random.randint(low=2, high=5)):
        # Named `login_time` rather than `datetime` so the stdlib module name
        # is not shadowed in the notebook namespace.
        login_time = fake.date_time_between(start_date='-5d', end_date='now')
        login_records.append({'employee_number': number, 'system_login': login_time})

# Passing `columns` keeps both columns present even if no record was generated.
logins = pd.DataFrame(login_records, columns=['employee_number', 'system_login'])
# Order chronologically and renumber the rows from 0.
logins = logins.sort_values('system_login').reset_index(drop=True)

In [88]:
# Report how many employees were sampled, then display the frame itself.
print('Number of rows in employees dataframe: ', len(employees))
employees

Number of rows in employees dataframe:  3


Unnamed: 0,employee_number,name,company,department
60,920539,Robert Wang,"Hernandez, Cunningham and Clark",Finance
42,907813,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect
84,567812,Robin Mcmahon,Wilson and Sons,Management


In [89]:
# Report how many login events exist, then preview the earliest five.
print('Number of rows in logins dataframe: ', len(logins))
logins.head()

Number of rows in logins dataframe:  8


Unnamed: 0,employee_number,system_login
0,907813,2022-01-30 21:57:51
1,920539,2022-01-30 23:38:15
2,567812,2022-01-31 10:57:22
3,567812,2022-01-31 19:24:38
4,907813,2022-02-01 21:37:29


In [90]:
# Inner join on the shared 'employee_number' column: one output row per
# matching (employee, login) pair.
merged = employees.merge(logins, how='inner', on='employee_number')
print('Number of rows in merged dataframe: ', len(merged))
merged

Number of rows in merged dataframe:  8


Unnamed: 0,employee_number,name,company,department,system_login
0,920539,Robert Wang,"Hernandez, Cunningham and Clark",Finance,2022-01-30 23:38:15
1,920539,Robert Wang,"Hernandez, Cunningham and Clark",Finance,2022-02-02 05:27:39
2,907813,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,2022-01-30 21:57:51
3,907813,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,2022-02-01 21:37:29
4,907813,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,2022-02-01 22:54:46
5,907813,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,2022-02-02 21:09:07
6,567812,Robin Mcmahon,Wilson and Sons,Management,2022-01-31 10:57:22
7,567812,Robin Mcmahon,Wilson and Sons,Management,2022-01-31 19:24:38


### Example 3

In [91]:
# Key the employees frame by employee_number so it can be joined via its index.
# Guarded so that re-running this cell is a no-op: once the column has become
# the index it no longer exists as a column, and an unguarded set_index would
# raise KeyError.
if employees.index.name != 'employee_number':
    employees = employees.set_index('employee_number')

In [92]:
# Left join: match the index of `employees` against the 'employee_number'
# column of `logins`, keeping every employee row.
merged = pd.merge(
    employees,
    logins,
    how='left',
    left_index=True,
    right_on='employee_number',
)
merged

Unnamed: 0,name,company,department,employee_number,system_login
1,Robert Wang,"Hernandez, Cunningham and Clark",Finance,920539,2022-01-30 23:38:15
6,Robert Wang,"Hernandez, Cunningham and Clark",Finance,920539,2022-02-02 05:27:39
0,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,907813,2022-01-30 21:57:51
4,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,907813,2022-02-01 21:37:29
5,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,907813,2022-02-01 22:54:46
7,Jennifer Thompson,"Hernandez, Cunningham and Clark",System Architect,907813,2022-02-02 21:09:07
2,Robin Mcmahon,Wilson and Sons,Management,567812,2022-01-31 10:57:22
3,Robin Mcmahon,Wilson and Sons,Management,567812,2022-01-31 19:24:38


In [93]:
# Shape of the logins frame as a (rows, columns) tuple.
logins.shape

(8, 2)

In [80]:
# Shape of the merged frame as a (rows, columns) tuple.
# NOTE(review): this cell's execution count (80) is lower than that of the cell
# that builds `merged` (92), so the recorded output appears to come from an
# earlier run — re-run to refresh it.
merged.shape

(5, 5)

In [84]:
# Row count of the default (inner) join between the index of `employees` and
# the 'employee_number' column of `logins`.
len(employees.merge(logins, left_index=True, right_on='employee_number'))

5

In [44]:
from faker import Faker
fake = Faker()

# In notebook order `employees` has already been re-indexed by
# employee_number, so `employees.employee_number` would raise AttributeError.
# reset_index() exposes the numbers as a column whether they currently live in
# the index or in a regular column.
employee_numbers = employees.reset_index()['employee_number'].tolist()

# Collect one record per login and build the frame once. `DataFrame.append`
# was removed in pandas 2.0 and row-by-row growth is quadratic.
login_records = []
for number in employee_numbers:
    # randint's upper bound is exclusive: each employee gets 1 to 4 logins.
    for _ in range(np.random.randint(low=1, high=5)):
        # `login_time` avoids shadowing the stdlib `datetime` module name.
        login_time = fake.date_time_between(start_date='-5d', end_date='now')
        login_records.append({'employee_number': number, 'system_login': login_time})

# Passing `columns` keeps both columns present even if no record was generated.
logins = pd.DataFrame(login_records, columns=['employee_number', 'system_login'])
# Order chronologically and renumber the rows from 0.
logins = logins.sort_values('system_login').reset_index(drop=True)
        
        
    

In [45]:
# Display the full logins frame.
logins

Unnamed: 0,employee_number,system_login
0,701315,2022-01-29 19:20:45
1,426293,2022-01-29 21:36:53
2,701315,2022-01-30 22:51:33
3,919280,2022-01-31 13:28:15
4,701315,2022-01-31 17:14:29
5,426293,2022-01-31 19:22:19
6,701315,2022-01-31 22:31:17
7,248013,2022-02-01 18:06:17
8,248013,2022-02-02 14:52:59
9,426293,2022-02-02 18:14:49


In [20]:
# Scratch check of Faker's date_time_between with a window from '-30d' to
# 'now'; the recorded output is a datetime.datetime value.
fake.date_time_between(start_date='-30d', end_date='now')

datetime.datetime(2022, 1, 6, 6, 49, 2)

In [30]:
# Start from an empty frame that has only the two column labels and no rows.
tbl = pd.DataFrame(columns=['employee_number', 'system_login'])

In [31]:
# Display the (still empty) frame to confirm the column headers.
tbl

Unnamed: 0,employee_number,system_login


In [34]:
# Add one record to `tbl`. `DataFrame.append` was removed in pandas 2.0, so
# wrap the record in a one-row frame and concatenate; ignore_index renumbers
# the rows from 0. Note that every re-run of this cell adds another row.
tbl = pd.concat(
    [tbl, pd.DataFrame([{'employee_number': 5, 'system_login': 'today'}])],
    ignore_index=True,
)

In [35]:
# Preview `tbl` with one more record added, without reassigning `tbl`.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with a one-row
# frame gives the same result.
pd.concat(
    [tbl, pd.DataFrame([{'employee_number': 5, 'system_login': 'today'}])],
    ignore_index=True,
)

Unnamed: 0,employee_number,system_login
0,5,today
1,5,today
