## 4.6 Merging and exporting data

### This notebook contains the following sections:

#### 1. Create data to experiment on
#### 2. Concatenate dataframes
#### 3. Append data
#### 4. Merge data

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

#### 1. Create data to experiment on

In [2]:
# January 2020 purchases: one list per column, one entry per customer
data1 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'purchased_meat': [0, 13, 3, 4],
    'purchased_alcohol': [1, 2, 10, 0],
    'purchased_snacks': [10, 5, 1, 7],
}

In [3]:
# February 2020 purchases: same customers and columns as the January dictionary
data2 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Feb-20'] * 4,
    'purchased_meat': [0, 10, 5, 3],
    'purchased_alcohol': [2, 4, 14, 0],
    'purchased_snacks': [15, 3, 2, 6],
}

In [4]:
# Turn each monthly dictionary into its own dataframe, labelling the rows 0-3 explicitly
df = pd.DataFrame(data=data1, index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data=data2, index=[0, 1, 2, 3])

In [5]:
# Display the January 2020 dataframe
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [6]:
# Display the February 2020 dataframe
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


#### 2. Concatenate dataframes

In [7]:
# Create a list that contains our dataframes (January first, then February);
# this list is what gets passed to pd.concat below
frames = [df, df_1]

In [8]:
# Check the output: a plain Python list holding the two monthly dataframes
frames

[  customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Jan-20               0                  1                10
 1         767  Jan-20              13                  2                 5
 2         890  Jan-20               3                 10                 1
 3         635  Jan-20               4                  0                 7,
   customer_id   month  purchased_meat  purchased_alcohol  purchased_snacks
 0        6732  Feb-20               0                  2                15
 1         767  Feb-20              10                  4                 3
 2         890  Feb-20               5                 14                 2
 3         635  Feb-20               3                  0                 6]

In [9]:
# Stack the dataframes on top of each other (long format); axis=0 is pd.concat's default
df_concat = pd.concat(frames, axis=0)

In [10]:
# Check the output: February rows follow January rows, and the index labels 0-3 repeat
df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [11]:
# Place the dataframes side by side (wide format) by concatenating along the columns.
# NOTE: this rebinds df_concat, replacing the row-wise result created earlier.
df_concat = pd.concat(frames, axis='columns')

In [12]:
# Check the output: both months now share one row per index label, so every column name appears twice
df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,customer_id.1,month.1,purchased_meat.1,purchased_alcohol.1,purchased_snacks.1
0,6732,Jan-20,0,1,10,6732,Feb-20,0,2,15
1,767,Jan-20,13,2,5,767,Feb-20,10,4,3
2,890,Jan-20,3,10,1,890,Feb-20,5,14,2
3,635,Jan-20,4,0,7,635,Feb-20,3,0,6


#### 4. Merge data

In [13]:
# Source data for the third dataframe: a days_purchased_on count per customer for January 2020
data3 = {
    'customer_id': ['6732', '767', '890', '635'],
    'month': ['Jan-20'] * 4,
    'days_purchased_on': [0, 10, 4, 1],
}

In [14]:
# Convert the third dictionary into a dataframe, labelling the rows 0-3 explicitly
df_2 = pd.DataFrame(data=data3, index=[0, 1, 2, 3])

In [15]:
# Check the output: the third dataframe shares customer_id and month with df
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,10
2,890,Jan-20,4
3,635,Jan-20,1


In [16]:
# Merge df and df_2 on the composite key (customer_id, month).
# indicator=True adds a _merge column recording where each row was found;
# how='inner' is pandas' default join type, spelled out here for clarity.
df_merged = df.merge(
    df_2,
    how='inner',
    on=['customer_id', 'month'],
    indicator=True,
)

In [17]:
# Check the output: days_purchased_on and the _merge flag are appended to df's columns
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


In [18]:
# Frequency check of the _merge flag: counts how many rows fall under each merge status
df_merged['_merge'].value_counts()

_merge
both          4
left_only     0
right_only    0
Name: count, dtype: int64

In [19]:
# Run the same merge through pd.merge without assigning the result,
# so df_merged is left untouched and the merged frame is only displayed
pd.merge(left=df, right=df_2, on=['customer_id', 'month'], indicator=True)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,10,both
2,890,Jan-20,3,10,1,4,both
3,635,Jan-20,4,0,7,1,both


#### Importing dataframes

In [20]:
# Project root folder used to build every data file path below.
# Set the INSTACART_PROJECT_DIR environment variable to run this notebook on another
# machine; when it is not set, the original local Windows path is used as the fallback.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\Marvin\Instacart Basket Analysis')

In [22]:
# Load the prior order-products file from the project's original data folder.
# index_col=False tells pandas not to use the first column as the index.
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)