# Setup 

In [309]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from thefuzz import fuzz
from thefuzz import process

# Load data

In [310]:
# Read the first five sheets of the workbook in one call; read_excel returns a
# dict keyed by sheet position when sheet_name is a list.
SHEET_POSITIONS = [0, 1, 2, 3, 4]
MRW = pd.read_excel('../data/MRW.xlsx', sheet_name=SHEET_POSITIONS)

# One frame per sheet, in workbook order.
MRWT, MRWGP, MRWSKU, MRWU, MRWER = (MRW[position] for position in SHEET_POSITIONS)

# MRWT

In [311]:
# Display the frame loaded from the first sheet of the workbook (sheet position 0).
MRWT

Unnamed: 0,Distributor,ProductName,ProductSKU,ScanDate,UserId
0,B,Mobil Super AIO - Charcoal,Mobil Super AIO - 7L,2023-11-29,EXL_1692444744991
1,B,Mobil Super AIO - Charcoal,Mobil Super AIO - 7L,2023-11-29,EXL_1692444744991
2,B,Mobil Super AIO - Charcoal,Mobil Super AIO - 7L,2023-11-29,EXL_1692444744991
3,B,Mobil Super AIO - Charcoal,Mobil Super AIO - 7L,2023-11-29,EXL_1692444744991
4,B,Mobil Super AIO - Charcoal,Mobil Super AIO - 7L,2023-11-29,EXL_1692444744991
...,...,...,...,...,...
215651,C,Mobil Super FF,Mobil Super FF 4L - 7L,2023-06-19,EXL_1686836958772
215652,C,Mobil Super FF,Mobil Super FF 4L - 7L,2023-06-19,EXL_1686836958772
215653,C,Mobil Super FF,Mobil Super FF 4L - 7L,2023-06-19,EXL_1686836958772
215654,C,Mobil Super FF,Mobil Super FF 4L - 7L,2023-06-15,EXL_1686836958772


In [312]:
# Column dtypes of MRWT, to confirm how pandas parsed each column.
MRWT.dtypes

Distributor            object
ProductName            object
ProductSKU             object
ScanDate       datetime64[ns]
UserId                 object
dtype: object

## Check null (general)

In [313]:
# Per column: does MRWT contain at least one missing value?
MRWT.isnull().any(axis=0)

Distributor    False
ProductName    False
ProductSKU     False
ScanDate       False
UserId         False
dtype: bool

There are indeed duplicated rows, so there is no need to check for them.

## Distributor

In [314]:
# Distinct distributor codes, in order of first appearance.
pd.unique(MRWT['Distributor'])

array(['B', 'A', 'C', 'D'], dtype=object)

## ProductName

In [315]:
# Distinct product names, in order of first appearance.
pd.unique(MRWT['ProductName'])

array(['Mobil Super AIO - Charcoal', 'Mobil Super FF',
       'Mobil Delvac Modern', 'Mobil Super AIO', 'Mobil Delvac Legend',
       'Mobil 1 Gold', 'Silver Large', 'Blue Large', 'Gold Large'],
      dtype=object)

## ProductSKU

In [316]:
# Number of distinct SKUs seen in MRWT (a missing value would count as one).
MRWT['ProductSKU'].nunique(dropna=False)
# Left disabled by the author and flagged with "!!!":
#MRWT.groupby(['ProductName', 'ProductSKU']).count() #!!!

10

## ProductSKU (SKU)

In [317]:
# Number of distinct SKUs listed in the SKU sheet (a missing value would count as one).
MRWSKU['ProductSKU'].nunique(dropna=False)

10

## ScanDate

In [318]:
# Earliest and latest scan dates present in MRWT.
scanDates = MRWT['ScanDate']
MRWTScanDateMin, MRWTSCanDateMax = scanDates.min(), scanDates.max()
print(f"min:{MRWTScanDateMin}, max:{MRWTSCanDateMax}")

min:2023-01-01 00:00:00, max:2023-12-31 00:00:00


## UserId

In [319]:
# Distinct-user count plus the smallest/largest id (string comparison) in MRWT.
scanUserIds = MRWT['UserId']
MRWTUserIdCount = scanUserIds.nunique(dropna=False)
MRWTUserIdMin, MRWTUserIdMax = scanUserIds.min(), scanUserIds.max()
print(f"Count:{MRWTUserIdCount}, min:{MRWTUserIdMin}, max:{MRWTUserIdMax}")

Count:1428, min:EXL_1569867483497, max:EXL_1703840783686


## OwnerId (Garage Profile)

In [320]:
# Distinct-owner count plus the smallest/largest id (string comparison) in MRWGP.
garageOwnerIds = MRWGP['OwnerId']
MRWGPUserIdCount = garageOwnerIds.nunique(dropna=False)
MRWGPUserIdMin, MRWGPUserIdMax = garageOwnerIds.min(), garageOwnerIds.max()
print(f"Count:{MRWGPUserIdCount}, min:{MRWGPUserIdMin}, max:{MRWGPUserIdMax}")

Count:4006, min:EXL_1569434212261, max:EXL_1703844536464


## Postcode (Garage Profile)

In [321]:
# Peek at the raw postcode values. Only the last expression of a cell is rendered,
# so this result is not shown.
MRWGP['Postcode'].unique()

# The "District" header in the sheet carries a trailing space; normalise it.
MRWGP.rename(columns={"District ": "District"}, inplace=True)

# Store postcodes as nullable integers so missing values survive the conversion.
MRWGP["Postcode"] = MRWGP["Postcode"].astype("Int64")

# Rows where a postcode exists but the district is only the bare "เขต" prefix.
hasPostcode = MRWGP['Postcode'].notna()
districtIsPlaceholder = MRWGP['District'] == "เขต"
MRWGP[hasPostcode & districtIsPlaceholder]

Unnamed: 0,OwnerId,Postcode,Province,District,Distributor
100,EXL_1657555803230,64000,สุโขทัย,เขต,C
191,EXL_1576587181605,11000,นนทบุรี,เขต,A
348,EXL_1573049265095,11000,นนทบุรี,เขต,A
361,EXL_1572691481498,11000,นนทบุรี,เขต,A
481,EXL_1579261368753,11000,นนทบุรี,เขต,A
680,EXL_1605709537255,60000,นครสวรรค์,เขต,C
1009,EXL_1700227450476,64000,สุโขทัย,เขต,C
1097,EXL_1601549347440,60000,นครสวรรค์,เขต,C
1140,EXL_1575643636201,60000,นครสวรรค์,เขต,C
1456,EXL_1585064765189,60000,นครสวรรค์,เขต,C


In [322]:
# Reference data source:
# https://github.com/rathpanyowat/Thai-zip-code-latitude-and-longitude/blob/master/data.json

postcode = pd.read_excel("../data/PostcodeData.xlsx")
# Prefix the names with "เขต" so they can be written into MRWGP's District column.
postcode['district'] = "เขต" + postcode['district']

# zip -> district lookup; when a zip occurs more than once the last row wins.
postcodeDict = postcode.set_index("zip")["district"].to_dict()

In [323]:
def findDistrict(record, lookup=None):
    """Return the district name registered for a postcode, or NaN when unknown.

    Parameters
    ----------
    record : hashable
        The postcode to look up.
    lookup : dict, optional
        Mapping of postcode -> district name. Defaults to the notebook-level
        ``postcodeDict`` so existing ``Series.apply(findDistrict)`` calls keep working.

    Returns
    -------
    The mapped district name, or ``np.nan`` if ``record`` is not a key of the mapping.
    """
    if lookup is None:
        # Resolved at call time, so the global only needs to exist when it is actually used.
        lookup = postcodeDict
    return lookup.get(record, np.nan)

# Rows that have a postcode but only the bare "เขต" prefix as their district.
placeholderRows = MRWGP['Postcode'].notna() & MRWGP['District'].eq("เขต")

# Look each postcode up and write the result back; postcodes without a match become NaN.
districtLabel = MRWGP.loc[placeholderRows, "Postcode"].apply(findDistrict)
MRWGP.loc[placeholderRows, "District"] = districtLabel

# Rows whose postcode could not be resolved (a single row in the recorded run).
MRWGP[MRWGP['Postcode'].notna() & MRWGP['District'].isna()]

Unnamed: 0,OwnerId,Postcode,Province,District,Distributor
2069,EXL_1694685410503,11100,นนทบุรี,,A


In [324]:
 # apparently this postcode already appears in the data, and it corresponds to this district name
# Fill only the rows with postcode 11100 whose District is still missing. Rows that
# already carry a district for this postcode are left untouched rather than overwritten.
# fillna(False) turns the <NA> comparisons of the nullable Postcode column into False.
unresolved11100 = (MRWGP['Postcode'].eq(11100) & MRWGP['District'].isna()).fillna(False)
MRWGP.loc[unresolved11100, 'District'] = "เขตเมืองนนทบุรี"

# Sanity check: no row with a postcode should be left without a district.
MRWGP[(MRWGP['Postcode'].notna()) & (MRWGP['District'].isna())]

Unnamed: 0,OwnerId,Postcode,Province,District,Distributor


Since we will be using the district name, we will not continue cleaning the postcode.

## Province (Garage Profile)

In [325]:
# Distinct province names, in order of first appearance.
pd.unique(MRWGP['Province'])

array(['สุพรรณบุรี', 'เชียงราย', 'กรุงเทพมหานคร', 'สมุทรปราการ',
       'นครราชสีมา', 'พิษณุโลก', 'ระยอง', 'ชลบุรี', 'ปทุมธานี',
       'เชียงใหม่', 'ขอนแก่น', 'ภูเก็ต', 'สระบุรี', 'สมุทรสาคร',
       'นนทบุรี', 'สุราษฎร์ธานี', 'พัทลุง', 'สมุทรสงคราม', 'กำแพงเพชร',
       'นครสวรรค์', 'อุตรดิตถ์', 'ตราด', 'ชัยนาท', 'นครปฐม', 'ราชบุรี',
       'สระแก้ว', 'นครศรีธรรมราช', 'พระนครศรีอยุธยา', 'ศรีสะเกษ',
       'ฉะเชิงเทรา', 'อุบลราชธานี', 'อุดรธานี', 'สุโขทัย', 'น่าน',
       'เพชรบูรณ์', 'สิงห์บุรี', 'เพชรบุรี', 'พังงา', 'พิจิตร',
       'จันทบุรี', 'ประจวบคีรีขันธ์', 'พะเยา', 'กระบี่', 'บึงกาฬ',
       'ลำปาง', 'ตรัง', 'สงขลา', 'ลำพูน', 'แพร่', 'บุรีรัมย์', 'เลย',
       'ยโสธร', 'ลพบุรี', 'ปราจีนบุรี', 'อำนาจเจริญ', 'สุรินทร์',
       'สกลนคร', 'ชุมพร', 'ร้อยเอ็ด', 'ปัตตานี', 'นครพนม', 'หนองคาย',
       'ชัยภูมิ', 'กาฬสินธุ์', 'นครนายก', 'ตาก', 'ระนอง', 'มหาสารคาม',
       'แม่ฮ่องสอน', 'กาญจนบุรี', 'หนองบัวลำภู', 'มุกดาหาร', 'อุทัยธานี',
       'ยะลา', 'นราธิวาส', 'อ่างทอง', 'สตูล'], 

## District (Garage Profile)

In [326]:
# A district equal to the bare "เขต" prefix carries no information: store it as NaN.
MRWGP['District'] = MRWGP['District'].mask(MRWGP['District'] == "เขต")

# Rows left without a district; in the recorded output they have no postcode either,
# so nothing can be inferred for them.
MRWGP[MRWGP['District'].isnull()]

Unnamed: 0,OwnerId,Postcode,Province,District,Distributor
643,EXL_1627239745637,,กรุงเทพมหานคร,,A
735,EXL_1676365474904,,กรุงเทพมหานคร,,A
1088,EXL_1570717772373,,เชียงราย,,C
1113,EXL_1684409855383,,กรุงเทพมหานคร,,A
1524,EXL_1662127146907,,เพชรบูรณ์,,C
1547,EXL_1663447911873,,กรุงเทพมหานคร,,A
1579,EXL_1594294562047,,ภูเก็ต,,D
1596,EXL_1652786934570,,กรุงเทพมหานคร,,A
1688,EXL_1592653287874,,ราชบุรี,,B
2370,EXL_1643035798677,,ศรีสะเกษ,,A


## Points/Bottle (ProductSKU)

In [327]:
# Points column of the SKU sheet (the section heading describes it as points per bottle).
MRWSKU['Points']

0    60
1    50
2    50
3    45
4    40
5    40
6    35
7    20
8    15
9    10
Name: Points, dtype: int64

## Volume (ProductSKU)

In [328]:
# Volume column of the SKU sheet; the column name gives the unit as litres.
MRWSKU['Volume (L)']

0    6
1    4
2    4
3    7
4    5
5    5
6    6
7    7
8    5
9    7
Name: Volume (L), dtype: int64

## GarageId (User)

In [329]:
garageIds = MRWU['GarageId']
# Null check; only the last expression of a cell is rendered, so this result is not shown.
garageIds.isna().any()

# Smallest and largest garage id (string comparison).
MRWUGarIdMin, MRWUGarIdMax = garageIds.min(), garageIds.max()
print(f"min: {MRWUGarIdMin}, max:{MRWUGarIdMax}")

min: AA3539, max:ZZ9482


## UserType (User)

In [330]:
# Distinct user types, in order of first appearance.
pd.unique(MRWU['UserType'])

array(['Owner', 'Employee'], dtype=object)

# Join time!

## Prepare Table

### MRWT

In [331]:
# Collapse identical rows into one row per unique combination of all columns,
# recording how many times each combination occurred in "Count".
# Guard: this cell rebinds MRWT, so re-running it would otherwise treat "Count" as a
# grouping key and silently produce a second "Count" column.
if "Count" not in MRWT.columns:
    cols = MRWT.columns.tolist()
    MRWT = MRWT.groupby(cols).size().reset_index(name="Count")
MRWT

Unnamed: 0,Distributor,ProductName,ProductSKU,ScanDate,UserId,Count
0,A,Blue Large,Blue - Large 4L - 5L,2023-01-04,EXL_1573744836492,2
1,A,Blue Large,Blue - Large 4L - 5L,2023-01-17,EXL_1573744836492,1
2,A,Blue Large,Blue - Large 4L - 5L,2023-03-01,EXL_1573744836492,1
3,A,Blue Large,Blue - Large 4L - 5L,2023-05-03,EXL_1619694294901,1
4,A,Blue Large,Blue - Large 4L - 5L,2023-06-15,EXL_1592832015021,1
...,...,...,...,...,...,...
36010,D,Silver Large,Silver - Large 4L,2023-12-26,EXL_1636202798697,4
36011,D,Silver Large,Silver - Large 4L,2023-12-26,EXL_1691757071956,1
36012,D,Silver Large,Silver - Large 4L,2023-12-27,EXL_1691757071956,2
36013,D,Silver Large,Silver - Large 4L,2023-12-28,EXL_1691757071956,1


### MRWU

In [332]:
# Display the user sheet: one row per UserId with its GarageId and UserType.
MRWU

Unnamed: 0,UserId,GarageId,UserType
0,EXL_1569429819465,TX4276,Owner
1,EXL_1569431062769,IP0146,Owner
2,EXL_1569431134633,AF0708,Employee
3,EXL_1569431146924,TX4276,Employee
4,EXL_1569432034271,TX4276,Employee
...,...,...,...
4714,EXL_1716585305340,TX6300,Owner
4715,EXL_1716824732478,RS5281,Owner
4716,EXL_1717078601618,QY6868,Owner
4717,EXL_1717153057699,AN6271,Owner


In [333]:
# For every garage that has an Owner row, verify there is exactly one such row.
ownerRowsPerGarage = MRWU.loc[MRWU['UserType'] == "Owner"].groupby("GarageId").count()
ownerRowsPerGarage.eq(1).all()

UserId      True
UserType    True
dtype: bool

In [334]:
# Map each garage to the UserId of its owner.
GarageIdToOwner = MRWU.loc[MRWU['UserType'] == "Owner"].set_index("GarageId")['UserId']

# Attach the owner's id to every user of the garage as "UserIdOwner".
# (The name MRUW is kept as-is because later cells refer to it.)
MRUW = MRWU.set_index("GarageId").join(GarageIdToOwner, rsuffix="Owner").reset_index()

# Check: an owner must be mapped to themselves. The previous version compared
# UserId with itself, which is always true, and its result was never displayed.
ownersJoined = MRUW[MRUW['UserType'] == "Owner"]
assert (ownersJoined["UserId"] == ownersJoined["UserIdOwner"]).all(), \
    "Owner rows must have UserIdOwner equal to their own UserId"
MRUW

Unnamed: 0,GarageId,UserId,UserType,UserIdOwner
0,TX4276,EXL_1569429819465,Owner,EXL_1569429819465
1,IP0146,EXL_1569431062769,Owner,EXL_1569431062769
2,AF0708,EXL_1569431134633,Employee,EXL_1569921668308
3,TX4276,EXL_1569431146924,Employee,EXL_1569429819465
4,TX4276,EXL_1569432034271,Employee,EXL_1569429819465
...,...,...,...,...
4714,TX6300,EXL_1716585305340,Owner,EXL_1716585305340
4715,RS5281,EXL_1716824732478,Owner,EXL_1716824732478
4716,QY6868,EXL_1717078601618,Owner,EXL_1717078601618
4717,AN6271,EXL_1717153057699,Owner,EXL_1717153057699


In [335]:
# Attach the garage profile of each user's owner, matching UserIdOwner to OwnerId.
usersByOwner = MRUW.set_index("UserIdOwner")
profilesByOwner = MRWGP.set_index("OwnerId")
MRWUserWithGarage = (
    usersByOwner
    .join(profilesByOwner)
    .reset_index()
    .drop(columns=["Distributor"])  # the profile's Distributor column is not carried over
)


In [336]:
# Enrich each transaction row with the SKU sheet's columns, matched on ProductSKU.
skuByCode = MRWSKU.set_index("ProductSKU")
MRWT = (
    MRWT
    .set_index("ProductSKU")
    .join(skuByCode)
    .reset_index()
)
MRWT


Unnamed: 0,ProductSKU,Distributor,ProductName,ScanDate,UserId,Count,Points,Volume (L)
0,Blue - Large 4L - 5L,A,Blue Large,2023-01-04,EXL_1573744836492,2,15,5
1,Blue - Large 4L - 5L,A,Blue Large,2023-01-17,EXL_1573744836492,1,15,5
2,Blue - Large 4L - 5L,A,Blue Large,2023-03-01,EXL_1573744836492,1,15,5
3,Blue - Large 4L - 5L,A,Blue Large,2023-05-03,EXL_1619694294901,1,15,5
4,Blue - Large 4L - 5L,A,Blue Large,2023-06-15,EXL_1592832015021,1,15,5
...,...,...,...,...,...,...,...,...
36010,Silver - Large 4L,D,Silver Large,2023-12-26,EXL_1636202798697,4,50,4
36011,Silver - Large 4L,D,Silver Large,2023-12-26,EXL_1691757071956,1,50,4
36012,Silver - Large 4L,D,Silver Large,2023-12-27,EXL_1691757071956,2,50,4
36013,Silver - Large 4L,D,Silver Large,2023-12-28,EXL_1691757071956,1,50,4


In [337]:
# Enrich each transaction row with the scanning user's garage details, matched on UserId.
garageByUser = MRWUserWithGarage.set_index("UserId")
MRWT = (
    MRWT
    .set_index("UserId")
    .join(garageByUser)
    .reset_index()
)

In [338]:
# Display the transaction table after the SKU and user/garage joins.
MRWT

Unnamed: 0,UserId,ProductSKU,Distributor,ProductName,ScanDate,Count,Points,Volume (L),UserIdOwner,GarageId,UserType,Postcode,Province,District
0,EXL_1573744836492,Blue - Large 4L - 5L,A,Blue Large,2023-01-04,2,15,5,EXL_1573744836492,BC0891,Owner,10260,กรุงเทพมหานคร,เขตบางนา
1,EXL_1573744836492,Blue - Large 4L - 5L,A,Blue Large,2023-01-17,1,15,5,EXL_1573744836492,BC0891,Owner,10260,กรุงเทพมหานคร,เขตบางนา
2,EXL_1573744836492,Blue - Large 4L - 5L,A,Blue Large,2023-03-01,1,15,5,EXL_1573744836492,BC0891,Owner,10260,กรุงเทพมหานคร,เขตบางนา
3,EXL_1619694294901,Blue - Large 4L - 5L,A,Blue Large,2023-05-03,1,15,5,EXL_1619694294901,WO0982,Owner,10510,กรุงเทพมหานคร,เขตคลองสามวา
4,EXL_1592832015021,Blue - Large 4L - 5L,A,Blue Large,2023-06-15,1,15,5,EXL_1592832015021,JE4271,Owner,10540,สมุทรปราการ,เขตบางพลี
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36010,EXL_1636202798697,Silver - Large 4L,D,Silver Large,2023-12-26,4,50,4,EXL_1636202798697,QS3164,Owner,90110,สงขลา,เขตหาดใหญ่
36011,EXL_1691757071956,Silver - Large 4L,D,Silver Large,2023-12-26,1,50,4,EXL_1691757071956,IB0726,Owner,93110,พัทลุง,เขตควนขนุน
36012,EXL_1691757071956,Silver - Large 4L,D,Silver Large,2023-12-27,2,50,4,EXL_1691757071956,IB0726,Owner,93110,พัทลุง,เขตควนขนุน
36013,EXL_1691757071956,Silver - Large 4L,D,Silver Large,2023-12-28,1,50,4,EXL_1691757071956,IB0726,Owner,93110,พัทลุง,เขตควนขนุน


In [339]:
# Persist the joined table. With to_csv's defaults the row index is written as well,
# as a leading column without a header.
MRWT.to_csv(path_or_buf="../data/MRWCleaned.csv")