In [36]:
import pandas as pd
import pandas_profiling
import torch
import numpy as np
import os 
import stat
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import sys

In [13]:
logit=pd.read_csv("Logistics/train.csv")
logit.rename(columns={'송하인_격자공간고유번호':'shipper',
                     '수하인_격자공간고유번호':'recipient',
                     '물품_카테고리':'category',
                     '운송장_건수':'freq'},inplace=True)

# EDA

## First Look

* There are total 31684 cases of delivery reported.
* There are total 4229 independent shippers and 26875 independent recipient.
* On average one shippers made 7~8 deliveries.

In [89]:
len(logit)

31684

In [28]:
logit['shipper'].value_counts()

5011000078068400    3882
5013000610049100    1790
5013000731055200    1235
5011000137030100    1137
5013000821028200     731
                    ... 
4511300030055300       1
4719000321023400       1
4423000423020300       1
4711300748080100       1
2811000139076100       1
Name: shipper, Length: 4229, dtype: int64

In [88]:
logit['recipient'].value_counts()

5013000635005300    35
5011000543041100    33
5011000314069300    33
5011000544072300    30
5011000318044100    30
                    ..
5011000109040100     1
4183000550093100     1
4425000028078200     1
1147000011064300     1
5011000263065200     1
Name: recipient, Length: 26875, dtype: int64

In [30]:
logit['category'].value_counts()

농산물              20321
문화컨텐츠             1091
음료                1034
수산                 860
가공식품               846
                 ...  
유아가구                 4
기타출산/육아              4
태블릿PC/노트북액세서리        4
선케어                  4
스포츠잡화                4
Name: category, Length: 100, dtype: int64

In [42]:
logistics_profile=logit.profile_report()
logistics_profile.to_file('Logistics_profile.html')

## Expanding Dimensions.

Given data set has enough number of observations, but at current state, it would be impossible to make predictions as there is only one useful information, category variable.

Thus, expansion of dimension through data-processing would be necessary. 

Though shipper id is very long and does not seem to have much room for interpretation, there seems to be a certain pattern. Perhaps certain digits group shippers with common identity.

Hypothesis 1: First three digits form a group.
Hypothesis 2: First four digits form a group 

In [118]:
onetothree=[]
onetofour=[]
for i in range(len(logit)):
    onetothree.append(int(str(logit['shipper'][i])[0:3]))
    onetofour.append(int(str(logit['shipper'][i])[0:4]))

In [162]:
group1_counts=pd.DataFrame(pd.DataFrame(onetothree).value_counts())
group1_counts

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
501,23461
414,2083
415,1043
412,502
413,455
...,...
458,4
287,3
422,3
469,2


In [120]:
pd.DataFrame(onetofour).value_counts()

5011    13030
5013    10431
4148     1199
4141      472
4159      394
        ...  
4792        1
4275        1
4577        1
4182        1
4219        1
Length: 215, dtype: int64

In [156]:
logit['group1']=pd.DataFrame(onetothree)

Grouping with first three digits form much lesser number of group than with four digits. 
Creating size of 64 vector to represent first group would be very unpractical (Big computational power and bad interpretability).

Divide into groups by number of delieveries recorded.

Group 1: >=10000
Group 2: >=1000 & <10000
Group 3: >=100
Group 4: <100

In [196]:
idcount=pd.concat([group1_counts.index.to_frame(),group1_counts],axis=1,keys=['ID1','Counts'])

In [277]:
idcount.iloc[,:]

ID1     0      501
Counts  0    23461
Name: (501,), dtype: int64

In [249]:
count=0
for i in range(len(logit)):
    count=idcount['Counts'].iloc[[np.where(idcount['ID1']==logit['group1'][i])[0].item()],0].item()
    if count>=10000:
        logit['group1'][i]=0
    elif count>=1000 and count<10000:
        logit['group1'][i]=1
    elif count>=100 and count<1000:
        logit['group1'][i]=2
    else:
        logit['group1'][i]=3

ValueError: can only convert an array of size 1 to a Python scalar

In [278]:
onetothree=[]
onetofour=[]
for i in range(len(logit)):
    onetothree.append(int(str(logit['shipper'][i])[3:6]))
    onetofour.append(int(str(logit['shipper'][i])[3:7]))

In [279]:
group1_counts=pd.DataFrame(pd.DataFrame(onetothree).value_counts())

In [280]:
group1_counts

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
100,14153
300,11074
800,1373
700,822
900,657
0,635
600,550
500,547
610,283
400,271
