In [69]:
# importing all required libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling
import os
import csv 

In [62]:
# Read CSV File from Data Folder
filename = "cfs_pumf.txt"

try:
    dataset = pd.read_csv(f'{os.getcwd()}\\data\\{filename}')      
except IOError:
    print(f'Could not read file {filename}')

In [63]:
# Analyze the datatype of all the variables and convert to appropriate one
dataset.dtypes.to_dict()

{'SHIPMT_ID': dtype('int64'),
 'ORIG_STATE': dtype('int64'),
 'ORIG_MA': dtype('int64'),
 'ORIG_CFS_AREA': dtype('O'),
 'DEST_STATE': dtype('int64'),
 'DEST_MA': dtype('int64'),
 'DEST_CFS_AREA': dtype('O'),
 'NAICS': dtype('int64'),
 'QUARTER': dtype('int64'),
 'SCTG': dtype('O'),
 'MODE': dtype('int64'),
 'SHIPMT_VALUE': dtype('int64'),
 'SHIPMT_WGHT': dtype('int64'),
 'SHIPMT_DIST_GC': dtype('int64'),
 'SHIPMT_DIST_ROUTED': dtype('int64'),
 'TEMP_CNTL_YN': dtype('O'),
 'EXPORT_YN': dtype('O'),
 'EXPORT_CNTRY': dtype('O'),
 'HAZMAT': dtype('O'),
 'WGT_FACTOR': dtype('float64')}

In [49]:
#SCTG should be of type int64 but found as Object. Lets see why
dataset.groupby('SCTG')['SHIPMT_ID'].nunique()

#After the analysis, We found that SCTG also contains SCTG_Group values. We need to noramilize them

SCTG
00          691
01         2239
01-05      1458
02        24965
03        54265
04        45617
05        62355
06        53265
06-09      1267
07       183021
08        98947
09        16864
10         5883
10-14      1437
11        29142
12        94363
13        16795
14         4211
15        11002
15-19       856
17        38588
18        60218
19       133849
20        96474
20-24      1888
21        85890
22        24300
23       141016
24       288078
25         4966
25-30      1854
26       161961
27        61999
28        95548
29       200529
30       213796
31       178753
31-34      1473
32       187159
33       237139
34       265539
35       318586
35-38       919
36       183009
37        46078
38       132035
39        86920
39-99      1190
40       264089
41        41610
43       283551
99           14
Name: SHIPMT_ID, dtype: int64

In [64]:
# As per the data dictonary, EXPORT_CNTRY should have only tree values, C = Canada, M = Mexico, O = Other, but found one
# more category. Need to analyze it.
dataset.groupby('EXPORT_CNTRY')['SHIPMT_ID'].nunique()

EXPORT_CNTRY
C      54539
M      19367
N    4361940
O     111815
Name: SHIPMT_ID, dtype: int64

In [67]:
#Convert datatype of below variables to appropriate one
dataset.ORIG_STATE = dataset.ORIG_STATE.astype(str)
dataset.ORIG_MA = dataset.ORIG_MA.astype(str)
dataset.DEST_STATE = dataset.DEST_STATE.astype(str)
dataset.DEST_MA = dataset.DEST_MA.astype(str)
dataset.NAICS = dataset.NAICS.astype(str)
dataset.MODE = dataset.MODE.astype(str)

In [68]:
# Analyze the datatype of all the variables and convert to appropriate one
dataset.dtypes.to_dict()

{'SHIPMT_ID': dtype('int64'),
 'ORIG_STATE': dtype('O'),
 'ORIG_MA': dtype('O'),
 'ORIG_CFS_AREA': dtype('O'),
 'DEST_STATE': dtype('O'),
 'DEST_MA': dtype('O'),
 'DEST_CFS_AREA': dtype('O'),
 'NAICS': dtype('O'),
 'QUARTER': dtype('int64'),
 'SCTG': dtype('O'),
 'MODE': dtype('O'),
 'SHIPMT_VALUE': dtype('int64'),
 'SHIPMT_WGHT': dtype('int64'),
 'SHIPMT_DIST_GC': dtype('int64'),
 'SHIPMT_DIST_ROUTED': dtype('int64'),
 'TEMP_CNTL_YN': dtype('O'),
 'EXPORT_YN': dtype('O'),
 'EXPORT_CNTRY': dtype('O'),
 'HAZMAT': dtype('O'),
 'WGT_FACTOR': dtype('float64')}

In [73]:
np.set_printoptions(precision=3)
print(dataset.describe())

          SHIPMT_ID       QUARTER  SHIPMT_VALUE   SHIPMT_WGHT  SHIPMT_DIST_GC  \
count  4.547661e+06  4.547661e+06  4.547661e+06  4.547661e+06    4.547661e+06   
mean   2.273831e+06  2.445707e+00  1.827960e+04  3.758741e+04    3.860863e+02   
std    1.312797e+06  1.107542e+00  1.085257e+06  9.657841e+05    5.746514e+02   
min    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00    1.000000e+00   
25%    1.136916e+06  1.000000e+00  1.760000e+02  1.500000e+01    1.700000e+01   
50%    2.273831e+06  2.000000e+00  8.750000e+02  2.700000e+02    1.100000e+02   
75%    3.410746e+06  3.000000e+00  6.026000e+03  8.077000e+03    5.380000e+02   
max    4.547661e+06  4.000000e+00  5.212778e+08  2.770296e+08    5.210000e+03   

       SHIPMT_DIST_ROUTED    WGT_FACTOR  
count        4.547661e+06  4.547661e+06  
mean         4.749791e+02  2.311014e+03  
std          6.869016e+02  1.934517e+04  
min          1.000000e+00  2.000000e-01  
25%          2.100000e+01  9.300000e+01  
50%          1.400

In [None]:
def splitDataset(dataset, splitRatio):    
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while(len(trainSet) < trainSize):
        index = random.randrage(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]