# Comtrade Exploratory Analysis

### Import the required libraries and fetch the data

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import comtradeapicall
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import TargetEncoder

Fetching the data

In [29]:
# Fetch a preview of UN Comtrade final data for one reporter, one commodity
# code and one month. The per-parameter notes below are inferred from the
# argument names and the columns seen in the result — TODO confirm against
# the comtradeapicall documentation.
df = comtradeapicall.previewFinalData(
    typeCode='C',             # presumably "commodities" (as opposed to services)
    freqCode='M',             # presumably monthly frequency
    clCode='HS',              # presumably the Harmonized System classification
    period='202201',          # YYYYMM; the result reports refYear 2022 / refMonth 1
    reporterCode='36',        # reporting country code (appears as reporterCode in the result)
    cmdCode='91',             # commodity code (2-digit; the result reports aggrLevel 2)
    flowCode=None,            # None: presumably no filter on trade flow
    partnerCode=None,         # None: presumably all partner countries
    partner2Code=None,
    customsCode=None,
    motCode=None,
    maxRecords=500,           # upper bound on the number of rows requested
    format_output='JSON',
    aggregateBy=None,
    breakdownMode='classic',
    countOnly=None,
    includeDesc=True          # presumably adds the *Desc text columns next to the codes
)

# Fail fast: every later cell assumes `df` is a populated DataFrame.
# NOTE(review): the client appears to return None when the request fails
# (e.g. network or API error) — confirm against the library source.
if df is None or df.empty:
    raise RuntimeError(
        'Comtrade preview request returned no data; '
        'check the query parameters and network access.'
    )


## Previewing the data and its description

In [88]:
# Summary statistics for the numeric columns. Left as the cell's last
# expression (instead of print) so the notebook renders it with its rich
# DataFrame display rather than a wrapped plain-text dump.
df.describe()

       refPeriodId  refYear  refMonth  reporterCode  partnerCode  \
count         87.0     87.0      87.0          87.0    87.000000   
mean    20220101.0   2022.0       1.0          36.0   447.183908   
std            0.0      0.0       0.0           0.0   255.446783   
min     20220101.0   2022.0       1.0          36.0     0.000000   
25%     20220101.0   2022.0       1.0          36.0   246.500000   
50%     20220101.0   2022.0       1.0          36.0   458.000000   
75%     20220101.0   2022.0       1.0          36.0   700.500000   
max     20220101.0   2022.0       1.0          36.0   842.000000   

       partner2Code  aggrLevel  motCode  qtyUnitCode   qty  altQtyUnitCode  \
count          87.0       87.0     87.0         87.0  87.0            87.0   
mean            0.0        2.0      0.0         -1.0   0.0            -1.0   
std             0.0        0.0      0.0          0.0   0.0             0.0   
min             0.0        2.0      0.0         -1.0   0.0            -1.0 

In [89]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 47 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   typeCode                  87 non-null     object 
 1   freqCode                  87 non-null     object 
 2   refPeriodId               87 non-null     int64  
 3   refYear                   87 non-null     int64  
 4   refMonth                  87 non-null     int64  
 5   period                    87 non-null     object 
 6   reporterCode              87 non-null     int64  
 7   reporterISO               87 non-null     object 
 8   reporterDesc              87 non-null     object 
 9   flowCode                  87 non-null     object 
 10  flowDesc                  87 non-null     object 
 11  partnerCode               87 non-null     int64  
 12  partnerISO                87 non-null     object 
 13  partnerDesc               87 non-null     object 
 14  partner2Code

In [90]:
# Materialise the column labels as a plain list and print one per line so the
# full set of fields in the frame can be scanned at a glance.
columns = list(df.columns)
for column_name in columns:
    print(f'Column {column_name}')

Column typeCode
Column freqCode
Column refPeriodId
Column refYear
Column refMonth
Column period
Column reporterCode
Column reporterISO
Column reporterDesc
Column flowCode
Column flowDesc
Column partnerCode
Column partnerISO
Column partnerDesc
Column partner2Code
Column partner2ISO
Column partner2Desc
Column classificationCode
Column classificationSearchCode
Column isOriginalClassification
Column cmdCode
Column cmdDesc
Column aggrLevel
Column isLeaf
Column customsCode
Column customsDesc
Column mosCode
Column motCode
Column motDesc
Column qtyUnitCode
Column qtyUnitAbbr
Column qty
Column isQtyEstimated
Column altQtyUnitCode
Column altQtyUnitAbbr
Column altQty
Column isAltQtyEstimated
Column netWgt
Column isNetWgtEstimated
Column grossWgt
Column isGrossWgtEstimated
Column cifvalue
Column fobvalue
Column primaryValue
Column legacyEstimationFlag
Column isReported
Column isAggregate


## Data types

In [67]:
# Bucket the columns of `df` by dtype and cardinality:
#   * categorical - columns stored with the 'object' dtype
#   * continuous  - 'float64' columns with more than 10 distinct values
#   * discrete    - 'int64' columns with more than 10 distinct values
# Columns matching none of the rules (booleans, low-cardinality numerics)
# are deliberately left out of all three lists.
categorical_features = [
    name for name in df.columns
    if df[name].dtype == 'object'
]

continuos_features = [
    name for name in df.columns
    if df[name].dtype == 'float64' and df[name].unique().size > 10
]

discrete_features = [
    name for name in df.columns
    if df[name].dtype == 'int64' and df[name].unique().size > 10
]

Categorical Features

In [68]:
# Show every column that was bucketed as categorical, one per line.
for categorical_name in categorical_features:
    print(f"Cateorical Feature: {categorical_name}")

Cateorical Feature: typeCode
Cateorical Feature: freqCode
Cateorical Feature: period
Cateorical Feature: reporterISO
Cateorical Feature: reporterDesc
Cateorical Feature: flowCode
Cateorical Feature: flowDesc
Cateorical Feature: partnerISO
Cateorical Feature: partnerDesc
Cateorical Feature: partner2ISO
Cateorical Feature: partner2Desc
Cateorical Feature: classificationCode
Cateorical Feature: classificationSearchCode
Cateorical Feature: cmdCode
Cateorical Feature: cmdDesc
Cateorical Feature: customsCode
Cateorical Feature: customsDesc
Cateorical Feature: mosCode
Cateorical Feature: motDesc
Cateorical Feature: qtyUnitAbbr
Cateorical Feature: altQtyUnitAbbr


Continuous Features

In [69]:
# Show every column that was bucketed as continuous, one per line.
for continuous_name in continuos_features:
    print(f"Continuos Feature: {continuous_name}")

Continuos Feature: cifvalue
Continuos Feature: fobvalue
Continuos Feature: primaryValue


In [70]:
# Show every column that was bucketed as discrete, one per line.
for discrete_name in discrete_features:
    print(f"Discrete Feature: {discrete_name}")

Discrete Feature: partnerCode


In [84]:
# Columns that fell through all three buckets: start from the full column
# index and successively remove the categorical, discrete and continuous names.
other_features = (
    df.columns
    .difference(categorical_features)
    .difference(discrete_features)
    .difference(continuos_features)
)

# List the leftovers so they can be reviewed by hand.
for leftover in other_features:
    print(f'Unclassified Feature: {leftover}')


Unclassified Feature: aggrLevel
Unclassified Feature: altQty
Unclassified Feature: altQtyUnitCode
Unclassified Feature: grossWgt
Unclassified Feature: isAggregate
Unclassified Feature: isAltQtyEstimated
Unclassified Feature: isGrossWgtEstimated
Unclassified Feature: isLeaf
Unclassified Feature: isNetWgtEstimated
Unclassified Feature: isOriginalClassification
Unclassified Feature: isQtyEstimated
Unclassified Feature: isReported
Unclassified Feature: legacyEstimationFlag
Unclassified Feature: motCode
Unclassified Feature: netWgt
Unclassified Feature: partner2Code
Unclassified Feature: qty
Unclassified Feature: qtyUnitCode
Unclassified Feature: refMonth
Unclassified Feature: refPeriodId
Unclassified Feature: refYear
Unclassified Feature: reporterCode


### Insights:
1. The unclassified features are problematic: some carry no real information (e.g. every value is zero), some are merely identifier codes, and the rest are booleans.

2. The quantity is not provided in this dataset (`qty` is 0 for every row), so we need to resolve this before we can define a label (the `y` feature).

## Visualization