In [None]:
# Created by Viswadeep Sarangi
# Last updated: 14 August 2020

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the read-only Kaggle
# input directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
import torch

Loading up the csv file

In [None]:
# Load the border-crossing entry data from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/us-border-crossing-data/Border_Crossing_Entry_Data.csv')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

Splitting the dataframe into train and test

## **Preprocessing**

http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html

In order to pass this data into scikit-learn we need to encode the Names to integers. To do this we’ll write another function and return the modified data frame as well as a list of the target (class) names:

In [None]:
def encode_target(df, target_column, drop_original_column=False):
    """Add a "Target" column to df holding integer codes for target_column.

    Each distinct value of target_column gets an integer code in order of
    first appearance (0, 1, 2, ...). The input frame is left untouched; a
    modified copy is returned.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column to map to int, producing
                     new Target column.
    drop_original_column -- if True, remove target_column from the
                            returned frame (a message is printed).

    Returns
    -------
    df_mod -- modified copy of the DataFrame.
    targets -- array of distinct target names; targets[code] is the
               name that was encoded as code.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # Series.map performs a direct per-value lookup and yields an integer
    # column. Series.replace with a dict is much slower when there are many
    # categories and relies on pandas implicitly downcasting the object
    # result to integers, which newer pandas versions deprecate.
    df_mod["Target"] = df_mod[target_column].map(map_to_int)

    if drop_original_column:
        df_mod = df_mod.drop(columns=[target_column])
        print("Dropped Column: {}".format(target_column))

    return (df_mod, targets)

In [None]:
def encode_and_replace_column(df, target_column):
    """Replace the values of target_column with integer codes.

    Each distinct value of target_column gets an integer code in order of
    first appearance (0, 1, 2, ...) and the column is overwritten with those
    codes in a copy of df. No extra column is added and the input frame is
    left untouched.

    Args
    ----
    df -- pandas DataFrame.
    target_column -- column whose values are mapped to int in place
                     (within the returned copy).

    Returns
    -------
    df_mod -- modified copy of the DataFrame.
    targets -- array of the distinct original values; targets[code] is
               the value that was encoded as code.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # Series.map performs a direct per-value lookup and yields an integer
    # column. Series.replace with a dict is much slower when there are many
    # categories and relies on pandas implicitly downcasting the object
    # result to integers, which newer pandas versions deprecate.
    df_mod[target_column] = df_mod[target_column].map(map_to_int)

    return (df_mod, targets)

In [None]:
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree in DOT format to "dt.dot" in the current directory and
    then invokes the graphviz `dot` executable to render it to "dt.png".

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    RuntimeError -- if `dot` cannot be started or exits with a non-zero
                    status. The underlying error is chained as the cause.
    """
    # Imported locally because the visible notebook cells never import
    # subprocess; without it the call below raised NameError, which the old
    # bare `except` misreported as a graphviz failure.
    import subprocess

    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as err:
        # Raise instead of calling exit(): inside a notebook exit() asks the
        # kernel to shut down, discarding all state, and hides the real cause.
        raise RuntimeError("Could not run dot, ie graphviz, to "
                           "produce visualization") from err

## 1. Objective: To use an ID3 decision tree from sklearn to classify "State"

In [None]:
# Swap the categorical state names for integer class labels ("Target" column)
df_state, targets_state = encode_target(df, "State", drop_original_column=True)

# Integer-encode the remaining categorical columns.
# "Date" is only encoded as an opaque category for now.
# TODO. Convert the dates into proper DateTime formats
for categorical_column in ("Border", "Measure", "Date"):
    df_state, _ = encode_and_replace_column(df_state, categorical_column)

# Don't really need the "Port Name" column, "Port Code" is sufficient
df_state = df_state.drop(columns=["Port Name"])

In [None]:
# Compare the raw frame with the encoded one.
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; print(df.info()) would emit an extra "None" line after each summary.
print("@@@ Before")
df.info()
print(df.head())
print("\n@@@ After")
df_state.info()
print(df_state.head())

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so that the
# reported accuracy is reproducible across "Restart & Run All"; 99 mirrors the
# seed passed to the DecisionTreeClassifier further down.
train, test = train_test_split(df_state, test_size=0.2, random_state=99)

# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than through print() (which would add a stray "None").
train.info()
print()
test.info()

In [None]:
# Inspect both partitions and the code -> state-name lookup array.
# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than through print() (which would add a stray "None").
train.info()
print()
test.info()
print()
print(targets_state)
print()
print(train.head())
print()
print(test.head())

In [None]:
# The first five columns of the encoded frame are used as model inputs.
features = df_state.columns[:5].tolist()
print("* features:", features, sep="\n")

We can now fit the Decision Tree to classify the "State"

In [None]:
# Training inputs (selected feature columns) and integer class labels.
X, y = train[features], train["Target"]

# NOTE(review): scikit-learn documents its trees as an optimized CART
# implementation (default criterion="gini"), not ID3 -- confirm this matches
# the stated objective.
dt = DecisionTreeClassifier(random_state=99, min_samples_split=20)
dt.fit(X, y)

In [None]:
# Export the fitted tree to "dt.dot" and render it to "dt.png" via graphviz.
visualize_tree(dt, features)

Now that the Decision Tree is trained, time to test it

In [None]:
# Held-out labels (b) and inputs (A); b_np is the label column as a NumPy
# array so it can be indexed by position later on.
b = test["Target"]
A = test[features]
b_np = b.to_numpy()

print(b)
print()
print(b_np)
print()
# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than through print() (which would add a stray "None").
A.info()

In [None]:
# Mean accuracy of the fitted tree on the held-out rows, shown as a percentage.
accuracy = dt.score(A, b)
print(f"Mean Accuracy: {accuracy*100}%")

In [None]:
predictions = dt.predict(A)

# Decode the integer class codes back to state names with one vectorised
# lookup instead of indexing element by element inside the loop.
state_names = np.asarray(targets_state)
predicted_names = state_names[predictions]
actual_names = state_names[b_np]

# Printing one line per test row floods the cell output, so only a preview
# is shown; raise PREVIEW_ROWS to inspect more rows.
PREVIEW_ROWS = 20
for predicted, actual in zip(predicted_names[:PREVIEW_ROWS], actual_names[:PREVIEW_ROWS]):
    print("Predicted: {}, Actual:{}".format(predicted, actual))
print("... showing {} of {} test rows".format(min(PREVIEW_ROWS, len(predictions)), len(predictions)))

# 2. Objective : Create a Feed Forward Neural Network in PyTorch to classify State

In [None]:
# Tensor element type and compute device for the PyTorch section.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes for the network below.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension


### Random Test

In [None]:
# Create random Tensors to hold inputs and outputs.
# They are created with the configured dtype/device so that switching
# `device` to the GPU in the configuration cell actually takes effect;
# previously those settings were defined but never applied.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Use the nn package to define our model and loss function.
# The model's parameters are moved to the same device/dtype as the data.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device=device, dtype=dtype)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute the loss; report it every 100 iterations.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()