# National Cluster model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering 

In [2]:
# Load the 2017 commodity shipment CSV into the DataFrame used by the cells below.
# NOTE(review): this is an absolute, machine-specific path; it only resolves on
# the original author's computer.
commodity_csv = r"C:\Users\jesss\OneDrive\Documents\GitHub\ITCS3162\datasets\comodity2017.csv"
table1 = pd.read_csv(commodity_csv)

In [3]:
# Export clustering sweep: encode the two categorical export columns, scale the
# four selected features, fit K-means for k = 2..7, and print the inertia and a
# sampled silhouette score for every k.

# Overwrite each categorical column in table1 with its integer category code.
for column in ("EXPORT_YN", "EXPORT_CNTRY"):
    table1[column] = table1[column].astype("category").cat.codes

# Columns fed to the clustering: the two encoded export columns plus the
# shipment value and shipment weight columns.
feature_columns = ["EXPORT_YN", "EXPORT_CNTRY", "SHIPMT_VALUE", "SHIPMT_WGHT"]
export_analysis = table1[feature_columns]

# Standardize the selected features with StandardScaler before clustering.
scaler = StandardScaler()
scaled = scaler.fit_transform(export_analysis)

# The silhouette score is computed on a fixed-seed sample of 5000 rows rather
# than on every row.
silhouette_options = {"sample_size": 5000, "random_state": 42}

# NOTE(review): n_init is not passed to KMeans, so the number of restarts is
# whatever the installed scikit-learn version defaults to -- consider pinning
# it if the table below must be reproducible across versions.
results = []
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42).fit(scaled)
    labels = km.labels_
    inertia = km.inertia_
    silhouette = silhouette_score(scaled, labels, **silhouette_options)

    # One row per k: [k, within-cluster sum of squares, silhouette score].
    results.append([k, inertia, silhouette])

# Report the sweep as a plain-text table, one line per tested k.
print("k | Inertia (WSS) | Silhouette Score")
for row in results:
    k, inertia, sil = row
    print(f"{k} | {inertia:.2f} | {sil:.4f}")

k | Inertia (WSS) | Silhouette Score
2 | 2315921.54 | 0.9808
3 | 1462119.95 | 0.9808
4 | 985947.97 | 0.9813
5 | 854803.39 | 0.9815
6 | 539913.61 | 0.9824
7 | 375262.19 | 0.9824


To describe my data mining process for this clustering model, I will first describe what the model is. This is a comparative K-means clustering model that tests several values of k. For each tested k value, it calculates the inertia and the silhouette score to evaluate cluster quality. I made this model to discover patterns in export shipments and to determine how shipments group together based on export status, destination country, value, and weight. To build the model, I first converted EXPORT_YN and EXPORT_CNTRY into numerical category codes, and then I made a table with the factors that are important for analyzing exports. After that, I scaled the selected features and compared multiple k values. Lastly, I displayed the results for each k value in a table so they could be compared.