In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set_style('darkgrid')

# Scientific Calculation
from scipy import stats

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the passenger statistics CSV; the path is relative to the working directory.
csv_path = './Air_Traffic_Passenger_Statistics.csv'
raw_data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
raw_data.head()

In [None]:
# (rows, columns) of the raw frame.
raw_data.shape

In [None]:
# Per-column dtype and non-null count summary of the raw frame.
raw_data.info()

# Data Preprocessing

In [None]:
# Work on an independent copy so `raw_data` stays untouched (copy() is deep by default).
data = raw_data.copy()

In [None]:
# Show the column labels of the working copy.
data.columns

In [None]:
# Swap every space in the column labels for an underscore.
data.columns = [label.replace(' ', '_') for label in data.columns]

In [None]:
# Frequency of each distinct value in the GEO_Region column.
data['GEO_Region'].value_counts()

In [None]:
# Labels of the object-dtype columns (reused further down for ordinal encoding).
objcols = data.select_dtypes(include=['object']).columns

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Missing-value count per column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna()

In [None]:
# Re-list the column labels (now underscore-separated) to pick the plot columns from.
data.columns

In [None]:
# Columns drawn as count plots in the next cell.
cols = [
    'GEO_Summary',
    'GEO_Region',
    'Activity_Type_Code',
    'Price_Category_Code',
    'Terminal',
    'Boarding_Area',
    'Adjusted_Activity_Type_Code',
    'Year',
    'Month',
]

In [None]:
# One count plot per column in `cols`, laid out on a 3x3 grid.
fig, grid = plt.subplots(nrows=3, ncols=3, figsize=(20, 10))
axes = grid.flat

for ax, column in zip(grid.ravel(), cols):
    sns.countplot(data=data, x=column, ax=ax)
    ax.set_xlabel('')  # the panel title already names the column
    ax.set_title(column)

fig.suptitle('Distribution of Categorical columns', size=20)
fig.tight_layout()


In [None]:
# Share of records per GEO_Summary category.
# Sector sizes have to be numeric, so the rows of each category are counted
# explicitly instead of passing the label column itself as `values`.
geo_counts = data['GEO_Summary'].value_counts()
fig = px.pie(
    names=geo_counts.index,
    values=geo_counts.values,
    title='Share of records by GEO_Summary',
)
fig.show()


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace every object-dtype column with the ordinal codes produced by the encoder.
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(data[objcols].to_numpy())
data[objcols] = encoded_values

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(12, 8))
correlations = data.corr()
sns.heatmap(correlations, cmap='viridis', annot=True)

In [None]:
# List the columns currently in the frame before some of them are dropped.
data.columns

In [None]:
# Columns removed from the working frame before clustering.
dropped_columns = [
    'Activity_Period',
    'Operating_Airline',
    'Operating_Airline_IATA_Code',
    'Published_Airline',
    'Published_Airline_IATA_Code',
    'Adjusted_Activity_Type_Code',
]
data = data.drop(columns=dropped_columns)

In [None]:
from sklearn.cluster import KMeans

# Elbow method: record the within-cluster sum of squares (inertia) for k = 1..6.
# A 'cluster' column left over from a previous run of the labelling cell is
# excluded so that re-running this cell does not feed old labels back in.
elbow_features = data.drop(columns='cluster', errors='ignore')

wcss = []
k_range = range(1, 7)
for k in k_range:
    # A fixed seed and an explicit n_init keep the curve reproducible across
    # runs and across scikit-learn versions (whose n_init default has changed).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(elbow_features)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve: within-cluster sum of squares against the number of clusters
plt.plot(k_range, wcss, marker='*', linestyle='--', color='red')
plt.ylabel('Within-cluster sum of squares')
plt.xlabel('Number of clusters')
plt.show()

In [None]:
# Fit the final KMeans model with 3 clusters.
# NOTE(review): the features are fitted as they are; no scaling step is visible
# in this notebook, so large-valued columns will dominate the distances.
# Any 'cluster' column from an earlier run is left out, so re-running this cell
# does not cluster on its own previous labels.
cluster_features = data.drop(columns='cluster', errors='ignore')

# A fixed seed and an explicit n_init make the assignment reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(cluster_features)

# Cluster index assigned to every row
labels = kmeans.labels_

# Attach the labels to the working dataframe
data['cluster'] = labels

# Show the first 5 rows, now including the cluster label
data.head()

In [None]:
# Evaluate the model
from sklearn import metrics

# Score the clustering on the feature columns only. `data` already carries the
# 'cluster' column at this point; leaving it in would let the labels influence
# the very distances they are being judged by and inflate the score.
silhouette_features = data.drop(columns='cluster', errors='ignore')
silhouette_score = metrics.silhouette_score(silhouette_features, labels, metric='euclidean')
print("Silhouette Score: ", silhouette_score)

In [None]:
# Separate the features (x) from the target (y) by column position.
# NOTE(review): the feature slice starts at position 1, so the first column of
# `data` is not part of x - confirm that this is intended.
first_feature_pos, target_pos = 1, -1
x = data.iloc[:, first_feature_pos:target_pos]
y = data.iloc[:, target_pos]

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.20, stratify=y, random_state=2)
xtrain, xtest, ytrain, ytest = split_parts

In [None]:
# Report the shape of each piece of the split.
for split_name, split_part in (
    ('xtrain', xtrain),
    ('xtest', xtest),
    ('ytrain', ytrain),
    ('ytest', ytest),
):
    print(f'{split_name} shape: ', split_part.shape)

In [None]:
# Fit a default-parameter k-nearest-neighbours classifier, then predict the held-out rows
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(xtrain, ytrain)
ypred = knn.predict(xtest)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Accuracy, confusion matrix and per-class report for the test predictions.
ac, cm, cr = (
    scorer(ytest, ypred)
    for scorer in (accuracy_score, confusion_matrix, classification_report)
)
summary = f'Accuracy:{ac}\n {cm} \n{cr}'
print(summary)