In [6]:
# Mount Google Drive under /content/drive so that files kept on Drive can be
# opened through ordinary filesystem paths.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install category_encoders



In [8]:
pip install scikit-learn-extra



In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
sns.set_style('darkgrid')

from category_encoders import OrdinalEncoder

import pickle

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN

In [10]:
# Training data lives on the mounted Drive; blanks following a delimiter are
# stripped while parsing.
TRAIN_CSV_PATH = "/content/drive/MyDrive/DMG Assignment 3/covtype_train.csv"
covtype_df = pd.read_csv(TRAIN_CSV_PATH, skipinitialspace=True)

### Preprocessing

In [11]:
# Number of missing entries in every column.
covtype_df.isna().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Hillshade_9am                         0
Hillshade_Noon                        0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Fire_Points    0
Soil_Type                             0
Wilderness                            0
target                                0
dtype: int64

In [12]:
# Percentage of rows that hold each column's most frequent value (its mode).
# A high share means the column is dominated by a single value.
columns = list(covtype_df)

total_count = covtype_df.shape[0]
skewed_percentages = {}
for column in columns:
  most_common_value = covtype_df[column].mode()[0]
  skewed_percentage = (covtype_df[column] == most_common_value).sum() * 100 / total_count
  # Keep every column's value; previously each one was overwritten by the next
  # iteration and never shown.
  skewed_percentages[column] = skewed_percentage

# Display the shares, most dominated column first.
pd.Series(skewed_percentages, name='mode_share_pct').sort_values(ascending=False)

In [13]:
# Columns removed from the feature set before clustering.
skewed_columns=['Hillshade_9am', 'Hillshade_Noon','Elevation']
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
covtype_df = covtype_df.drop(columns=skewed_columns, errors='ignore')
covtype_df.shape

(406708, 8)

In [14]:
# Levels of every string-valued column, listed in the order of their codes
# (position in the list == integer code).
ordered_levels = {
    'Aspect': ['aspect_low', 'aspect_medium', 'aspect_high', 'aspect_ultra'],
    'Slope': ['slope_low', 'slope_medium', 'slope_high', 'slope_ultra'],
    'Horizontal_Distance_To_Fire_Points': ['low', 'mid', 'high'],
}

# column name -> {level string -> integer code}
encoding_scheme = {
    name: {level: code for code, level in enumerate(levels)}
    for name, levels in ordered_levels.items()
}

# Columns that need encoding, in the same order as the scheme.
categorical_col = list(encoding_scheme)

In [15]:
# Replace the string levels of each categorical column by the integer codes
# defined in encoding_scheme.
for column in categorical_col:
  # Skip columns that already hold integer codes. Feeding the codes through the
  # string-keyed mapping again would not match any key, and category_encoders
  # substitutes its "unknown" code for unmatched values (-1 by default), so a
  # re-run of this cell would silently overwrite the column.
  if pd.api.types.is_integer_dtype(covtype_df[column]):
    continue
  encoder = OrdinalEncoder(cols = column, return_df = True, mapping=[{'col':column, 'mapping':encoding_scheme[column]}])
  covtype_df[column] = encoder.fit_transform(covtype_df[column])
  covtype_df[column] = covtype_df[column].astype(np.int64)

covtype_df

Unnamed: 0,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Fire_Points,Soil_Type,Wilderness,target
0,1,0,0,1,0,22,0,2
1,1,0,1,1,1,32,2,1
2,0,0,1,1,0,10,2,2
3,3,1,2,1,0,23,2,1
4,2,0,2,1,1,28,0,2
...,...,...,...,...,...,...,...,...
406703,1,0,1,1,1,39,0,7
406704,0,1,0,1,1,28,0,2
406705,1,0,0,1,1,28,0,2
406706,2,0,2,2,0,28,0,2


In [16]:
# Split the label column off the features: pop() removes 'target' from
# covtype_df in place and hands it back, to_frame() wraps it as a
# single-column DataFrame named 'target'.
target_df = covtype_df.pop('target').to_frame()

In [17]:
# Snapshot of the current column names of covtype_df, as a plain list.
saved_columns = covtype_df.columns.tolist()

In [18]:
# Project the features onto their first two principal components.
components = 2
# Fixed seed: depending on the scikit-learn version and the data shape, the
# default solver may be a randomized SVD, which would make re-runs differ.
pca = PCA(n_components = components, random_state=42)
pca_data = pca.fit_transform(covtype_df)
print('Total Variance Explained:', round(sum(list(pca.explained_variance_ratio_))*100, 2))

# Keep the fitted projection on disk so that the very same axes can be applied
# to new data later, instead of fitting a fresh PCA on that data.
with open("q2_pca.pkl", "wb") as pca_file:
  pickle.dump(pca, pca_file)

Total Variance Explained: 97.03


In [19]:
# Column names for the projected data: 'Feature 1', 'Feature 2', ... one per component.
pca_columns = ['Feature ' + str(index) for index in range(1, components + 1)]

In [20]:
# Wrap the projected array in a DataFrame whose columns carry the names built
# in pca_columns; the bare name on the last line displays the frame.
pca_covtype_df = pd.DataFrame(pca_data, columns=pca_columns)
pca_covtype_df

Unnamed: 0,Feature 1,Feature 2
0,1.342248,-0.923441
1,-8.614018,0.180692
2,13.374979,-0.746873
3,0.366850,2.190031
4,-4.700183,0.252533
...,...,...
406703,-15.670398,-0.682137
406704,-4.650416,-1.756897
406705,-4.657418,-0.996850
406706,-4.699261,0.484306


In [21]:
# Ground-truth labels ordered from the most frequent to the least frequent.
target_counts = target_df['target'].value_counts()
sorted_actual_labels = target_counts.sort_values(ascending=False).index.to_numpy()

In [22]:
def mapped_labels(labels, actual_labels=None):
  """Rename cluster ids to class labels by matching frequency ranks.

  The most frequent cluster id in `labels` is renamed to the first entry of
  `actual_labels`, the second most frequent to the second entry, and so on.
  Cluster ids beyond the length of `actual_labels` are left unchanged.

  Parameters
  ----------
  labels : array-like
      Cluster ids, one per sample.
  actual_labels : sequence, optional
      Class labels ordered from most to least frequent. When omitted, the
      notebook-level `sorted_actual_labels` is used, which is the previous
      behaviour of this function.

  Returns
  -------
  pandas.Series
      `labels` with every cluster id replaced by its matched class label.
  """
  if actual_labels is None:
    # Fall back to the ordering derived from the training targets.
    actual_labels = sorted_actual_labels

  predicted = pd.Series(labels)
  sorted_pred_labels = predicted.value_counts().sort_values(ascending=False).index.to_numpy()
  mapping_dict = dict(zip(sorted_pred_labels, actual_labels))

  return predicted.replace(mapping_dict)

In [23]:
# Gaussian mixture with seven components, each with its own full covariance
# matrix; the seed is fixed so that fitting is repeatable.
# NOTE(review): seven presumably matches the number of target classes -- confirm.
gmm = GaussianMixture(n_components=7, covariance_type='full', random_state=42)

In [24]:
# Fit the mixture on the PCA-projected data held in pca_covtype_df.
gmm.fit(pca_covtype_df)

GaussianMixture(n_components=7, random_state=42)

In [26]:
# save the model to disk
# The with-block closes the file (and flushes the pickle) even if dumping fails.
with open("q2.pkl", "wb") as model_file:
  pickle.dump(gmm, model_file)

### Question 2

In [27]:
'''DO NOT DELETE ANY PART OF CODE
We will run only the evaluation function.

Do not put anything outside of the functions, it will take time in evaluation.
You will have to create another code file to run the necessary code.
'''

# import statements

# other functions

def predict(test_set):
    '''
    Predict a label for every row of a CSV file.

    'test_set' is a csv path "test.csv", You need to read the csv and predict using your model.

    Steps: read the CSV, drop the columns that are not used as features,
    ordinal-encode the string-valued columns, project onto two principal
    components, assign every row to a component of the pickled Gaussian
    mixture ("q2.pkl") and rename the cluster ids with mapped_labels.

    Parameters
    ----------
    test_set : str
        Path of the CSV file to label.

    Returns
    -------
    list
        prediction is a 1D 'list' of output labels. just a single python list.
    '''
    # find and load your best model
    # Do all preprocessings inside this function only.
    # predict on the test set provided
    import os  # local import: only used to look for the optional PCA pickle

    # Read csv file. Blanks after delimiters are stripped exactly as in the
    # training load, otherwise the level strings would not match the mapping.
    test_df = pd.read_csv(test_set, skipinitialspace=True)

    # Droping some Columns
    test_df = test_df.drop(columns=['Hillshade_9am', 'Hillshade_Noon', 'Elevation'])
    # A labelled file (such as the training CSV) also carries 'target'; it is
    # not a feature and must not leak into the projection.
    test_df = test_df.drop(columns=['target'], errors='ignore')

    # converting string values to int
    encoding_scheme = {
        'Aspect': {'aspect_low': 0, 'aspect_medium': 1, 'aspect_high': 2, 'aspect_ultra': 3},
        'Slope': {'slope_low': 0, 'slope_medium': 1, 'slope_high': 2, 'slope_ultra': 3},
        'Horizontal_Distance_To_Fire_Points': {'low': 0, 'mid': 1, 'high': 2},
    }

    categorical_col = ['Aspect', 'Slope', 'Horizontal_Distance_To_Fire_Points']

    for column in categorical_col:
        encoder = OrdinalEncoder(cols=column, return_df=True,
                                 mapping=[{'col': column, 'mapping': encoding_scheme[column]}])
        test_df[column] = encoder.fit_transform(test_df[column])
        test_df[column] = test_df[column].astype(np.int64)

    # PCA Encoding
    components = 2
    pca_path = "q2_pca.pkl"
    if os.path.exists(pca_path):
        # Reuse a previously fitted projection when one has been pickled next
        # to the model: the mixture was fitted in that coordinate system, and a
        # PCA refitted on other data generally yields different axes.
        # (Pickles must only be loaded from trusted locations.)
        with open(pca_path, 'rb') as pca_file:
            pca = pickle.load(pca_file)
        pca_data = pca.transform(test_df)
    else:
        # NOTE(review): fallback keeps the earlier behaviour of fitting on the
        # test data itself; its axes are not guaranteed to match training.
        pca = PCA(n_components=components, random_state=42)
        pca_data = pca.fit_transform(test_df)

    pca_columns = ['Feature ' + str(index) for index in range(1, components + 1)]
    pca_covtype_df = pd.DataFrame(pca_data, columns=pca_columns)

    # Load the fitted mixture; the with-block makes sure the file is closed.
    with open("q2.pkl", 'rb') as model_file:
        gmm = pickle.load(model_file)
    gmm_labels = gmm.predict(pca_covtype_df)
    # mapped_labels renames cluster ids by frequency rank, using the label
    # ordering held in the notebook-level sorted_actual_labels.
    gmm_labels = mapped_labels(gmm_labels)

    return gmm_labels.tolist()

In [28]:
# Smoke run of predict(); the path passed in is the training CSV.
ans=predict("/content/drive/MyDrive/DMG Assignment 3/covtype_train.csv")

In [30]:
# Distinct labels present in the predictions.
set(ans)

{1, 2, 3, 4, 5, 6, 7}