# Import data and libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

tree_data = pd.read_csv('../../data/covtype.data', header=None)
tree_cols = ['Elevation', 'Aspect', 'Slope', 'Horizontal_To_Hydrology',
             'Vertical_To_Hydrology', 'Horizontal_To_Roadways',
             'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
             'Horizontal_To_Fire'] + \
            [f'Wilderness_Area_{i}' for i in range(4)] + \
            [f'Soil_Type_{i}' for i in range(40)] + \
            ['Cover_Type']

# print(tree_data.describe())
tree_data.columns = tree_cols
print("Nan values in dataset:", tree_data.isna().sum().sum())
print(tree_data.head())

Nan values in dataset: 0
   Elevation  Aspect  Slope  Horizontal_To_Hydrology  Vertical_To_Hydrology  \
0       2596      51      3                      258                      0   
1       2590      56      2                      212                     -6   
2       2804     139      9                      268                     65   
3       2785     155     18                      242                    118   
4       2595      45      2                      153                     -1   

   Horizontal_To_Roadways  Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0                     510            221             232            148   
1                     390            220             235            151   
2                    3180            234             238            135   
3                    3090            238             238            122   
4                     391            220             234            150   

   Horizontal_To_Fire  ...  Soil_Type_31  Soil_Ty

## Split training/testing data

In [3]:
x_tree = tree_data.drop('Cover_Type', axis=1)
y_tree = tree_data['Cover_Type']

x_train, x_test, y_train, y_test = train_test_split(x_tree, y_tree, test_size=0.2)


data = tree_data.to_numpy()

## Z Scaling Dataset

In [4]:
scalar = StandardScaler()

scaled = scalar.fit_transform(data[:,:10])

dataScaled = np.concatenate((scaled, data[:,10:54]), axis=1)

#tree_data['Cover_Type'].to_numpy().reshape(-1, 1)
x_zScaled_train, x_zScaled_test, y_zScaled_train, y_zScaled_test = train_test_split(dataScaled, tree_data['Cover_Type'].to_numpy().reshape(-1, 1), test_size=0.2)

## Min Max Scaled Dataset

In [5]:

scaler = MinMaxScaler()
dataScaled = scalar.fit_transform(data[:,:54])

x_minMaxScaled_train, x_minMaxScaled_test, y_minMaxScaled_train, y_minMaxScaled_test = train_test_split(dataScaled, tree_data['Cover_Type'].to_numpy(), test_size=0.2)


## L1 Scaled Dataset

## L2 Scaled Dataset

In [7]:
data_scaled = np.linalg.norm(data[:,:54])

x_l2Scaled_train, x_l2Scaled_test, y_l2Scaled_train, y_l2Scaled_test = train_test_split(dataScaled, tree_data['Cover_Type'].to_numpy(), test_size=0.2)


## Mini Batch Gradient Descent

In [11]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)

clf.fit(x_zScaled_train, y_zScaled_train.ravel())

print("Z Scaled Test Accuracy: ",clf.score(x_zScaled_test, y_zScaled_test))

Z Scaled Test Accuracy:  0.7230191991600905


In [12]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)

clf.fit(x_minMaxScaled_train, y_minMaxScaled_train.ravel())

print("Min Max Scaled Test Accuracy: ",clf.score(x_minMaxScaled_test, y_minMaxScaled_test))

Min Max Scaled Test Accuracy:  0.7217283546896379


In [13]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)

clf.fit(x_l2Scaled_train, y_l2Scaled_train.ravel())

print("L2 Scaled Test Accuracy: ",clf.score(x_l2Scaled_test, y_l2Scaled_test))

L2 Scaled Test Accuracy:  0.7248263814187241


## Stochastic Gradient Descent

In [14]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=1000)

clf.fit(x_zScaled_train, y_zScaled_train.ravel())

print("Z Scaled Test Accuracy: ",clf.score(x_zScaled_test, y_zScaled_test))

Z Scaled Test Accuracy:  0.7086047692400368


In [15]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=1000)

clf.fit(x_minMaxScaled_train, y_minMaxScaled_train.ravel())

print("Min Max Scaled Test Accuracy: ",clf.score(x_minMaxScaled_test, y_minMaxScaled_test))

Min Max Scaled Test Accuracy:  0.7053260242850873


In [16]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=1000)

clf.fit(x_l2Scaled_train, y_l2Scaled_train.ravel())

print("L2 Scaled Test Accuracy: ",clf.score(x_l2Scaled_test, y_l2Scaled_test))

L2 Scaled Test Accuracy:  0.7121847112380919


## Batch Gradient Descent