In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

In [3]:
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
data = pd.read_csv('../covtype.data', header=None)
data = data.iloc[:1000, :]


columns = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points'
]
wilderness_areas = [f'Wilderness_Area_{i}' for i in range(1, 5)]
soil_types = [f'Soil_Type_{i}' for i in range(1, 41)]
columns.extend(wilderness_areas)
columns.extend(soil_types)
columns.append('Cover_Type')
data.columns = columns
data

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2982,10,18,30,7,4562,197,200,136,2538,...,0,0,0,0,0,0,0,0,0,1
996,3032,339,14,95,23,4710,189,219,169,518,...,0,0,0,0,0,0,0,0,0,1
997,2896,72,16,319,44,3294,236,208,98,2726,...,0,0,0,0,0,0,0,0,0,2
998,2846,135,2,0,0,3056,222,238,152,2349,...,0,0,0,0,0,0,0,0,0,1


In [11]:
print(len(data.iloc[0].values))
print(len([2804, 134, 9, 60, -4, 215, 237, 222, 142, 972,
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]))
print(list(data.iloc[0].values))


55
41
[2596, 51, 3, 258, 0, 510, 221, 232, 148, 6279, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5]


In [25]:
X = data.drop('Cover_Type', axis=1)
y = data['Cover_Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [27]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
forest_pred = random_forest.predict(X_test)
forest_acc = accuracy_score(y_test, forest_pred)
print("Accuracy Score - Random Forest = ", forest_acc)

Accuracy Score - Random Forest =  0.825


In [28]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
tree_pred = decision_tree.predict(X_test)
tree_acc = accuracy_score(y_test, tree_pred)
print("Accuracy Score - Decision Tree = ", tree_acc)

Accuracy Score - Decision Tree =  0.73


In [29]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
bayes_pred = naive_bayes.predict(X_test)
bayes_acc = accuracy_score(y_test, bayes_pred)
print("Accuracy Score - Naive-Bayes = ", bayes_acc)

Accuracy Score - Naive-Bayes =  0.265


In [30]:
kneighbors = KNeighborsClassifier()
kneighbors.fit(X_train, y_train)
kneighbors_pred = kneighbors.predict(X_test)
kneighbors_acc = accuracy_score(y_test, kneighbors_pred)
print("Accuracy Score - K-Nearest Neighbors (KNN) = ", kneighbors_acc)

Accuracy Score - K-Nearest Neighbors (KNN) =  0.77


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [31]:
logistic_reg = LogisticRegression()
logistic_reg.fit(X_train, y_train)
logistic_reg_pred = logistic_reg.predict(X_test)
logistic_reg_acc = accuracy_score(y_test, logistic_reg_pred)
print("Accuracy Score - Logistic Regression = ", logistic_reg_acc)

Accuracy Score - Logistic Regression =  0.745


In [32]:
svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
svc_acc = accuracy_score(y_test, tree_pred)
print("Accuracy Score - SVC = ", svc_acc)

Accuracy Score - SVC =  0.73
