In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
data = pd.read_csv("/kaggle/input/forest-cover-type-kernels-only/train.csv")

In [None]:
data.head()

In [None]:
data.shape

In [None]:
column_names = data.columns
column_names = list(column_names)
print(column_names)

In [None]:
column_names.index('Soil_Type1')

In [None]:
for name in column_names[15:]:
    data[name]=data[name].astype('int8')

In [None]:
data.info()

In [None]:
# Features: every column except the first and the last; target: the last column.
x, y = data.iloc[:, 1:-1].values, data.iloc[:, -1].values
print(x.shape, y.shape, sep='\n')

In [None]:
#Split data set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [None]:
#Standardisation
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
pca = PCA(.95)
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

In [None]:
pca_n = pca.explained_variance_ratio_
print(pca_n)

In [None]:
len(pca_n)

# SVM Linear

In [None]:
classifier_svc = SVC(kernel = 'linear', random_state=0)
classifier_svc.fit(x_train_pca, y_train)
# Faire de nouvelles prédictions
y_pred_svc = classifier_svc.predict(x_test_pca)

In [None]:
cm_svc = confusion_matrix(y_test, y_pred_svc)
cm_svc

In [None]:
accu_svc = accuracy_score(y_test, y_pred_svc)
accu_svc

# SVM RBF

In [None]:
# RBF-kernel SVM trained on the PCA features (fit returns the estimator).
classifier_rbf = SVC(kernel='rbf', random_state=0).fit(x_train_pca, y_train)

# Predict the held-out split.
y_pred_rbf = classifier_rbf.predict(x_test_pca)

# Confusion matrix of true vs. predicted classes.
conf_mat_rbf = confusion_matrix(y_true=y_test, y_pred=y_pred_rbf)
print(conf_mat_rbf)

# Fraction of held-out rows classified correctly.
acc_score_rbf = accuracy_score(y_true=y_test, y_pred=y_pred_rbf)
print('Accuracy rbf =', acc_score_rbf)

# Logistic Regression

In [None]:
# Logistic regression trained on the PCA features (fit returns the estimator).
classifier_log = LogisticRegression(random_state=0).fit(x_train_pca, y_train)

# Predict the held-out split.
y_pred = classifier_log.predict(x_test_pca)

# Confusion matrix of true vs. predicted classes.
conf_mat_log = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_mat_log)

# Fraction of held-out rows classified correctly.
acc_score_log = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy logistique =', acc_score_log)

# Decision Tree

In [None]:
classifier_tree = DecisionTreeClassifier(random_state=0)
classifier_tree.fit(x_train_pca, y_train)

y_pred_tree = classifier_tree.predict(x_test_pca)
accu_tree = accuracy_score(y_test, y_pred_tree)

In [None]:
print(accu_tree)

# Random Forest

In [None]:
classifier_forest = RandomForestClassifier(n_estimators = 200, random_state=0)
classifier_forest.fit(x_train_pca, y_train)

y_pred_forest = classifier_forest.predict(x_test_pca)

accu_forest = accuracy_score(y_test, y_pred_forest)
print(accu_forest)

In [None]:
cm_forest = confusion_matrix(y_test, y_pred_forest)
print(cm_forest)

# Prediction with Random Forest Model

In [None]:
data_T = pd.read_csv("/kaggle/input/forest-cover-type-kernels-only/test.csv")

In [None]:
#Transform int64 to int8 
for name in column_names[15:-1]:
    data_T[name]=data_T[name].astype('int8')

In [None]:
#Delete id column 
data_T = data_T.iloc[:, 1:].values
print(data_T.shape)

In [None]:
# Scaler: apply the already-fitted `scaler` to the test features.
# Only transform() is called here, so the scaler is not refitted.
data_T_scaled = scaler.transform(data_T)

In [None]:
# PCA: project the scaled test features with the already-fitted `pca`.
data_T_pca = pca.transform(data_T_scaled)

In [None]:
# Prediction: label every test row with the fitted random forest.
data_T_pred = classifier_forest.predict(data_T_pca)

In [None]:
# Show the predicted labels.
print(data_T_pred)

In [None]:
# Bar chart of how many test rows were assigned to each predicted class.
# One bar per distinct label keeps each bar centred on its class value;
# plt.hist's default of 10 equal-width bins does not do that for integer
# labels and leaves irregular gaps between bars.
cover_labels, cover_counts = np.unique(data_T_pred, return_counts=True)
fig, ax = plt.subplots()
ax.bar(cover_labels, cover_counts)
ax.set_xticks(cover_labels)
ax.set_xlabel('Types of Forest Cover')
ax.set_ylabel('Number of predicted rows')
ax.set_title('Predicted cover types on the test set')
plt.show()