# **Forest Cover Type Prediction**

# **Importing libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **Read the data**

In [2]:
# Load the raw CSV into a DataFrame and preview its first rows.
csv_path = "../input/forest-cover-type-dataset/covtype.csv"
df = pd.read_csv(csv_path)
df.head()

In [3]:
# Preview the last ten rows of the frame.
df.tail(n=10)

# **About the data**

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [5]:
# Number of distinct values in each column.
df.nunique()

In [6]:
# How many rows fall into each Cover_Type class.
cover_type = df['Cover_Type']
cover_type.value_counts()

In [7]:
# Column dtypes, non-null counts and memory usage.
df.info()


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [9]:
# Count fully duplicated rows explicitly. The previous `df.drop_duplicates()`
# call discarded its result and only displayed a frame, so it neither removed
# nor measured duplicates. A result of 0 means the dataset has no duplicates.
df.duplicated().sum()

There are no duplicates in the dataset.

In [10]:
# Missing-value count per column (isna is the canonical name for isnull).
df.isna().sum()

# **EDA**

In [13]:
# Distribution of the Elevation feature.
elevation = df['Elevation']
sns.histplot(elevation, color='red')

In [17]:
# Distribution of the Aspect feature.
aspect = df['Aspect']
sns.displot(aspect, color='pink')

In [20]:
# Distribution of the Slope feature.
slope = df['Slope']
sns.displot(slope, color="blue")

In [11]:
# One histogram per column. The frame has dozens of columns, so the default
# figure size squeezes the subplots into an unreadable grid; use a large
# canvas and let matplotlib space the panels so titles do not overlap.
df.hist(figsize=(30, 30))
plt.tight_layout()

In [12]:
# Pairwise correlation of all columns, computed once and reused for the plot.
# (seaborn is already imported as `sns` in the imports cell; the previous
# version also recomputed the same matrix via `df[features].corr()`.)
corr = df.corr()
features = corr.index
plt.figure(figsize=(30, 30))
# plot heat map
a = sns.heatmap(corr, annot=True, cmap="RdYlGn")

In [13]:
# Bar chart of how many rows belong to each Cover_Type class.
sns.set_style('whitegrid')
sns.countplot(
    data=df,
    x='Cover_Type',
    palette='RdBu_r',
)

The target is an imbalanced multi-class variable.

# **Handling outliers**

In [21]:
# One box plot per column, skipping the last column of the frame.
feature_names = df.columns[:-1]
for feature in feature_names:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x=feature)

In [25]:
# Names of the 54 feature columns: ten measurement columns followed by the
# numbered Wilderness_Area1-4 and Soil_Type1-40 columns.
measurement_names = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
wilderness_names = ['Wilderness_Area%d' % i for i in range(1, 5)]
soil_names = ['Soil_Type%d' % i for i in range(1, 41)]
columns = measurement_names + wilderness_names + soil_names

In [27]:
# Cap outliers with the 1.5 * IQR rule, but only on non-binary features.
# For a 0/1 column in which fewer than 25% of the rows are 1, both quartiles
# are 0, so IQR == 0 and upper_limit == 0: capping would overwrite every 1
# with 0 and wipe the feature out. Columns with at most two distinct values
# are therefore left untouched.
continuous_columns = [col for col in columns if df[col].nunique() > 2]

Q1 = df[continuous_columns].quantile(0.25)
Q3 = df[continuous_columns].quantile(0.75)
IQR = Q3 - Q1
print("IQR: ", IQR)
lower_limit = Q1 - (1.5 * IQR)
print("lower_limit: ", lower_limit)
upper_limit = Q3 + (1.5 * IQR)
print("upper_limit: ", upper_limit)

# axis=1 aligns the per-column limits (Series indexed by column name) with
# the frame's columns, so each column is clipped to its own bounds.
df[continuous_columns] = df[continuous_columns].clip(
    lower=lower_limit, upper=upper_limit, axis=1
)


In [28]:
# Re-draw one box plot per column (last column excluded) after the capping step.
for feature in df.columns[:-1]:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x=feature)

**Standardization**

In [29]:
from sklearn.preprocessing import StandardScaler

# Columns to rescale to zero mean and unit variance; the remaining columns
# are left as they are.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the transform.
columns_scale = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
scaler = StandardScaler()
df[columns_scale] = scaler.fit_transform(df[columns_scale])
df.head()

# **Train Test Split**

In [30]:
# Separate the feature matrix from the Cover_Type target column.
X = df.drop(columns=['Cover_Type'])
y = df['Cover_Type']

In [31]:
# Display the feature matrix (every column except Cover_Type).
X

In [32]:
# Display the target column (Cover_Type).
y

**SMOTE Oversampling for Multi-Class Classification**
- SMOTE stands for Synthetic Minority Oversampling TEchnique.
- Oversampling means copying or synthesizing new examples of the minority classes so that their counts better resemble or match those of the majority classes.

In [33]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(random_state=0)
X, y = oversample.fit_resample(X, y)

In [36]:
from collections import Counter
counter = Counter(y)
for k,v in counter.items():
    per = v / len(y) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))
# plot the distribution
plt.bar(counter.keys(), counter.values())
plt.show()

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [38]:
# Report the dimensions of each partition produced by the split.
partitions = (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
)
for label, part in partitions:
    print(label, part.shape)

# **Model building and Evaluation**

In [39]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the fitted forest (and every metric reported
# below) reproducible across kernel restarts; n_jobs=-1 builds the trees on
# all available cores and does not change the result.
model = RandomForestClassifier(random_state=123, n_jobs=-1)
model.fit(X_train, y_train)

In [40]:
# Mean accuracy of the fitted classifier on the held-out test partition.
model.score(X_test,y_test)

In [43]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so importing it fails on current releases. Build the matrix explicitly
# and render it with ConfusionMatrixDisplay. Wrapping the plot call in
# print() only emitted the display object's repr, so that is dropped.
cm = confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()
plt.show()

In [42]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the model on the test partition.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)