# Random Forest


## Importing the libraries

In [84]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [85]:
# dataset = pd.read_csv("crop_yield_prediction.csv")
# dataset = pd.read_csv("module_2_data.csv")
dataset = pd.read_csv("module_2_more_than_mean_data.csv")
dataset.head()

Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production
0,Maharashtra,AHMEDNAGAR,Kharif,Chick Peas,40800.0,18600.0
1,Maharashtra,AHMEDNAGAR,Kharif,Sugarcane,45900.0,38940.0
2,Maharashtra,AHMEDNAGAR,Rabi,Wheat,79700.0,87100.0
3,Maharashtra,AHMEDNAGAR,Kharif,Sugarcane,59600.0,5231800.0
4,Maharashtra,AHMEDNAGAR,Rabi,Chick Peas,59600.0,40900.0


In [86]:
dataset.dropna(inplace=True)

In [87]:
dataset.shape

(2028, 6)

In [88]:
dataset.describe()

Unnamed: 0,Area,Production
count,2028.0,2028.0
mean,76539.413215,478949.2
std,72194.623663,1742710.0
min,24100.0,1200.0
25%,36400.0,26500.0
50%,52400.0,52350.0
75%,79200.0,139550.0
max,558800.0,20049700.0


## Feature Scaling

In [89]:
from sklearn import preprocessing


std_scale = preprocessing.StandardScaler().fit(dataset[['Production']])
dataset.Production = std_scale.transform(dataset[['Production']])

In [90]:
dataset.Area = std_scale.transform(dataset[['Area']])

In [91]:
from sklearn.preprocessing import LabelEncoder
  
le = LabelEncoder()
  
dataset['Crop']= le.fit_transform(dataset['Crop'])
dataset['District_Name']= le.fit_transform(dataset['District_Name'])
dataset['Season']= le.fit_transform(dataset['Season'])


In [92]:
dataset.head()

Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production
0,Maharashtra,0,0,2,-0.25148,-0.264222
1,Maharashtra,0,0,12,-0.248553,-0.252548
2,Maharashtra,0,1,13,-0.229153,-0.224906
3,Maharashtra,0,0,12,-0.24069,2.727949
4,Maharashtra,0,1,2,-0.24069,-0.251423


In [93]:
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:,-1].values

## Splitting the dataset into the Training set and test set

In [94]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training the Random Forest Regression model on the Training set

In [95]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=22, random_state=0)
regressor.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=22, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

## Predicting the Test set results

In [96]:
y_pred = regressor.predict(X_test)

np.set_printoptions(precision=2)

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

[[-2.48e-01 -2.49e-01]
 [-1.31e-01 -1.69e-01]
 [-2.48e-01 -2.66e-01]
 [-2.66e-01 -2.70e-01]
 [-1.69e-01 -2.45e-01]
 [-2.58e-01 -2.66e-01]
 [-1.56e-01 -1.51e-01]
 [-2.42e-01 -2.35e-01]
 [-2.12e-01 -2.02e-01]
 [-2.42e-01 -2.48e-01]
 [-2.59e-01 -2.64e-01]
 [-2.26e-01 -2.39e-01]
 [-2.48e-01 -2.51e-01]
 [-2.36e-01 -2.38e-01]
 [-2.63e-01 -2.62e-01]
 [-5.59e-02  4.22e-03]
 [-1.00e-01 -1.77e-01]
 [-2.71e-01 -2.67e-01]
 [-2.47e-01 -2.53e-01]
 [-1.67e-01 -2.13e-01]
 [-1.45e-01 -1.20e-01]
 [ 1.39e+00  1.51e+00]
 [-2.61e-01 -2.59e-01]
 [-8.31e-02 -1.06e-01]
 [-2.61e-01 -2.66e-01]
 [-2.49e-01 -2.51e-01]
 [-2.34e-01 -2.30e-01]
 [-2.54e-01 -2.46e-01]
 [-2.67e-01 -2.63e-01]
 [ 4.90e+00  4.56e+00]
 [-2.39e-01 -2.48e-01]
 [-2.39e-01 -2.50e-01]
 [ 2.61e-01  1.66e-01]
 [-2.50e-01 -2.43e-01]
 [-2.39e-01 -2.44e-01]
 [-1.77e-01 -1.83e-01]
 [-2.61e-01 -2.62e-01]
 [-1.48e-01 -1.85e-01]
 [-2.57e-01 -2.59e-01]
 [-9.29e-02  8.85e-02]
 [-2.56e-01 -2.45e-01]
 [-1.03e-01 -1.13e-01]
 [-2.23e-01 -2.24e-01]
 [-2.51e-01

## Model Evaluation using R squared

In [97]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9178934337272994

## Mean Square Error and Root MSE




In [98]:
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))

0.10110181230039411
0.31796511176604597


## Mean Absolute Error(MAE)

In [99]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

0.07098303800669493
