## Model Training

#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [1]:
# Basic Import
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Load the raw dataset from a path relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column that looks
# like a row index written out with the CSV — confirm, and consider reading with
# index_col=0 so it is not picked up as a numeric feature later.
df = pd.read_csv('data/raw.csv')

#### Show Top 5 Records

In [3]:
# Preview the first rows to sanity-check the column names and values.
df.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production,Yield
0,Jharkhand,RAMGARH,Potato,2013,Winter,1361.0,7544.0,5.54
1,Jammu and Kashmir,REASI,Moong(Green Gram),2015,Kharif,13.0,9.0,0.7
2,Haryana,GURGAON,Sweet potato,2009,Whole Year,39.0,800.0,20.51
3,Uttar Pradesh,JALAUN,Sannhamp,2010,Kharif,9.0,3.0,0.33
4,Tamil Nadu,MADURAI,Sugarcane,2006,Whole Year,6006.0,656204.0,109.26


#### Preparing X and Y variables

In [4]:
# Feature frame: every column of df except the regression target 'Yield'.
# NOTE(review): 'Unnamed: 0' stays in X and looks like a saved row index — confirm
# whether it should be a feature.
# NOTE(review): in the previewed rows Yield matches Production / Area, and both
# of those columns remain in X — verify this is intended and not target leakage.
X = df.drop(columns=['Yield'])

In [5]:
# Confirm the target column is gone from the feature frame.
X.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production
0,Jharkhand,RAMGARH,Potato,2013,Winter,1361.0,7544.0
1,Jammu and Kashmir,REASI,Moong(Green Gram),2015,Kharif,13.0,9.0
2,Haryana,GURGAON,Sweet potato,2009,Whole Year,39.0,800.0
3,Uttar Pradesh,JALAUN,Sannhamp,2010,Kharif,9.0,3.0
4,Tamil Nadu,MADURAI,Sugarcane,2006,Whole Year,6006.0,656204.0


In [6]:
# Print the distinct values of each categorical column. Labels are padded to a
# common width of 36 characters so the value arrays start in the same column.
for column in ('State', 'District', 'Crop', 'Season'):
    label = f"Categories in '{column}' variable:"
    print(label.ljust(36), df[column].unique())


Categories in 'State' variable:      ['Jharkhand' 'Jammu and Kashmir' 'Haryana' 'Uttar Pradesh' 'Tamil Nadu'
 'Tripura' 'Rajasthan' 'Andhra Pradesh' 'Karnataka' 'Madhya Pradesh'
 'Assam' 'Bihar' 'West Bengal' 'Maharashtra' 'Odisha' 'Chhattisgarh'
 'Meghalaya' 'Gujarat' 'Arunachal Pradesh' 'Himachal Pradesh' 'Manipur'
 'Nagaland' 'Uttarakhand' 'Kerala' 'Goa' 'Mizoram' 'Telangana' 'Punjab'
 'THE DADRA AND NAGAR HAVELI' 'Laddak' 'Puducherry' 'Delhi'
 'Andaman and Nicobar Island' 'CHANDIGARH' 'Sikkim' 'Daman and Diu'
 'Dadra and Nagar Haveli']
Categories in 'District' variable:   ['RAMGARH' 'REASI' 'GURGAON' 'JALAUN' 'MADURAI' 'BALLIA' 'FARIDABAD'
 'NORTH TRIPURA' 'PALI' 'EAST GODAVARI' 'UDUPI' 'BHIND' 'DIMA HASAO'
 'SAMASTIPUR' 'KATIHAR' 'BIRBHUM' 'DAVANGERE' 'NAGPUR' 'SATARA' 'ARARIA'
 'SONEPUR' 'SRIKAKULAM' 'MANDYA' 'GOPALGANJ' 'BULANDSHAHR' 'BALRAMPUR'
 'VARANASI' 'NALGONDA' 'SIROHI' 'LALITPUR' 'BEGUSARAI' 'BIDAR'
 'GARIYABAND' 'KANCHIPURAM' 'HOSHANGABAD' 'EAST GARO HILLS'
 'SIDDHARTH 

In [7]:
# Regression target: the 'Yield' column of the raw frame.
y = df['Yield']

In [8]:
# Display the target Series (pandas truncates the repr for long Series).
y

0             5.54
1             0.70
2            20.51
3             0.33
4           109.26
            ...   
272296        4.40
272297        0.54
272298    14054.41
272299        0.22
272300        0.41
Name: Yield, Length: 272301, dtype: float64

In [9]:
# Build a ColumnTransformer with two branches: one-hot encoding for the
# object-dtype (categorical) columns and standard scaling for all other columns.
# The column groups are derived from X's dtypes rather than hard-coded names.
# NOTE(review): every non-object column is scaled and used as a feature, which
# presumably includes 'Unnamed: 0' and 'Crop_Year' — confirm that is intended.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [10]:
# Fit the preprocessor on the full feature frame and rebind X to the encoded result.
# NOTE(review): because X is overwritten, re-running this cell would feed the
# already-encoded matrix back into the preprocessor; re-run the cell that builds
# X from df first (or restart the kernel) before executing it again.
# NOTE(review): fitting happens before the train/test split, so the scaler
# statistics and encoder categories are learned from rows that later land in the
# test set — consider fitting on the training split only.
X = preprocessor.fit_transform(X)

In [11]:
# Show the repr of the transformed feature matrix.
X

<272301x809 sparse matrix of type '<class 'numpy.float64'>'
	with 1906107 stored elements in Compressed Sparse Row format>

In [12]:
# (rows, columns) of the transformed feature matrix.
X.shape

(272301, 809)

In [13]:
# Separate the dataset into train and test sets: 20% of the rows are held out,
# and the fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

((217840, 809), (54461, 809))

#### Create an evaluation function that returns all metrics after model training

In [14]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)`` — mean absolute error, root mean squared
        error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is only needed as an intermediate for RMSE, so compute it once here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Candidate regressors, keyed by the display name used in the report below.
# Estimators with internal randomness get a fixed seed so that re-running the
# notebook reproduces the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}
# Parallel lists consumed by the results table: model names and their test R2.
model_list = []
r2_list = []

# Fit each model on the training split and report train/test metrics.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for the ranking table.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 418.3661
- Mean Absolute Error: 61.7393
- R2 Score: 0.7933
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 396.9123
- Mean Absolute Error: 62.3833
- R2 Score: 0.8153


Lasso
Model performance for Training set
- Root Mean Squared Error: 421.6182
- Mean Absolute Error: 50.5537
- R2 Score: 0.7900
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 399.1375
- Mean Absolute Error: 50.9260
- R2 Score: 0.8132


Ridge
Model performance for Training set
- Root Mean Squared Error: 418.4114
- Mean Absolute Error: 61.7342
- R2 Score: 0.7932
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 396.6721
- Mean Absolute Error: 62.3204
- R2 Score: 0.8155




### Results

In [None]:
# Rank the evaluated models by their test-set R2 score, best first.
(
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by="R2_Score", ascending=False)
)

## Linear Regression

In [None]:
# Fit a standalone linear regression so its test-set predictions (y_pred) can be
# inspected in the plots and table that follow.
lin_model = LinearRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
# r2_score is the coefficient of determination, not an accuracy, so the message
# names it as such; the value is scaled to a percentage for display.
score = r2_score(y_test, y_pred)*100
print(" R2 score of the model is %.2f%%" %score)

## Plot y_pred and y_test

In [None]:
# Scatter the held-out targets against the model's predictions.
plt.scatter(y_test, y_pred)
# Label both axes in one call; the trailing semicolon hides the return value.
plt.gca().set(xlabel='Actual', ylabel='Predicted');

In [None]:
# Actual vs. predicted with a fitted regression line; ci=None turns off the
# confidence band around the line.
sns.regplot(
    x=y_test,
    y=y_pred,
    color='red',
    ci=None,
);

#### Difference between Actual and Predicted Values

In [None]:
# Table of actual values, predictions and their difference (actual - predicted),
# indexed like y_test.
pred_df = pd.DataFrame(
    {
        'Actual Value': y_test,
        'Predicted Value': y_pred,
    }
).assign(Difference=lambda frame: frame['Actual Value'] - frame['Predicted Value'])
pred_df