# Modelling and Error Analysis

In [14]:
# libraries required are imported
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from random import sample
import warnings # alert the user of some condition in program
warnings.filterwarnings('ignore') # There is warning if there are some absolete of
                                  # certain programming elements such as keywords or class, etc

In [15]:
#To see all the columns of dataset
pd.set_option('display.max_columns', 50)

### Import Dataset 

Import cleaned dataset

In [16]:
# Read dataset cleaned
dataset = pd.read_csv('Clean_Dataset.csv')
print(dataset)

        Unnamed: 0         a         e          i          om           w  \
0                0  2.769165  0.076009  10.594067   80.305532   73.597694   
1                1  2.772466  0.230337  26.577378  173.080063  310.048857   
2                2  2.669150  0.256942  12.988919  169.852760  248.138626   
3                3  2.361418  0.088721   7.141771  103.810804  150.728541   
4                4  2.574249  0.191095   5.366988  141.576605  358.687607   
...            ...       ...       ...        ...         ...         ...   
136401      797835  3.155975  0.343178  26.577378  115.532995  136.849398   
136402      797860  3.171225  0.159119  26.577378  309.036573   19.746812   
136403      798077  2.548410  0.076071  11.593237  246.298656  170.090810   
136404      799752  3.051336  0.287449  14.456779  343.917822  342.614839   
136405      810375  2.417477  0.109001   4.525668  148.244819   31.949854   

               q        ad     per_y  data_arc  condition_code  n_obs_used 

In [17]:
# dataset.head() prints/shows first 5 rows of dataset
dataset.head()

Unnamed: 0.1,Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,neo,pha,albedo,moid,class,n,per,ma,Diameter
0,0,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,1002.0,11.85,N,N,0.09,1.59478,MBA,0.213885,1683.145708,77.372096,10.24
1,1,2.772466,0.230337,26.577378,173.080063,310.048857,2.133865,3.411067,4.616444,14947.5,0,2145.0,11.85,N,N,0.101,1.23324,MBA,0.213503,1686.155999,59.699133,10.24
2,2,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,14947.5,0,2145.0,11.85,N,N,0.214,1.03454,MBA,0.226019,1592.787285,34.925016,10.24
3,3,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,14947.5,0,2145.0,11.85,N,N,0.3905,1.13948,MBA,0.271609,1325.432765,95.861936,10.24
4,4,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,14947.5,0,2145.0,11.85,N,N,0.274,1.09589,MBA,0.238632,1508.600458,282.366289,10.24


In [18]:
# dropped Unnamed column's
dataset.drop('Unnamed: 0', axis=1, inplace=True)
dataset.head()

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,n_obs_used,H,neo,pha,albedo,moid,class,n,per,ma,Diameter
0,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,0,1002.0,11.85,N,N,0.09,1.59478,MBA,0.213885,1683.145708,77.372096,10.24
1,2.772466,0.230337,26.577378,173.080063,310.048857,2.133865,3.411067,4.616444,14947.5,0,2145.0,11.85,N,N,0.101,1.23324,MBA,0.213503,1686.155999,59.699133,10.24
2,2.66915,0.256942,12.988919,169.85276,248.138626,1.983332,3.354967,4.360814,14947.5,0,2145.0,11.85,N,N,0.214,1.03454,MBA,0.226019,1592.787285,34.925016,10.24
3,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,14947.5,0,2145.0,11.85,N,N,0.3905,1.13948,MBA,0.271609,1325.432765,95.861936,10.24
4,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,14947.5,0,2145.0,11.85,N,N,0.274,1.09589,MBA,0.238632,1508.600458,282.366289,10.24


In [19]:
# Segregate/Separate the data into Numeric and Catergorical ones
categorical, numeric = [], []
for ele in dataset.columns:
    if dataset[ele].dtype == 'object':
        categorical.append(ele)
    else:
        numeric.append(ele)
print("Categorical Data : ", categorical)
print("Numeric Data : ", numeric)

Categorical Data :  ['neo', 'pha', 'class']
Numeric Data :  ['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'condition_code', 'n_obs_used', 'H', 'albedo', 'moid', 'n', 'per', 'ma', 'Diameter']


## Converting Categorical Data into Numerical 

### Creating Dummy Variables 

You can go through converting categorical data into numerical using dummy variables here: https://www.geeksforgeeks.org/convert-a-categorical-variable-into-dummy-variables/

In [20]:
#print("Categorical Data : ", categorical)

In [21]:
# one-hot encoding columns using 'get_dummies'
# 'Dummies': a binary variable that indicates whether a separate categorical variable takes on a specific value.
# get_dummies is used for data manipulation.
# drop_first = True :--> reduces the extra column created during dummy variable creation(Hence it reduces the correlations
# created among dummy variables).
#dummy_neo = pd.get_dummies(dataset['neo'], drop_first = True)
#dummy_neo.head()

In [22]:
#dummy_pha = pd.get_dummies(dataset['pha'], drop_first = True)
#dummy_pha.head()

In [23]:
#dummy_class = pd.get_dummies(dataset['class'], drop_first = True)
#dummy_class.head()

In [24]:
#new_data = pd.concat([dataset, dummy_neo, dummy_pha, dummy_class], axis=1)
#new_data.drop(['neo', 'pha', 'class'], axis=1, inplace=True)
#new_data

In [25]:
#new_data.head()

## Split data into Train and Test Data 

from sklearn.model_selection import train_test_split

you can go through this here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [26]:
# importing and defining the model
# "sklearn.model_selection.train_test_split": Split arrays or matrices into random train and test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_data.drop('Diameter', axis=1), new_data['Diameter'], test_size=0.2, random_state=1)

### Scaling the data 

In [27]:
# importing and defining model
# "sklearn.preprocessing import StandardScaler": Standardize features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

# "scale.fit_transform" 
X_train = scale.fit_transform(X_train)

#"scale.transform"
X_test = scale.transform(X_test)

### Evaluation 

You can go through all these here:

"from sklearn.metrics import mean_absolute_error": https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html

"from sklearn.metrics import mean_squared_error": https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html

"from sklearn.metrics import r2_score": https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

"from math import sqrt": https://www.geeksforgeeks.org/python-math-function-sqrt/
 

In [28]:
# importing modules
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
import seaborn as sns

In [29]:
def evaluate(y_pred, y_actual):
    mae = mean_absolute_error(y_actual, y_pred)
    mse = mean_squared_error(y_actual, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_actual, y_pred)
    
    print("Mean Absolute Error :->", mae)
    print("Mean Squared Error :->", mse)
    print("Root Mean Squared Error :->", rmse)
    print("R2_Score :->", r2)
    
    return mae, mse, rmse, r2

In [30]:
# "Applying Algorithms"
# for storing algorithms name and it's performanace
algo_score = {}

## Random Forest Regressor

sklearn.ensemble.RandomForestRegressor

you can go through this here: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [31]:
# training the random forest regressor model on the whole dataset
from sklearn.ensemble import RandomForestRegressor
RF_regressor = RandomForestRegressor()
RF_regressor.fit(X_train, y_train)

RandomForestRegressor()

In [32]:
# predicting the test set results
y_pred_RF = RF_regressor.predict(X_test)
mae, mse, rmse, r2 = evaluate(y_test, y_pred_RF)

Mean Absolute Error :-> 0.26880090902426895
Mean Squared Error :-> 0.17606453451299758
Root Mean Squared Error :-> 0.419600446273592
R2_Score :-> 0.9696602221807307


In [33]:
# saved calculated score of algorithm
algo_score['Random Forest Regressor'] = r2

## K Nearest Neighbor Regressor 

In [34]:
from sklearn.neighbors import KNeighborsRegressor
KNN = KNeighborsRegressor(n_neighbors = 4)
KNN.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=4)

In [35]:
# predicting the test set results
y_pred_KNN = KNN.predict(X_test)
mae, mse, rmse, r2 = evaluate(y_test, y_pred_KNN)

Mean Absolute Error :-> 0.4779474103804707
Mean Squared Error :-> 0.4730649601683345
Root Mean Squared Error :-> 0.6877971795292087
R2_Score :-> 0.9116504039294485


In [36]:
# saved calculated score of algorithm
algo_score['K Nearest Neighbor Regressor'] = r2

## Linear Regression 

from sklearn.linear_model import LinearRegression

you can go through this here: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [37]:
# training the linear regressor model on the whole dataset
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()

# training
linear_regression.fit(X_train, y_train)

LinearRegression()

In [38]:
# predicting the test set results
y_pred_LR = linear_regression.predict(X_test)
mae, mse, rmse, r2 = evaluate(y_test, y_pred_LR)

Mean Absolute Error :-> 0.5988856865940341
Mean Squared Error :-> 0.6338630346133242
Root Mean Squared Error :-> 0.7961551573740663
R2_Score :-> 0.8812456820746711


In [39]:
# saved calculated score of algorithm
algo_score['Linear Regression'] = r2

## Decision Tree Regressor 

sklearn.tree.DecisionTreeRegressor

you can go through this here: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

In [40]:
# training the decision tree regressor model on the whole dataset
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor()
DT.fit(X_train, y_train)

DecisionTreeRegressor()

In [41]:
# predicting the test set results
y_pred_DT = DT.predict(X_test)
mae, mse, rmse, r2 = evaluate(y_test, y_pred_DT)

Mean Absolute Error :-> 0.385107543435247
Mean Squared Error :-> 0.3580716550839381
Root Mean Squared Error :-> 0.5983908882026348
R2_Score :-> 0.9402192720134317


In [42]:
# saved calculated score of algorithm
algo_score['Decision Tree Regressor'] = r2

## XG Boost Regressor

You can go through this here: https://xgboost.readthedocs.io/en/stable/tutorials/index.html

In [43]:
# training the XG Boost regressor model on the whole dataset
from xgboost import XGBRegressor
XGB = XGBRegressor()
XGB.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [44]:
# predicting the test set results
y_pred_XGB = XGB.predict(X_test)
mae, mse, rmse, r2 = evaluate(y_test, y_pred_XGB)

Mean Absolute Error :-> 0.275480797861573
Mean Squared Error :-> 0.17380856857402793
Root Mean Squared Error :-> 0.4169035482866846
R2_Score :-> 0.9703281842649443


In [45]:
# saved calculated score of algorithm
algo_score['XG Boost Regressor'] = r2