# Model building for Quality prediction in Mining process

### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing the clean dataset

In [2]:
# Load the cleaned process data from disk; the bare name on the last line
# makes the notebook render the resulting DataFrame.
dataset = pd.read_csv(filepath_or_buffer='ML_data.csv')
dataset

Unnamed: 0.1,Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,Flotation Column 06 Air Flow,Flotation Column 07 Air Flow,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,0,55.20,16.98,3019.53,557.434,10.06640,1.74000,249.214,253.235,250.576,250.225,250.884,424.954,443.558,502.255,446.370,523.344,66.91,1.31
1,1,55.20,16.98,3024.41,563.965,10.06720,1.74000,249.719,250.532,250.862,250.137,248.994,432.939,448.086,496.363,445.922,498.075,66.91,1.31
2,2,55.20,16.98,3043.46,568.054,10.06800,1.74000,249.741,247.874,250.313,251.345,248.071,434.610,449.688,484.411,447.826,458.567,66.91,1.31
3,3,55.20,16.98,3047.36,568.665,10.06890,1.74000,249.917,254.487,250.049,250.422,251.147,442.865,446.210,471.411,437.690,427.669,66.91,1.31
4,4,55.20,16.98,3033.69,558.167,10.06970,1.74000,250.203,252.136,249.895,249.983,248.928,450.523,453.670,462.598,443.682,425.679,66.91,1.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717883,717883,49.75,23.20,2710.94,441.052,9.62129,1.65365,302.344,298.786,299.163,346.794,313.695,872.008,418.725,497.548,446.357,416.892,64.27,1.71
717884,717884,49.75,23.20,2692.01,473.436,9.62063,1.65352,303.013,301.879,299.487,330.023,236.700,864.409,418.377,506.398,372.995,426.337,64.27,1.71
717885,717885,49.75,23.20,2692.20,500.488,9.61874,1.65338,303.662,307.397,299.487,329.590,225.879,867.598,419.531,503.414,336.035,433.130,64.27,1.71
717886,717886,49.75,23.20,1164.12,491.548,9.61686,1.65324,302.550,301.959,298.045,351.453,308.115,876.591,407.299,502.301,340.844,433.966,64.27,1.71


### Separating independent and dependent variables

In [3]:
# Feature matrix: every column except the first one and the last one (the last
# column is taken as the target in a later cell).
# NOTE(review): this slice keeps the second-to-last column ("% Iron Concentrate")
# as an input feature — confirm that value is really available at prediction time.
x = dataset.iloc[:, 1:-1].values
x

array([[  55.2  ,   16.98 , 3019.53 , ...,  446.37 ,  523.344,   66.91 ],
       [  55.2  ,   16.98 , 3024.41 , ...,  445.922,  498.075,   66.91 ],
       [  55.2  ,   16.98 , 3043.46 , ...,  447.826,  458.567,   66.91 ],
       ...,
       [  49.75 ,   23.2  , 2692.2  , ...,  336.035,  433.13 ,   64.27 ],
       [  49.75 ,   23.2  , 1164.12 , ...,  340.844,  433.966,   64.27 ],
       [  49.75 ,   23.2  , 1164.12 , ...,  374.354,  441.182,   64.27 ]])

In [4]:
# Sanity check: (number of rows, number of feature columns) in the feature matrix.
x.shape

(717888, 17)

In [5]:
# Target vector: the last column of the frame ("% Silica Concentrate").
y = dataset.iloc[:, -1].values
y

array([1.31, 1.31, 1.31, ..., 1.71, 1.71, 1.71])

### Splitting Data Into Train And Test

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible.
# NOTE(review): the rows look sequential and neighbouring rows share the same
# target value, so a shuffled split may place near-duplicate rows in both train
# and test — confirm whether an ordered (shuffle=False) split is more appropriate.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Inspect one training row (index 100) as produced by the split above.
x_train[100]

array([6.40300000e+01, 6.26000000e+00, 3.45998000e+03, 5.87074000e+02,
       9.87521000e+00, 1.71349000e+00, 2.99707000e+02, 2.47866000e+02,
       2.98991000e+02, 2.98704000e+02, 2.98035000e+02, 4.55646000e+02,
       3.51549000e+02, 3.48525000e+02, 2.46531688e+02, 3.54954000e+02,
       6.34100000e+01])

### Feature Scaling

In [8]:
# Standardize the features with scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [9]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
# This cell overwrites x_train / x_test, so re-running it on its own would
# scale the already-scaled arrays a second time.
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

### Training, Testing and Evaluation of Model Using Decision Tree Regressor, Multi Linear Regression and Random Forest Regressor

# Multi Linear Regression:

In [10]:
from sklearn.linear_model import LinearRegression

# Linear regression model with default settings.
mr = LinearRegression()

In [11]:
# Fit the linear model on the (scaled) training split.
mr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Linear-model predictions for the test split.
# The name y_pred is reassigned by the later model sections, so it only holds
# these predictions until the next predict cell runs.
y_pred=mr.predict(x_test)
y_pred

array([1.54233892, 1.97925169, 4.69108445, ..., 3.91748569, 1.1824459 ,
       0.8526914 ])

In [13]:
from sklearn.metrics import r2_score

# R² (coefficient of determination) of the current y_pred against the test targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.6689064889176446

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean squared error of the linear-model predictions on the test split.
msem = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [15]:
# Mean absolute error of the linear-model predictions; the last line displays
# the mean squared error computed in the previous cell.
maem = mean_absolute_error(y_true=y_test, y_pred=y_pred)
msem

0.410810395667258

In [16]:
# Display the linear model's mean absolute error.
maem

0.493832775699165

# Decision Tree Regressor:

In [17]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with the default split criterion (mean squared error).
# The criterion is deliberately not passed by name: the spelling 'mse' was
# deprecated in scikit-learn 1.0 and removed in 1.2 (renamed 'squared_error'),
# while the default is the same criterion in both old and new releases.
# random_state pins the tree so reruns give the same model.
dt = DecisionTreeRegressor(random_state=0)

In [18]:
# Fit the decision tree on the (scaled) training split.
dt.fit(x_train,y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [19]:
# Decision-tree predictions for the test split (replaces any earlier y_pred).
y_pred=dt.predict(x_test)
y_pred

array([1.54      , 1.26      , 4.64220994, ..., 4.24      , 1.09      ,
       1.07      ])

In [20]:
from sklearn.metrics import r2_score

# R² of the decision-tree predictions against the test targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.994939944055702

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean squared error of the decision-tree predictions on the test split.
msed = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [22]:
# Mean absolute error of the decision-tree predictions; the last line displays
# the mean squared error computed in the previous cell.
maed = mean_absolute_error(y_true=y_test, y_pred=y_pred)
msed

0.00627835797137829

In [23]:
# Display the decision tree's mean absolute error.
maed

0.00607762508886812

# Random Forest Regressor:

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 10 trees using the default split criterion (mean squared error).
# The criterion is deliberately not passed by name: the spelling 'mse' was
# deprecated in scikit-learn 1.0 and removed in 1.2 (renamed 'squared_error'),
# while the default is the same criterion in both old and new releases.
# random_state pins bootstrap sampling so reruns give the same forest.
rf = RandomForestRegressor(n_estimators=10, random_state=0)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [25]:
from joblib import dump

# Persist the fitted random forest to disk with joblib.
dump(value=rf, filename='rfg.save')

['rfg.save']

In [26]:
# Random-forest predictions for the test split (replaces any earlier y_pred).
y_pred=rf.predict(x_test)
y_pred

array([1.54      , 1.26      , 4.7246552 , ..., 4.24000932, 1.09      ,
       1.07      ])

In [27]:
from sklearn.metrics import r2_score

# R² of the random-forest predictions against the test targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.9980343919324163

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean squared error of the random-forest predictions on the test split.
mser = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [29]:
# Mean absolute error of the random-forest predictions; the last line displays
# the mean squared error computed in the previous cell.
maer = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mser

0.0024388645531925515

In [30]:
# Display the random forest's mean absolute error.
maer

0.009222817392457339

In [31]:
# Training row 100 again, now holding the standardized values written back by
# the feature-scaling cell.
x_train[100]

array([ 1.49336977, -1.2300543 ,  0.4687905 ,  1.08505718,  0.26213061,
        0.47162572,  0.66376364, -1.1098551 ,  0.61834672,  0.1856932 ,
        0.22157487, -0.47874596, -0.73353628, -0.90548681, -2.05371114,
       -0.76816414, -1.49000759])

In [32]:
# Target value that belongs to training row 100.
y_train[100]

5.08

In [33]:
# Spot-check the forest on a row it was trained on: predict for training row 100
# and compare the result with y_train[100].
# Indexing with [[100]] keeps the 2-D (1, n_features) shape that predict()
# expects and reads the row straight from the array, instead of relying on a
# hand-pasted, rounded copy of printed output that goes stale whenever the data
# or the split changes.
z = rf.predict(x_train[[100]])
z

array([5.08])

In [34]:
import pickle

# Save the fitted random forest together with the fitted scaler: new inputs must
# be passed through the same scaler before they are given to the model.
filename = 'qualityPrediction.pkl'
scalerFile = 'scaler.pkl'

# Context managers make sure each file is flushed and closed even if pickling
# raises; a bare open(...) passed straight to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)
with open(scalerFile, 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)

# Observation:

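|Algorithm                |R² score (test set)  |R² in %
|-------------------------|---------------------|--------------
|Multi Linear Regression  | 0.6689064889176446  | 66.9%
|Decision Tree Regressor  | 0.994939944055702   | 99.5%
|Random Forest Regressor  | 0.9980343919324163  | 99.8%

The values above are R² scores (coefficient of determination) returned by `r2_score` on the 20% hold-out split; R² is a regression goodness-of-fit measure, not a classification accuracy.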