# Model Training

In [2]:
!pip show xgboost


Name: xgboost
Version: 3.1.1
Summary: XGBoost Python Package
Home-page: 
Author: 
Author-email: Hyunsu Cho <chohyu01@cs.washington.edu>, Jiaming Yuan <jm.yuan@outlook.com>
License: Apache-2.0
Location: C:\Users\turla\anaconda3\Lib\site-packages
Requires: numpy, scipy
Required-by: 


## 1) importing libraries

In [3]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

## 2) Import the csv_data

In [17]:
md = pd.read_csv('sensor_data.csv')

In [18]:
md.head()

Unnamed: 0,temp,vibration,pressure,failure
0,79.967142,0.364964,25.682532,0
1,73.617357,0.471096,29.843983,0
2,81.476885,0.341516,30.090084,0
3,90.230299,0.438408,32.363152,0
4,72.658466,0.121277,23.165708,0


## 3) creating x and y variable

In [19]:
X = md.drop(columns=['failure'],axis=1)
y = md['failure']

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   temp       2000 non-null   float64
 1   vibration  2000 non-null   float64
 2   pressure   2000 non-null   float64
dtypes: float64(3)
memory usage: 47.0 KB


### Inference
As we don't have any object data type there is no need to apply any feature engineering

## 4) train_test_split

In [10]:
from sklearn.model_selection import train_test_split

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

## 5)Creating an evaluate function to find out precision of the models

In [None]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

### 5.1) training the model

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.2850
- Mean Absolute Error: 0.2050
- R2 Score: 0.2422
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2931
- Mean Absolute Error: 0.2128
- R2 Score: 0.2643


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.3275
- Mean Absolute Error: 0.2144
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3420
- Mean Absolute Error: 0.2242
- R2 Score: -0.0014


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.2851
- Mean Absolute Error: 0.2043
- R2 Score: 0.2422
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2933
- Mean Absolute Error: 0.2124
- R2 Score: 0.2635


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.2569
- Mean Absolute Error: 0.1300
- R2 Score: 0.3845
----------------------

In [27]:
m = pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

In [28]:
m

Unnamed: 0,Model Name,R2_Score
8,AdaBoost Regressor,0.722335
5,Random Forest Regressor,0.666057
7,CatBoosting Regressor,0.634743
6,XGBRegressor,0.600159
4,Decision Tree,0.400557
0,Linear Regression,0.264332
2,Ridge,0.263489
3,K-Neighbors Regressor,0.063156
1,Lasso,-0.001416


In [30]:
adaboost = AdaBoostRegressor()
adaboost.fit(X,y)
a_y_pred = adaboost.predict(X_test)
print(r2_score(y_test,a_y_pred))

0.7476415084535654


In [32]:
model = RandomForestRegressor().fit(X,y)
m_y_pred = model.predict(X_test)
print(f'Random forest r2_score: {r2_score(y_test,m_y_pred)}')

Random forest r2_score: 0.9581831156782987


### hence RFR is the best model

## saving the best model

In [33]:
import joblib
joblib.dump(model, "model.pkl")

['model.pkl']