# 04_6 OLS Model

Due to NDA agreements no data can be displayed.

In [None]:
import pandas as pd 
import numpy as np

from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.append("..")
import mlflow
from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the local dotfile. The context manager
# guarantees the file handle is closed as soon as the URI has been read
# (the bare open(...).read() left the handle open until garbage collection).
with open("../.mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

In [None]:
# read data: load Featureselection03.csv into a DataFrame
# (the path is relative to the current working directory)
df = pd.read_csv('../data/Featureselection03.csv')
# preview the first rows
df.head()

To keep everyone aligned on the feature selection, we created a separate CSV file that rates the importance of each feature, so that only important features are used for training our models and for further analysis.

In [None]:
# read list with feature importance
# (the 'VarName' and 'ModelImportance' columns of this table are used below)
data_log = pd.read_csv('../data/Capstone_features_Features.csv')
# preview the first rows
data_log.head()

### Create data frame with important features

Only features with an importance rating smaller than 3 (column `ModelImportance`) were selected.

In [None]:
# collect the names of all features whose importance rating is below 3
is_important = data_log['ModelImportance'] < 3
list_imp_feat = data_log.loc[is_important, 'VarName'].tolist()
# number of selected features
len(list_imp_feat)

In [None]:
# independent copy of the selected columns, so later edits do not touch df
df_model = df.loc[:, list_imp_feat].copy()

In [None]:
# inspect dtypes and non-null counts of the selected columns
df_model.info()

### Fill and drop NaN

The features V.SLPOG.act.PRC and ME.SFCI.act.gPkWh contain missing values. The EDA showed that it makes sense to fill these with 0.

In [None]:
# Fill the missing values of both columns with 0 via a direct assignment on
# the frame. Calling fillna(..., inplace=True) on a column selection is
# chained assignment: recent pandas versions warn about it, and with
# Copy-on-Write enabled it no longer updates df_model at all.
zero_fill_cols = ['V.SLPOG.act.PRC', 'ME.SFCI.act.gPkWh']
df_model[zero_fill_cols] = df_model[zero_fill_cols].fillna(0)

All other rows with missing values were dropped.

In [None]:
# discard every row that still contains at least one missing value
df_model = df_model.dropna()

In [None]:
# re-check dtypes and non-null counts after filling/dropping missing values
df_model.info()

### Check correlations

In [None]:
# Correlation heatmap of the numeric features.
# df_model still contains the string column 'passage_type' at this point;
# DataFrame.corr() raises on non-numeric columns since pandas 2.0 (older
# versions dropped them silently), so restrict the frame explicitly to
# numeric/boolean columns. This works on old and new pandas alike.
plt.figure(figsize=(30, 28))
numeric_cols = df_model.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_cols.corr(), annot=True, cmap='RdYlGn')

V.SOG.act.kn is still highly correlated with the target, but it is necessary to keep this feature.

### Define Target

The goal of this project was to create a tool that helps to optimize fuel efficiency. Therefore, the target was defined as ME.FMS.act.tPh. All other features were used for the prediction.

In [None]:
# target series first, then the predictor matrix (all columns but the target)
y = df_model['ME.FMS.act.tPh']
X = df_model.drop(columns=['ME.FMS.act.tPh'])

### Train Test Split

The train/test split was performed with a test size of 10%, which is sufficient given the large amount of data. Because passages in Europe were less common than the other two passage types, the split was stratified on the passage type.

In [None]:
# hold out 10% of the rows; stratifying on 'passage_type' keeps the share of
# each passage type the same in both splits (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=X['passage_type'],
    random_state=42,
)

### Create dummy values for feature 'passage_type'

The passage type was the only remaining feature of dtype `object`. Therefore, it has to be transformed into dummy variables.

In [None]:
# One-hot encode the categorical column(s) of the training frame, dropping
# the first level as the baseline.
# dtype=float: newer pandas versions return boolean dummies, and a frame that
# mixes bool and float columns is converted to an object array, which
# statsmodels rejects.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=float)
# Encode the test frame with ALL levels and then align it to the training
# columns. Encoding both frames independently with drop_first=True can drop a
# different baseline level (or produce fewer columns) when a category is
# missing from one split. After the reindex the test frame has exactly the
# training layout: dummies absent from the test data become 0, and levels
# not seen in training are discarded (i.e. treated as the baseline).
X_test = pd.get_dummies(X_test, dtype=float)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

### Add constant

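For the OLS model, a constant column (the intercept, i.e. the point where the regression line crosses the y-axis) needs to be added, because the statsmodels OLS does not include an intercept by default.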

In [None]:
# statsmodels' OLS fits no intercept unless the design matrix contains a
# constant column. has_constant='add' forces that column in; the default
# ('skip') silently omits it when a frame already holds a constant-valued
# column, which could leave train and test with different column layouts.
X_train = sm.add_constant(X_train, has_constant='add')
X_test = sm.add_constant(X_test, has_constant='add')

### Set MLFlow connection

MLflow was used so that everybody stays up to date on the model performance, and to keep track of the models and their enhancements.

In [None]:
# setting the MLFlow connection and experiment
# (TRACKING_URI is read from ../.mlflow_uri, EXPERIMENT_NAME comes from modeling.config)
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
# the run opened here stays active until mlflow.end_run() is called in the
# "Write to MLFlow" cell
mlflow.start_run(run_name='OLS_unscaled') # CHANGE!
# handle to the currently active run
run = mlflow.active_run()

## Modelling

In [None]:
# look at the training design matrix (dummy columns and constant included)
X_train.head()

Linear regression can be done with and without feature scaling. This run uses the unscaled features.

In [None]:
# Scaling is disabled for this run (run_name 'OLS_unscaled').
# NOTE(review): no `scaler` object is created in the visible cells; one
# (e.g. the imported MinMaxScaler) has to be instantiated before enabling this.
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

Setting up the OLS model. The OLS model solves the formula of a linear regression. Therefore, we can look at the coefficients afterwards to understand the feature importance.

In [None]:
# define the model: y_train is the dependent variable, X_train the regressors
lin_reg = sm.OLS(y_train, X_train)

Training the OLS model.

In [None]:
# fit the model; the result is used below for summary(), predict() and params
model = lin_reg.fit()

In [None]:
# show the regression summary of the fitted model
model.summary()

Performing model predictions.

In [None]:
# predictions for the held-out test data
y_pred = model.predict(X_test)
# predictions for the training data (used to compare train vs. test error)
y_pred_train = model.predict(X_train)

## Analysis

The RMSE (root mean squared error) was used as the metric to analyse and compare the models for this regression problem.

In [None]:
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the root is taken
# explicitly, which works on every version. Each score is computed once and
# reused for printing and for the MLflow logging later on.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_pred_train)))
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print('RMSE train: ', rmse_train)
print('RMSE test: ', rmse_test)

In [None]:
# predicted vs. actual target values: training data dark, test data orange
fig, ax = plt.subplots(figsize=(6, 6))
# light grey line through (1, 1) and (2, 2), i.e. predicted == actual
ax.axline([1, 1], [2, 2], color='lightgrey')
ax.scatter(y_train, y_pred_train, color='#33424F')
ax.scatter(y_test, y_pred, color='#FF6600')
#ax.set_xticks(np.arange(0,501,100));
#ax.set_yticks(np.arange(0,501,100));
ax.set_xlabel("ME.FMS.act.tPh actual")
ax.set_ylabel("ME.FMS.act.tPh predicted")
# same fixed range on both axes
ax.set_xlim(-2, 8)
ax.set_ylim(-2, 8);

Here it can be seen that the model makes errors in both directions, especially when predicting lower values of the target. For higher values, the model underpredicts.

### Write to MLFlow

In [None]:
# Parameters to be logged on MLflow for this run. They describe the feature
# selection, the input file, the handling of missing values and the scaling.
params = {
    "features drop": "according to Capstone_features_Features.csv",
    "explanation": "OLS unscaled",
    "csv used": "Featureselection03.csv",
    "NaN handling": "V.SLPOG.act.PRC and ME.SFCI.act.gPkWh filled with 0, rest dropped by row",
    # shape of the frame as read from the csv (df), i.e. before feature
    # selection and NaN handling
    "Shape": df.shape,
    # no scaler is used in this run
    "Scaler": "",
}

In [None]:
# store the run configuration
mlflow.log_params(params)
# tag the run as started from a notebook
mlflow.set_tag("running_from_jupyter", "True")
# store the error metric of both splits
for metric_name, metric_value in (
    ("train-RMSE", rmse_train),
    ("test-RMSE", rmse_test),
):
    mlflow.log_metric(metric_name, metric_value)
# logging the model to mlflow will not work without a AWS Connection setup.. too complex for now
# but possible if running mlflow locally
# mlflow.log_artifact("../models")
# mlflow.sklearn.log_model(reg, "model")
# close the run that was opened in the MLflow setup cell
mlflow.end_run()

### What are the most important features?

In [None]:
# fitted coefficients, one per column of X_train (including the constant)
# NOTE(review): the features are unscaled in this run, so coefficient
# magnitudes depend on each feature's units; confirm before ranking features
model.params