# In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import pickle


# the custom scaler
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    The listed ``columns`` are scaled with a wrapped ``StandardScaler``; every
    other column of the input is returned unchanged, and the original column
    order is preserved.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # BaseEstimator.get_params() (used by repr() and sklearn.base.clone)
        # reads every __init__ parameter back from an attribute of the same
        # name, so each argument has to be stored on the instance.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``.

        Also stores the per-column mean (``mean_``) and population variance
        (``var_``) of the selected columns.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit column-wise reductions: np.mean / np.var applied to a
        # DataFrame delegate to pandas with axis=None, whose meaning differs
        # between pandas versions (per-column result vs. a single scalar).
        self.mean_ = selected.mean()
        # ddof=0 matches numpy's np.var default (population variance).
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        Columns not listed in ``self.columns`` are passed through untouched and
        the result keeps the column order of ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh RangeIndex
        # and the axis=1 concat below would misalign rows (producing NaNs)
        # whenever X does not have a default 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


# Helper class used to make predictions on new data with the saved model.
class absenteeism_model():
    """Load a pickled model and scaler, preprocess a CSV file, and predict.

    Typical use: construct the object, call ``load_and_clean_data`` with a CSV
    path, then call one of the ``predicted_*`` methods.
    """

    def __init__(self, model_file, sclaer_file):
        """Read the pickled model and scaler from the given file paths.

        Parameters
        ----------
        model_file : path to the pickled model (must offer ``predict`` and
            ``predict_proba``).
        sclaer_file : path to the pickled scaler (must offer ``transform``).
            The misspelled parameter name is kept so existing keyword callers
            keep working.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load model/scaler files that come from a trusted source.
        with open(model_file, 'rb') as model_handle, open(sclaer_file, 'rb') as scaler_handle:
            self.linreg = pickle.load(model_handle)
            self.scaler = pickle.load(scaler_handle)
        # Stays None until load_and_clean_data() has been called.
        self.data = None

    def load_and_clean_data(self, data_file):
        """Read ``data_file`` (CSV), preprocess it, and store the scaled features.

        Sets ``self.df_with_predictions`` (raw copy of the CSV),
        ``self.preprocessed_data`` (cleaned, unscaled features) and
        ``self.data`` (output of ``self.scaler.transform``).

        Raises
        ------
        KeyError
            If an expected column such as 'ID' or 'Reason for Absence' is missing.
        ValueError
            If, after removing 'ID' and 'Reason for Absence', the file does not
            have exactly the ten remaining expected columns.
        """
        # import the data
        df = pd.read_csv(data_file, delimiter=',')
        # keep an untouched copy of the raw input
        self.df_with_predictions = df.copy()
        # drop the 'ID' column
        df = df.drop(['ID'], axis=1)
        # placeholder target column, so the frame has the same layout as the
        # one used in the earlier preprocessing code; it is dropped again below
        df['Absenteeism Time in Hours'] = 'NaN'

        # Pull the reason codes out before the positional rename below; the
        # frame must be left with exactly the ten columns named in column_name.
        reason = df['Reason for Absence']
        df = df.drop(['Reason for Absence'], axis=1)

        # assign the expected names to the remaining columns (by position)
        column_name = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                       'Daily Work Load Average', 'Body Mass Index', 'Education',
                       'Children', 'Pets', 'Absenteeism Time in Hours']
        df.columns = column_name

        # Group the reason codes into 4 indicator columns. Testing the code
        # ranges directly gives the same result as building dummies and taking
        # the row-wise max over codes 1-14, 15-17, 18-21 and 22+, but does not
        # depend on which codes happen to be present in the new data.
        # NOTE(review): assumes reason codes are numeric - confirm against data.
        df['reason_1'] = reason.between(1, 14).astype(int)
        df['reason_2'] = reason.between(15, 17).astype(int)
        df['reason_3'] = reason.between(18, 21).astype(int)
        df['reason_4'] = (reason >= 22).astype(int)

        # convert the Date column into datetime
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

        # month number (1-12) and day of the week (Monday == 0) from the date
        df['Month value'] = df['Date'].dt.month
        df['Day of the week'] = df['Date'].dt.weekday

        # drop the Date column
        df = df.drop(['Date'], axis=1)

        # map the education variable: level 1 -> 0, levels 2-4 -> 1
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # reorder the columns
        # NOTE(review): these names must match the columns the pickled scaler
        # and model were fitted on - confirm against the training code.
        column_names_upgrade = ['reason_1', 'reason_2', 'reason_3', 'reason_4', 'Month value',
                                'Transportation Expense', 'Distance to Work', 'Age',
                                'Daily Work Load Average', 'Body Mass Index', 'Education',
                                'Children', 'Pets', 'Absenteeism Time in Hours', 'Day of the week']
        df = df[column_names_upgrade]

        # replace the NaN values
        df = df.fillna(value=0)

        # drop the placeholder target column
        df = df.drop(['Absenteeism Time in Hours'], axis=1)

        # drop the columns that are not used as features
        df = df.drop(['Distance to Work', 'Day of the week', 'Daily Work Load Average'], axis=1)

        # keep the cleaned (unscaled) data for predicted_outputs()
        self.preprocessed_data = df.copy()

        # scaled features consumed by the prediction methods
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the second column of ``predict_proba`` for the loaded data.

        Returns ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            return self.linreg.predict_proba(self.data)[:, 1]

    def predicted_outcome_category(self):
        """Return the model's predicted outcome for each loaded row.

        Returns ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            # predict() already returns one label per row (1-D), so it must
            # not be column-indexed the way predict_proba() is.
            return self.linreg.predict(self.data)

    def predicted_outputs(self):
        """Return the preprocessed data with 'Probability' and 'Prediction' added.

        The two columns are written into ``self.preprocessed_data`` itself.
        Returns ``None`` if no data has been loaded yet.
        """
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.linreg.predict_proba(self.data)[:, 1]
            self.preprocessed_data['Prediction'] = self.linreg.predict(self.data)
            return self.preprocessed_data


# The model class used to be nested inside CustomScaler; keep that access path
# working for any code that refers to CustomScaler.absenteeism_model.
CustomScaler.absenteeism_model = absenteeism_model


