In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns, leaving the rest as is.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All other columns pass through unchanged
    and the original column order is restored on output.
    """

    def __init__(self, columns):
        """Create the scaler.

        Args:
            columns: Column labels of the input DataFrame to standardize.
        """
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]``.

        Args:
            X: DataFrame containing at least the columns in ``self.columns``.
            y: Forwarded to the wrapped scaler's ``fit``; otherwise unused.

        Returns:
            ``self``, so calls can be chained.
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Use explicit per-column pandas reductions. np.mean / np.var on a
        # DataFrame delegate to the DataFrame reductions with axis=None, and
        # depending on the pandas version that may reduce over both axes.
        # ddof=0 keeps the population variance that np.var computes.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Args:
            X: DataFrame containing at least the columns in ``self.columns``.
            y: Unused.
            copy: Unused.

        Returns:
            A DataFrame with the same index and column order as ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: a default RangeIndex would misalign with the
        # unscaled columns in the concat below and introduce NaN rows
        # whenever X carries a non-default index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    
    
# Special class used to make predictions on new data.
    
class absenteeism_model():
    """Load a pickled model and scaler, preprocess new data, and predict.

    Typical use: construct with the two pickle paths, call
    ``load_and_clean_data`` with a CSV file, then call one of the
    ``predicted_*`` methods.
    """

    # Positional names applied after 'ID' and 'Reason for Absence' are
    # dropped and the four reason-group columns are appended.
    _RAW_COLUMNS = [
        'Date', 'Transportation Expense', 'Distance to Work', 'Age',
        'Daily Work Load Average', 'Body Mass Index', 'Education',
        'Children', 'Pets', 'Absenteeism Time in Hours',
        'reason_1', 'reason_2', 'reason_3', 'reason_4',
    ]

    # Column order once 'Date' has been replaced by its derived features.
    _ORDERED_COLUMNS = [
        'reason_1', 'reason_2', 'reason_3', 'reason_4', 'Month Value',
        'Day of the Week', 'Transportation Expense', 'Distance to Work',
        'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
        'Children', 'Pets', 'Absenteeism Time in Hours',
    ]

    # Columns removed before the data is handed to the scaler.
    _DROPPED_COLUMNS = [
        'Absenteeism Time in Hours', 'Day of the Week',
        'Daily Work Load Average', 'Distance to Work',
    ]

    def __init__(self, model_file, scaler_file):
        """Read the model and the scaler from their pickle files.

        Args:
            model_file: Path of the pickled model; it must provide
                ``predict`` and ``predict_proba``.
            scaler_file: Path of the pickled scaler; it must provide
                ``transform``.
        """
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # files that come from a trusted source.
        with open(model_file, 'rb') as model_fh, \
                open(scaler_file, 'rb') as scaler_fh:
            self.reg = pickle.load(model_fh)
            self.scaler = pickle.load(scaler_fh)
        self.data = None
        self.preprocessed_data = None
        self.df_with_predictions = None

    def load_and_clean_data(self, data_file):
        """Read a CSV file and preprocess it for the model.

        Stores the raw frame in ``self.df_with_predictions``, the cleaned
        (unscaled) frame in ``self.preprocessed_data`` and the scaler output
        in ``self.data``.

        Args:
            data_file: Path or file-like object accepted by ``pd.read_csv``.
                After 'ID' and 'Reason for Absence' are removed, the
                remaining columns are renamed by position, so they must
                appear in the order listed in ``_RAW_COLUMNS``.
        """
        df = pd.read_csv(data_file, delimiter=',')
        # Keep an untouched copy of the raw input for later use.
        self.df_with_predictions = df.copy()

        df = df.drop(['ID'], axis=1)
        # Placeholder target column so the positional rename below lines up.
        df['Absenteeism Time in Hours'] = np.nan

        # Group the reason codes into four indicator columns. The ranges are
        # tested directly on the codes rather than via get_dummies with
        # drop_first=True, because drop_first removes whichever code happens
        # to be smallest in this particular file (not necessarily 0).
        reason = df['Reason for Absence']
        reason_groups = pd.DataFrame({
            'reason_1': reason.between(1, 14),
            'reason_2': reason.between(15, 17),
            'reason_3': reason.between(18, 21),
            'reason_4': reason >= 22,
        }).astype(int)

        # The raw reason code is dropped to avoid multicollinearity.
        df = pd.concat(
            [df.drop(['Reason for Absence'], axis=1), reason_groups],
            axis=1,
        )
        df.columns = self._RAW_COLUMNS

        # Replace the date with month and weekday (Monday == 0) features.
        dates = pd.to_datetime(df['Date'], format='%d/%m/%Y')
        df['Month Value'] = dates.dt.month
        df['Day of the Week'] = dates.dt.weekday
        df = df[self._ORDERED_COLUMNS].copy()

        # Education becomes binary: level 1 -> 0, levels 2-4 -> 1.
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        df = df.drop(self._DROPPED_COLUMNS, axis=1)
        df = df.fillna(value=0)

        self.preprocessed_data = df.copy()
        self.data = self.scaler.transform(df)

    def predicted_probability(self):
        """Return the second column of ``predict_proba`` for the loaded data.

        Returns:
            The probabilities, or ``None`` if no data has been loaded.
        """
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)[:, 1]

    def predicted_output_category(self):
        """Return the model's predictions for the loaded data.

        Returns:
            The predictions, or ``None`` if no data has been loaded.
        """
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    def predicted_outputs(self):
        """Attach 'Probability' and 'Prediction' to the preprocessed data.

        Returns:
            ``self.preprocessed_data`` with the two columns added, or
            ``None`` if no data has been loaded.
        """
        if self.data is None:
            return None
        self.preprocessed_data['Probability'] = (
            self.reg.predict_proba(self.data)[:, 1]
        )
        self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
        return self.preprocessed_data
            
