In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import seaborn as sns 
from pandas.api.types import is_numeric_dtype

In [2]:
path= '../../datasets/main_data/bank-additional-full.csv'
full_bank = pd.read_csv(path, sep=';')

#### Module Data.py codes

In [182]:
#%%writefile data.py
# %%writefile ../scripts/data.py

import seaborn as sns
import matplotlib.pyplot as plt 
import os
import numpy as  np
import pandas as pd
import pandas
from pandas.api.types import is_numeric_dtype
from sklearn.ensemble import IsolationForest
import os
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin


def load_data(path="", sep=",", cols_to_drop=[]):
            
        try :
            data = pd.read_csv(path, sep)
            
            if len(cols_to_drop) > 0:
                for col in cols_to_drop:
                    data.drop(col, axis=1, inplace=True)

            return data 
        
        except:
            
            "No data path was passed upon inastantiation of object"
    


# define class Preprocess to preprocess data
# class Preprocess inherits from BaseEstimator & TransformerMixin
# the idea behind the Preprocess class is to preprocess our data ready for modelling

class Preprocessor(BaseEstimator, TransformerMixin):
    
    def __repr__(self):
        
        return "Used to prepare data for modelling"
    
    def  __init__(self):
        
        pass
        
#         self.path = path
#         self.cols_to_drop = cols_to_drop 
#         self.sep = sep 
        
        
    
    def fit(self, data, y=None):
        
        assert(type(data) is pandas.core.frame.DataFrame), "data must be of type pandas.DataFrame"
        
        self.data = data 
        
        print("Fitted")
        
        return self 
        


    def check_outliers(self, show_plot=False, save_img=os.getcwd()+'/outliers.png'):
            
        """
        This functions checks for columns with outlers using the IQR method

        It accespts as argmuent a dataset. 
        show_plot can be set to True to output pairplots of outlier columns    
        """

        self.outliers = [] 
        Q1 = self.data.quantile(0.25)  
        Q3 = self.data.quantile(0.75)
        IQR = Q3 - Q1
        num_data = self.data.select_dtypes(include='number')
        result = dict ((((num_data < (Q1 - 1.5 * IQR)) | (num_data > (Q3 + 1.5 * IQR)))==True).any())
        #data[(data[col] >= high)|(data[col] <= low)].index
        index = self.data[(num_data < Q1 - 1.5 * IQR) | (num_data > Q3 + 1.5 * IQR)].index
        for k,v in result.items():
            if v == True:  
                self.outliers.append(k)
        if show_plot:
            self.outlier_pair_plot = sns.pairplot(self.data[self.outliers]);
            print(f'{result},\n\n Visualization of outlier columns')
            plt.savefig(fname=save_img, format='png')
            return  self.outlier_pair_plot
        else:
            return self.data.loc[index, self.outliers] 
        
        
    def treat_outliers(self, type_='median_replace'):
            
        """
        This treat outliers using any ofthses 3 methods as specified by user

            1. median_replace -  median replacement

            2. quant_floor - quantile flooring

            3. trim - trimming 

            4. log_transform - log transformations

        The methods are some of the commont statistical methods in treating outler
        columns

        By default treatment type is set to median replacement

        """

        if type_ == "median_replace":

            for col in self.data.columns.tolist():
                if is_numeric_dtype(self.data[col]):
                    median = (self.data[col].quantile(0.50))
                    q1 = self.data[col].quantile(0.25)
                    q3 = self.data[col].quantile(0.75)
                    iqr = q3 - q1
                    high = int(q3 + 1.5 * iqr) 
                    low = int(q1 - 1.5 * iqr)
                    self.data[col] = np.where(self.data[col] > high, median, self.data[col])
                    self.data[col] = np.where(self.data[col] > high, median, self.data[col])        

        if type_ == "quant_floor":

            for col in self.data.columns.tolist():
                if is_numeric_dtype(data[col]):
                    q_10 = self.data[col].quantile(0.5)
                    q_90 = self.data[col].quantile(0.95)
                    self.data[col] =  self.data[col] = np.where(self.data[col] < q_10, q_10 , self.data[col])
                    self.data[col] =  self.data[col] = np.where(self.data[col] > q_90, q_90 , self.data[col])

        if type_ == "trim": 

            for col in self.data.columns.tolist():
                low = .05
                high = .95
                quant_df = self.data.quantile([low, high])
                for name in list(self.data.columns):
                    if is_numeric_dtype(self.data[name]):
                        self.data = self.data[(self.data[name] >= quant_df.loc[low, name]) 
                            & (self.data[name] <= quant_df.loc[high, name])]

        if type_ == "log_transform":  
            for col in self.data.columns.tolist():
                if is_numeric_dtype(self.data[col]):
                    self.data[col] = self.data[col].map(lambda i: np.log(i) if i > 0 else 0)

        if type_ == "isf":
            iso = IsolationForest(contamination=0.1)
            yhat = iso.fit_predict(self.data.select_dtypes(exclude='object'))
            #select all rows that are not outliers
            mask = yhat != -1 
            self.data = self.data[mask]


        return self.data 
    
    
    def map_col_values(self, col_name="", values_dict={}):

        self.data[col_name] = self.data[col_name].map(values_dict)

        return self.data
    
    
    def split_data_single(self, target_cols=[]):
            
        self.features = self.data.drop(columns=target_cols, axis=1) 

        self.target   = pd.DataFrame(self.data[target_cols])

        return self.features, self.target
    
    
    def encode (self, data_obj=None, use_features=True, use_target=False): 
        
        if data_obj is None and use_features == False and use_target == False:
        
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', )
            to_encode = self.data.select_dtypes(exclude='number')
            if self.data.shape[1] > 1:
                #ohe = MultiLabelBinarizer()
                self.data.drop(to_encode.columns.tolist(), axis=1, inplace = True)
                features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode))
                self.data = self.data.merge(features_cat_encode, left_index=True, right_index=True)
               # print(ohe.classes_) 
            else: 
                self.data = pd.DataFrame(ohe.fit_transform(to_encode))
                print(ohe.categories_) 
            return self.data
        
        if data_obj is not None:
        
            self.data_obj = data_obj
            print("Not None")
            
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', )
            to_encode = self.data_obj.select_dtypes(exclude='number')
            if self.data_obj.shape[1] > 1:
                #ohe = MultiLabelBinarizer()
                self.data_obj.drop(to_encode.columns.tolist(), axis=1, inplace = True)
                features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode))
                self.data_obj = self.data_obj.merge(features_cat_encode, left_index=True, right_index=True)
               # print(ohe.classes_) 
            else:
                self.data_obj = pd.DataFrame(ohe.fit_transform(to_encode))
                print(ohe.categories_) 
            return self.data_obj
        
        if use_features:
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', )
            to_encode = self.features.select_dtypes(exclude='number')
            if self.features.shape[1] > 1:
                #ohe = MultiLabelBinarizer()
                self.features.drop(to_encode.columns.tolist(), axis=1, inplace = True)
                features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode))
                self.features = self.features.merge(features_cat_encode, left_index=True, right_index=True)
               # print(ohe.classes_) 
            else: 
                self.features = pd.DataFrame(ohe.fit_transform(to_encode))
                print(ohe.categories_) 
            return self.features
        
        if use_target:
            
            ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', )
            to_encode = self.target.select_dtypes(exclude='number')
            if self.target.shape[1] > 1:
                #ohe = MultiLabelBinarizer()
                self.target.drop(to_encode.columns.tolist(), axis=1, inplace = True)
                features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode))
                self.target = self.target.merge(features_cat_encode, left_index=True, right_index=True)
               # print(ohe.classes_) 
            else: 
                self.target = pd.DataFrame(ohe.fit_transform(to_encode))
                print(ohe.categories_) 
            return self.target
            
    
    def split_data_double(self, features_=pd.DataFrame([[]]), target_=pd.DataFrame([[]]), 
                          test_size=.10, use_native=True):
        
        if use_native == False:
        
            if features.shape[0] != target.shape[0]:
                
                raise Exception("Wrong, you are trying to pass unequal shapes\n\
                Shapes of dataframes must be equal\n\
                Try target = target.iloc[0:features.shape[0]]")

            self.features_ = features_
            self.target_ = target_

            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.features_, 
                                                                                    self.target_,
                                           test_size= test_size, random_state=24)

            return self.X_train, self.X_test, self.y_train, self.y_test
        
        if use_native:
            
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.features, self.target,
                                           test_size= test_size, random_state=24)

            return self.X_train, self.X_test, self.y_train, self.y_test
        
    
    
    


    def scale_data(self, scale_data=pd.DataFrame([[]]),
                   scaler=RobustScaler(), use_features=True,
                  use_target=False, use_data=False):
        
        """
            Specify scaler type, scaler type must have fit_transform as a method
        """
        
        if use_features:
    
            self.features = scaler.fit_transform(self.features)

            return self.features
        
        if use_target:
            
            self.target = scaler.fit_transform(self.target)

            return self.target
        
        if use_data:
            
            self.data = scaler.fit_transform(self.data)

            return self.data
        
        if use_data == False and use_features == False and use_target == False:
            
            self.scale_data = scale_data
            
            self.scale_data = scaler.fit_transform(self.scale_data)

            return self.scale_data
            
            
        
        
        
            
            
            
    
    
    def transform(self, X):
        
        """
        Ideally, a preapred trainX data ought to be passed to in case of passing into a pipeline
        """
        
        self.data = X
                
        self.data = self.treat_outliers(type_="isf") 
        
        #self.data = self.map_col_values(col_name="y", values_dict={"no":0, "yes":1})
        
       # self.features, self.target = self.split_data_single(target_cols=["y"])
        #print(self.features)
                
        self.features = self.encode(self.features)
      #  self.target = self.target.iloc[0:self.features.shape[0], 0:]
        #print(self.target)
       # self.X_train, self.X_test, self.y_train, self.y_test = self.split_data_double(
        #    self.features, self.target, test_size=.10)
        
        scaler=RobustScaler() 
            
        X = scaler.fit_transform(self.X_train)
        
        return X
    
    
    def fit_transform(self, X, y=None):
        
        self.X = X
        
        return self.transform(self.X) 

### Test data.py module codes

In [183]:
p = Preprocessor() 

In [184]:
data = load_data(path, sep=';') 

In [185]:
data.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [186]:
p.fit(data)

Fitted


Used to prepare data for modelling

In [187]:
data.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [188]:
p.check_outliers()

Unnamed: 0,age,duration,campaign,pdays,previous,cons.conf.idx
0,56,261,1,999,0,-36.4
1,57,149,1,999,0,-36.4
2,37,226,1,999,0,-36.4
3,40,151,1,999,0,-36.4
4,56,307,1,999,0,-36.4
...,...,...,...,...,...,...
41183,73,334,1,999,0,-50.8
41184,46,383,1,999,0,-50.8
41185,56,189,2,999,0,-50.8
41186,44,442,1,999,0,-50.8


In [189]:
treated_data = p.treat_outliers(type_='isf')

In [190]:
 treated_data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40742,34,admin.,married,high.school,no,no,yes,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,no
40765,34,technician,single,university.degree,no,no,no,cellular,sep,fri,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,no
40781,33,admin.,married,university.degree,no,no,no,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,yes
40805,33,admin.,married,university.degree,no,yes,no,cellular,sep,thu,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.878,4963.6,yes


In [191]:
p.map_col_values(col_name='y', values_dict={'yes':1, 'no':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40742,34,admin.,married,high.school,no,no,yes,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,0
40765,34,technician,single,university.degree,no,no,no,cellular,sep,fri,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,0
40781,33,admin.,married,university.degree,no,no,no,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,1
40805,33,admin.,married,university.degree,no,yes,no,cellular,sep,thu,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.878,4963.6,1


In [192]:
p.data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40742,34,admin.,married,high.school,no,no,yes,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,0
40765,34,technician,single,university.degree,no,no,no,cellular,sep,fri,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,0
40781,33,admin.,married,university.degree,no,no,no,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,1
40805,33,admin.,married,university.degree,no,yes,no,cellular,sep,thu,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.878,4963.6,1


In [193]:
p.data.rename(columns={'y':'purchases'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [194]:
p.data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,purchases
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40742,34,admin.,married,high.school,no,no,yes,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,0
40765,34,technician,single,university.degree,no,no,no,cellular,sep,fri,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,0
40781,33,admin.,married,university.degree,no,no,no,cellular,sep,wed,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6,1
40805,33,admin.,married,university.degree,no,yes,no,cellular,sep,thu,...,2,999,0,nonexistent,-1.1,94.199,-37.5,0.878,4963.6,1


In [195]:
p.split_data_single(target_cols=['purchases'])

(       age         job  marital          education  default housing loan  \
 0       56   housemaid  married           basic.4y       no      no   no   
 1       57    services  married        high.school  unknown      no   no   
 2       37    services  married        high.school       no     yes   no   
 3       40      admin.  married           basic.6y       no      no   no   
 4       56    services  married        high.school       no      no  yes   
 ...    ...         ...      ...                ...      ...     ...  ...   
 40742   34      admin.  married        high.school       no      no  yes   
 40765   34  technician   single  university.degree       no      no   no   
 40781   33      admin.  married  university.degree       no      no   no   
 40805   33      admin.  married  university.degree       no     yes   no   
 40809   33      admin.  married  university.degree       no     yes   no   
 
          contact month day_of_week  duration  campaign  pdays  previous  

In [196]:
p.features

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40742,34,admin.,married,high.school,no,no,yes,cellular,sep,wed,214,2,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6
40765,34,technician,single,university.degree,no,no,no,cellular,sep,fri,152,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6
40781,33,admin.,married,university.degree,no,no,no,cellular,sep,wed,208,2,999,0,nonexistent,-1.1,94.199,-37.5,0.879,4963.6
40805,33,admin.,married,university.degree,no,yes,no,cellular,sep,thu,272,2,999,0,nonexistent,-1.1,94.199,-37.5,0.878,4963.6


In [197]:
p.target

Unnamed: 0,purchases
0,0
1,0
2,0
3,0
4,0
...,...
40742,0
40765,0
40781,1
40805,1


In [198]:
p.encode()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,43,44,45,46,47,48,49,50,51,52
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37062,43,281,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
37063,42,72,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
37064,42,122,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
37065,32,72,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [199]:
p.features

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,43,44,45,46,47,48,49,50,51,52
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37062,43,281,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
37063,42,72,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
37064,42,122,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
37065,32,72,1,999,0,-2.9,92.469,-33.6,1.029,5076.2,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
p.split_data_double() 

In [115]:
from sklearn.pipeline import FeatureUnion, Pipeline  
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

In [22]:
from sklearn.decomposition import PCA

In [23]:
fu = Pipeline([("process", Preprocessor()), ("pca", PCA(n_components=45))
                  ])

In [18]:
fu.fit(data) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Pipeline(steps=[('process', Preprocessor()), ('pca', PCA(n_components=45))])

In [24]:
fu.fit_transform(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([[-2.34713054e+00, -1.15792356e+00, -3.51087304e-01, ...,
        -9.99429445e-02,  1.23065119e-02, -1.74090288e-03],
       [-2.34949504e+00, -1.72360001e+00, -5.00106619e-01, ...,
        -2.47896566e-02,  7.86031098e-04,  4.84544891e-04],
       [-2.34708831e+00, -1.14146684e+00, -5.35325365e-01, ...,
        -3.24254356e-03,  6.01530426e-03, -3.13036517e-03],
       ...,
       [-2.34891350e+00, -1.60417013e+00, -9.62050753e-01, ...,
        -2.95414847e-03, -9.81316703e-04, -4.93331673e-04],
       [-2.34966801e+00, -1.70123082e+00,  6.47114622e-01, ...,
        -1.21677843e-03,  7.06689978e-04, -2.69115175e-03],
       [-2.34742625e+00, -1.18674843e+00, -1.02362369e-01, ...,
        -3.44592487e-03,  5.19998228e-03, -6.95102712e-04]])

In [20]:
fu.transform(data)  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([[-1.58949551e-01, -1.86494902e+00, -1.82183133e+00, ...,
         3.30512032e-02, -1.73905557e-03, -6.13112738e-03],
       [-1.60497482e-01, -1.79684731e+00,  1.03446088e+00, ...,
        -3.86445477e-03, -8.79734370e-04,  4.71183478e-04],
       [-1.33437457e-01,  4.09130257e+00,  2.73888830e-01, ...,
        -7.54461415e-03,  4.10748791e-02,  1.22315779e-03],
       ...,
       [-1.58296948e-01, -1.62969611e+00, -9.24110433e-01, ...,
         1.66798780e-03, -1.61178731e-03, -8.71970900e-05],
       [-1.58007310e-01, -1.56585081e+00, -7.05530799e-01, ...,
        -5.22592293e-03,  5.35685538e-03, -1.11982302e-05],
       [-1.56916119e-01, -1.16520232e+00, -3.68814475e-01, ...,
        -7.64868422e-03, -7.87634584e-03,  5.79650113e-04]])

In [26]:
from imblearn.over_sampling import SMOTE

In [32]:
 smt = SMOTE(sampling_strategy=.7)

#### Module plot.py codes

In [142]:
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import os

def plot_univariate (data, x=None, y=None, color='r',save=False,
                title='New Chart', chart_type='hist', xlabel='', ylabel='',
                    save_to=os.getcwd(), log_normalise=False):
    
    
    """
    Make a univariate plot of any of these selcted types:
    
    1. bar - barchart
    
    2. hist - Histogram
    
    3. pie - Piechart
    
    4. count - Countplot
    
    
    """
    
    plt.subplots(figsize=(10,7))
    plt.title(title, fontsize=18)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    
    
    if chart_type == 'hist':
        if log_normalise:
            data = np.log(data)
        plot = sns.distplot(a=data, color=color)
        if save:
            plt.savefig(fname=save_to+f'/{title}.png', format='png')
        
    return plot

def plot_bivariate(data, x=None, y=None, hue=None, 
                  color='r',save=False,
                title='New Chart', chart_type='hist',
                   xlabel='', ylabel='',
                    save_to=os.getcwd(), img_name = " ", 
                   palette={'use':False, "size":1}, log_normalise=False,
                  kind_joint_plot = 'scatter', kind_pair_plot="scatter", figsize=(10,7)):
    
    """
    Make a bivariate plot of any of the selcted types:
    
    1. bar - barchart
    
    2. scatter  - scatter plot
    
    3. cat  - catplot
    
    4. count - countplot
    
    5 joint - jointplot 
    
    6  pair - pairplot
    
    7  corr - corr_plot
    
    When calling joint_plot:
        
        kind_joint_plot is default to `scatter`
        other types include "reg", "reside", "kde", "hex"
        
    When calling pair_plot:
        
        kind_pair_plot is default to `scatter`
        other types include 'reg'
    """
    def plt_tweaks():
        plt.subplots(figsize= figsize)
        plt.title(title, fontsize=18)
        plt.xlabel(xlabel, fontsize=15)
        plt.ylabel(ylabel, fontsize=15)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
    
    
    # define helper functions
    
    def use_palette():
        palettes = []
#        palette_to_use=[]
        if palette['use'] == True:
            palette_to_use = [palettes[i] for i in range(palette['size'])]
            
            return palette_to_use

    def log_norm():
        if log_normalise and y != None:
            y = np.log(y)
        elif log_normalise and y == None:
            data = np.log(data)
            
    def save_image():
        if save:
            if img_name != " ":
                plt.savefig(fname=save_to+"/"+img_name+'.png', format='png')
            else:
                plt.savefig(fname=save_to+f'/{title}.png', format='png')
                
        
    # make plots
    
    if chart_type == "joint":
        log_norm()
        plot = sns.jointplot(x=x, y=y, data=data,
                            height=6, ratio=5, space=0.2, kind=kind_joint_plot)
        
        save_image()
        
    if chart_type == "pair":
       # try:
        log_norm()
        if palette['use'] == True:
            palette_to_use = use_palette()
            plot = sns.pairplot(data, palette=palette_to_use, 
                            kind= kind_pair_plot,height=3, aspect=1, hue=hue)
        else:
             plot = sns.pairplot(data, 
                            kind= kind_pair_plot,height=2.5, aspect=1, hue=hue, )
        save_image()
        
    if chart_type  == "corr":
        plt_tweaks()
        corr_data = data.corr()
        corr_plot = sns.heatmap(corr_data,annot=True, fmt='.2g', center=0) 
        
    return plot

### Testing plot.py module codes

#### Module Model.py Codes

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, _random_over_sampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.pipeline import Pipeline as ImbPipe
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate 

def plot_pca_components(data):
    pca = PCA().fit(data)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance');
    
def check_imbalance(data,label='', x=0.7, y=30000):
    plt.subplots(figsize=(10,8))
    data[label].value_counts().plot(kind='bar')
    text = f'Class Imbalance Count:\n\n{data[label].value_counts().to_dict()}'
    plt.text(x=x, y=y, s = text ,  fontsize=15)
    
def encode (data):
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', )
    to_encode = data.select_dtypes(exclude='number')
    if data.shape[1] > 1:
        #ohe = MultiLabelBinarizer()
        data.drop(to_encode.columns.tolist(), axis=1, inplace = True)
        features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode))
        data = data.merge(features_cat_encode, left_index=True, right_index=True)
        #print(ohe.classes_) 
    else:
        data = pd.DataFrame(ohe.fit_transform(to_encode))
        print(ohe.categories_) 
    return data 

 

def x_y_split(data, x=None, y=None, type_="single", test_size=.10):
    
    """
    Single type divides into just x and y
    Double type divides into train and test for each of x and y
    """
    
    X, y = data.drop(columns=y, axis=1), data[y]
    
    if type_ == "single":
        
        return X, y
    
    if type == "double":
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                               test_size=test_size, random_state=123)
        
        return X_train, X_test, y_train, y_test
    
    
    
# def model_pipeline(X_train=None, y_train=None, X_test=None, pca=PCA(), 
#                    cv=StratifiedKFold(), imb_sample=SMOTE(random_state=123),
#                   model=LogisticRegressionCV()):
    
#     """
#     Trains a model for an imbalanced class using the specified estimator
#     The training is done in K-folds or its nuances as specified folds 
#     applying the specified sampling strategy
#     """
    
#     model = ImbPipe([('imb_sample', imb_sample), ('pca', pca), ('model', model)])
#     model.fit(X_train, y_train) 
#     y_hat = model.predict(X_test) 
#     return model, y_hat
    
    
def gridSearch(model,hyper_params={},cv=StratifiedKFold(), x_train=None, y_train=None):
    
    """
    Performs GridSeach of the best hyperparmaters for the passed model
    """
    
    search = GridSearchCV(model=model, param_grid = hyper_params, n_jobs=-1, cv=cv)
    search.fit(X=x_train, y=y_train)
    print("Best parameter (CV score=%0.3f):\n" % search.best_score_)
    print(search.best_params_)
    print(search.score) 
    return search


def plot_grid_search(search_obj, pca_obj, X_train):
    
    """
    Prints the best (optimised) hyperparmatersfor the grid search object
    and plots the optimised pca components
    """
    
    print("Best parameter (CV score=%0.3f):\n" % search.best_score_)
    print("Best Params:",search.best_params_)
    pca.fit(X_train_scaled)

    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8, 8))
    ax0.plot(np.arange(1, pca.n_components_ + 1),
             pca.explained_variance_ratio_, '+', linewidth=2)
    ax0.set_ylabel('PCA explained variance ratio')

    ax0.axvline(search.best_estimator_.named_steps['pca'].n_components,
                linestyle=':', label='n_components chosen')
    ax0.legend(prop=dict(size=12))

    # For each number of components, find the best classifier results
    results = pd.DataFrame(search.cv_results_)
    components_col = 'param_pca__n_components'
    best_clfs = results.groupby(components_col).apply(
        lambda g: g.nlargest(1, 'mean_test_score'))

    best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
                   legend=False, ax=ax1)
    ax1.set_ylabel('Classification accuracy (val)')
    ax1.set_xlabel('n_components')

    plt.xlim(-1, 70)

    plt.tight_layout()
    plt.show() 
    

class metrics ():
    
    def __init__(self, y_test, y_hat):
        pass
        self.y_test = y_test
        self.y_hat =  y_hat
        
    
    def class_report(self):
        
        full_report = classification_report(self.y_test, self.y_hat)
        
        print(full_report)
        
    def conf_matrix(self):
        
        conf_matrix = confusion_matrix(self.y_test, self.y_hat)
        
        conf_matrix_df = pd.DataFrame(conf_matrix, columns=['Actual_+ve', 'Actual_-ve'],
                               index=['predicted_+ve', 'predicted_-ve'])
        
        return conf_matrix_df
    
    def accuracy_score(self):
        return  accuracy_score(self.y_test, self.y_hat)
    
    def classification_error(self):
        
        return 1 - accuracy_score() 
        
    def specif_sensitiv(self):
        
        """
        Sensitivity: When the actual value is positive, how often is the prediction correct?
        
        Specificity: When the actual value is negative, how often is the prediction correct?
        """
        
        conf_matrix = confusion_matrix(self.y_test, self.y_hat)
        
        TP = conf_matrix[1, 1]
        TN = conf_matrix[0, 0]
        FP = conf_matrix[0, 1]
        FN = conf_matrix[1, 0]
        
        sensitivity = TP / float(FN + TP)
        specificity = TN / (TN + FP)
        
        sensitiv_specific_table = pd.DataFrame([[sensitivity, specificity]],
                                               columns=['sensitivity', 'specificity'])
        
        return sensitiv_specific_table

In [155]:
LogisticRegressionCV()

LogisticRegressionCV()

### Testing model.py codes

### Module Pipeline.py Codes