In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

In [2]:
# Relative location of the bank-additional-full CSV used throughout this notebook.
path= '../../datasets/main_data/bank-additional-full.csv'
# The file is semicolon-delimited, hence the explicit separator.
full_bank = pd.read_csv(path, sep=';')

#### Module data.py code

In [60]:
%%writefile data.py
#%%writefile ../scripts/project_package/data_package/data.py

import seaborn as sns
import matplotlib.pyplot as plt 
import os
import numpy as  np
import pandas as pd
import pandas
from pandas.api.types import is_numeric_dtype
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.base import *


class WrangleData():
    """Load a delimited file into a DataFrame and apply common wrangling steps.

    The working frame is stored on ``self.data``. ``split_data_single`` adds
    ``self.split1`` (features) and ``self.split2`` (targets). Most methods
    update these attributes and also return the updated object.
    """

    # Quantile bounds shared by the ``quant_floor`` and ``trim`` treatments.
    _LOW_QUANTILE = 0.05
    _HIGH_QUANTILE = 0.95

    def __repr__(self):
        return "Used to prepare data for wrangling"

    def __init__(self):
        pass

    def load_data(self, path="", sep=",", cols_to_drop=None):
        """Read a delimited file into ``self.data`` and return it.

        Parameters
        ----------
        path : str or file-like
            Location (or open buffer) handed to ``pandas.read_csv``.
        sep : str
            Field delimiter.
        cols_to_drop : list of str, optional
            Columns removed right after loading.

        Raises
        ------
        ValueError
            If no path was given. Errors raised while reading the file or
            dropping columns propagate to the caller instead of being hidden.
        """
        if path is None or (isinstance(path, str) and not path):
            raise ValueError("No data path was passed upon call of method load_data")

        if cols_to_drop is None:
            cols_to_drop = []
        elif isinstance(cols_to_drop, str):
            # A bare column name would otherwise be iterated character by character.
            cols_to_drop = [cols_to_drop]

        self.path = path
        self.cols_to_drop = cols_to_drop
        self.sep = sep

        # ``sep`` is passed by keyword: read_csv does not accept it positionally
        # in current pandas releases.
        self.data = pd.read_csv(path, sep=sep)

        if len(self.cols_to_drop) > 0:
            self.data.drop(columns=list(self.cols_to_drop), inplace=True)

        self._fit()

        return self.data

    def _fit(self):
        """Report whether a DataFrame has been loaded onto the object."""
        try:
            loaded = self.data
        except AttributeError:
            print("Hey Buddy you need to load a data first !!! ")
            return

        if not isinstance(loaded, pd.DataFrame):
            raise TypeError("data must be of type pandas.DataFrame")

        print("You are now fit to use this object for wrangling")

    def get_data(self):
        """Return the current working DataFrame."""
        return self.data

    def _numeric_columns(self):
        """Return the names of the numeric columns of ``self.data``."""
        return self.data.select_dtypes(include='number').columns.tolist()

    def check_outliers(self, show_plot=False, save_img=os.getcwd()+'/outliers.png'):
        """Find numeric columns containing outliers using the 1.5 * IQR rule.

        The names of the affected columns are stored in ``self.outliers``.

        Parameters
        ----------
        show_plot : bool
            When True, draw a pairplot of the outlier columns, save it to
            ``save_img`` and return the plot object.
        save_img : str
            Path of the PNG written when ``show_plot`` is True.

        Returns
        -------
        The pairplot when ``show_plot`` is True; otherwise the rows that hold
        at least one outlier, restricted to the outlier columns.
        """
        # Quantiles are only defined for numeric columns, so restrict first.
        num_data = self.data.select_dtypes(include='number')
        q1 = num_data.quantile(0.25)
        q3 = num_data.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (num_data < (q1 - 1.5 * iqr)) | (num_data > (q3 + 1.5 * iqr))

        result = dict(is_outlier.any())
        self.outliers = [col for col, flagged in result.items() if flagged]

        if show_plot:
            self.outlier_pair_plot = sns.pairplot(self.data[self.outliers])
            print(f'{result},\n\n Visualization of outlier columns')
            plt.savefig(fname=save_img, format='png')
            return self.outlier_pair_plot

        # Keep only rows where some numeric column is outside its IQR fence.
        rows_with_outliers = is_outlier.any(axis=1)
        return self.data.loc[rows_with_outliers, self.outliers]

    def treat_outliers(self, type_='median_replace'):
        """Treat outliers in the numeric columns of ``self.data``.

        Supported values of ``type_``:

            1. median_replace - replace values outside the 1.5 * IQR fences
               with the column median (default)

            2. quant_floor - clamp values to the 5th / 95th percentiles

            3. trim - drop rows with any numeric value outside the
               5th / 95th percentiles

            4. log_transform - natural log of positive values, 0 otherwise

            5. isf - drop rows flagged by an IsolationForest

        Returns the treated DataFrame (also stored on ``self.data``).

        Raises
        ------
        ValueError
            If ``type_`` is not one of the names above.
        """
        if type_ == "median_replace":
            for col in self._numeric_columns():
                values = self.data[col]
                median = values.quantile(0.50)
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                # Fences stay floating point: truncating them to int distorts
                # the bounds for small-scale columns.
                high = q3 + 1.5 * iqr
                low = q1 - 1.5 * iqr
                is_outlier = (values > high) | (values < low)
                self.data[col] = np.where(is_outlier, median, values)

        elif type_ == "quant_floor":
            for col in self._numeric_columns():
                values = self.data[col]
                floor = values.quantile(self._LOW_QUANTILE)
                cap = values.quantile(self._HIGH_QUANTILE)
                floored = np.where(values < floor, floor, values)
                self.data[col] = np.where(floored > cap, cap, floored)

        elif type_ == "trim":
            low = self._LOW_QUANTILE
            high = self._HIGH_QUANTILE
            numeric = self.data[self._numeric_columns()]
            if numeric.shape[1] > 0:
                # Bounds are computed once on the untrimmed data so every
                # column is judged against the same reference distribution.
                bounds = numeric.quantile([low, high])
                within = ((numeric >= bounds.loc[low])
                          & (numeric <= bounds.loc[high])).all(axis=1)
                self.data = self.data[within].copy()

        elif type_ == "log_transform":
            for col in self._numeric_columns():
                values = self.data[col].to_numpy(dtype=float)
                positive = values > 0
                # Non-positive (and missing) entries map to 0.
                logged = np.zeros(values.shape, dtype=float)
                logged[positive] = np.log(values[positive])
                self.data[col] = logged

        elif type_ == "isf":
            iso = IsolationForest(contamination=0.1)
            yhat = iso.fit_predict(self.data.select_dtypes(exclude='object'))
            # fit_predict labels outliers with -1; keep everything else.
            self.data = self.data[yhat != -1].copy()

        else:
            raise ValueError(
                f"Unknown outlier treatment {type_!r}; expected one of "
                "'median_replace', 'quant_floor', 'trim', 'log_transform', 'isf'")

        return self.data

    def map_col_values(self, col_name="", values_dict=None):
        """Replace the values of ``col_name`` using ``values_dict``.

        Keys of ``values_dict`` are the current values and the dict values are
        their replacements (``Series.map`` semantics). Returns ``self.data``.
        """
        if values_dict is None:
            values_dict = {}

        self.data[col_name] = self.data[col_name].map(values_dict)

        return self.data

    def split_data_single(self, target_cols=None):
        """Split ``self.data`` into feature and target frames.

        ``self.split1`` receives every column except ``target_cols`` and
        ``self.split2`` receives ``target_cols`` only. Both are returned.
        """
        if target_cols is None:
            target_cols = []

        self.split1 = self.data.drop(columns=target_cols)

        self.split2 = pd.DataFrame(self.data[target_cols])

        return self.split1, self.split2

    @staticmethod
    def _make_encoder():
        """Build a dense OneHotEncoder across scikit-learn versions."""
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know the ``sparse`` spelling.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def _one_hot(self, frame):
        """Return ``frame`` with its non-numeric columns one-hot encoded.

        Encoded columns keep positional integer labels (0..k-1) and are
        aligned on the index of ``frame`` so that no row is lost when the
        index is not a plain 0..n-1 range (e.g. after trimming outliers).
        """
        # NOTE(review): recent scikit-learn releases are believed to reject
        # frames whose column labels mix str and int - confirm before feeding
        # the result straight into an estimator.
        to_encode = frame.select_dtypes(exclude='number')
        if to_encode.shape[1] == 0:
            # Nothing categorical to encode.
            return frame

        ohe = self._make_encoder()
        encoded = pd.DataFrame(ohe.fit_transform(to_encode), index=frame.index)

        if frame.shape[1] > 1:
            numeric = frame.drop(columns=to_encode.columns)
            return pd.concat([numeric, encoded], axis=1)

        print(ohe.categories_)
        return encoded

    def encode(self, use_split1=False, use_split2=False, use_data=False):
        """One-hot encode the non-numeric columns of one stored frame.

        The first flag set, checked in the order ``use_data``, ``use_split1``,
        ``use_split2``, selects the frame. The encoded frame replaces the
        stored one and is returned. Returns None when no flag is set.
        """
        if use_data:
            self.data = self._one_hot(self.data)
            return self.data

        if use_split1:
            self.split1 = self._one_hot(self.split1)
            return self.split1

        if use_split2:
            self.split2 = self._one_hot(self.split2)
            return self.split2

        return None

    def scale_data(self, scaler=None,
                   use_data=False, use_split1=False, use_split2=False):
        """Scale one stored frame with ``scaler.fit_transform``.

        ``scaler`` must expose ``fit_transform``; a fresh ``RobustScaler`` is
        used when none is given. The first flag set, checked in the order
        ``use_data``, ``use_split1``, ``use_split2``, selects the frame. The
        scaler's output replaces the stored frame and is returned. Returns
        None when no flag is set.
        """
        if scaler is None:
            # Created per call rather than once at definition time.
            scaler = RobustScaler()

        if use_data:
            self.data = scaler.fit_transform(self.data)
            return self.data

        if use_split1:
            self.split1 = scaler.fit_transform(self.split1)
            return self.split1

        if use_split2:
            self.split2 = scaler.fit_transform(self.split2)
            return self.split2

        return None
            

Overwriting data.py


#### Module uni_plot.py code

In [45]:
%%writefile uni_plot.py
#%%writefile ../scripts/project_package/plot_package/uni_plot.py

import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import *
import os
import numpy as np 
import pandas as pd
from pandas.api.types import is_numeric_dtype

def plot_univariate (data, x=None, y=None, color='r',save=False,
                title='New Chart', chart_type='hist', xlabel='', ylabel='',
                    save_to=os.getcwd(), log_normalise=False):
    """
    Make a univariate plot of ``data``.

    Only ``chart_type='hist'`` (histogram with a density curve) is
    implemented; any other value raises ``ValueError`` before a figure is
    created.

    Parameters
    ----------
    data : array-like
        Values to plot.
    x, y : unused
        Accepted for signature compatibility.
    color : str
        Colour passed to the seaborn plotting call.
    save : bool
        When True, write the figure to ``<save_to>/<title>.png``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    save_to : str
        Directory for the saved image.
    log_normalise : bool
        When True, plot ``np.log(data)`` instead of ``data``.

    Returns
    -------
    The object returned by the seaborn plotting call.
    """
    if chart_type != 'hist':
        raise ValueError(
            f"Unsupported chart_type {chart_type!r}; only 'hist' is implemented")

    plt.subplots(figsize=(10,7))
    plt.title(title, fontsize=18)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    if log_normalise:
        data = np.log(data)

    if hasattr(sns, "histplot"):
        # histplot is seaborn's replacement for the deprecated distplot;
        # kde + density normalisation approximates distplot's default look.
        plot = sns.histplot(data, kde=True, stat="density", color=color)
    else:
        # Older seaborn releases only provide distplot.
        plot = sns.distplot(a=data, color=color)

    if save:
        plt.savefig(fname=os.path.join(save_to, f'{title}.png'), format='png')

    return plot

Overwriting uni_plot.py


#### Module bi_plot.py code

In [48]:
%%writefile bi_plot.py
#%%writefile ../scripts/project_package/plot_package/bi_plot.py

import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import *
import os
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype


def plot_bivariate(data, x=None, y=None, hue=None,
                  color='r',save=False,
                title='New Chart', chart_type='hist',
                   xlabel='', ylabel='',
                    save_to=os.getcwd(), img_name = " ",
                   palette={'use':False, "size":1}, log_normalise=False,
                  kind_joint_plot = 'scatter', kind_pair_plot="scatter", figsize=(10,7)):
    """
    Make a bivariate plot of one of the implemented types:

    1. joint - jointplot of ``x`` against ``y``

    2. pair - pairplot of ``data``

    3. corr - annotated heatmap of the correlations of the numeric columns

    Any other ``chart_type`` (including the default ``'hist'``) raises
    ``ValueError`` before anything is drawn.

    When calling joint_plot:

        kind_joint_plot is default to `scatter`
        other types include "reg", "resid", "kde", "hex"

    When calling pair_plot:

        kind_pair_plot is default to `scatter`
        other types include 'reg'

    Other parameters
    ----------------
    hue : column name used to colour the pairplot.
    color : unused; accepted for signature compatibility.
    save, save_to, img_name, title : when ``save`` is True the figure is
        written to ``<save_to>/<img_name>.png``, falling back to ``title`` as
        the file name when ``img_name`` is left at its default.
    palette : dict with keys ``use`` (bool) and ``size`` (number of colours);
        only consulted for pairplots.
    log_normalise : when True, the ``y`` column (or, if ``y`` is None, every
        numeric column of ``data``) is replaced by its natural log on a copy
        before plotting.
    xlabel, ylabel, figsize : applied to the figure of the ``corr`` chart.

    Returns the object produced by the seaborn plotting call.
    """
    supported = ("joint", "pair", "corr")
    if chart_type not in supported:
        raise ValueError(
            f"Unsupported chart_type {chart_type!r}; expected one of {supported}")

    # define helper functions

    def plt_tweaks():
        plt.subplots(figsize= figsize)
        plt.title(title, fontsize=18)
        plt.xlabel(xlabel, fontsize=15)
        plt.ylabel(ylabel, fontsize=15)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

    def log_norm(frame, column):
        # Returns new objects instead of rebinding closure names, which is
        # not possible from inside a nested function.
        if not log_normalise:
            return frame, column
        if column is None:
            if isinstance(frame, pd.DataFrame):
                frame = frame.copy()
                for name in frame.select_dtypes(include='number').columns:
                    frame[name] = np.log(frame[name])
                return frame, column
            return np.log(frame), column
        if isinstance(column, str):
            # ``column`` names a column of ``frame``.
            frame = frame.copy()
            frame[column] = np.log(frame[column])
            return frame, column
        # ``column`` holds the values themselves.
        return frame, np.log(column)

    def save_image():
        if save:
            file_stem = img_name if img_name != " " else title
            plt.savefig(fname=os.path.join(save_to, file_stem + '.png'), format='png')

    # make plots

    if chart_type == "joint":
        data, y = log_norm(data, y)
        plot = sns.jointplot(x=x, y=y, data=data,
                            height=6, ratio=5, space=0.2, kind=kind_joint_plot)
        save_image()

    elif chart_type == "pair":
        data, y = log_norm(data, y)
        if palette['use']:
            palette_to_use = sns.color_palette(n_colors=palette['size'])
            plot = sns.pairplot(data, palette=palette_to_use,
                            kind= kind_pair_plot,height=3, aspect=1, hue=hue)
        else:
            plot = sns.pairplot(data,
                            kind= kind_pair_plot,height=2.5, aspect=1, hue=hue)
        save_image()

    else:  # chart_type == "corr"
        plt_tweaks()
        # Correlation is only defined for numeric columns.
        corr_data = data.select_dtypes(include='number').corr()
        plot = sns.heatmap(corr_data,annot=True, fmt='.2g', center=0)
        save_image()

    return plot

Overwriting bi_plot.py


#### Module model.py code

In [49]:
%%writefile model.py
#%%writefile ../scripts/project_package/model_package/model.py


import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
from sklearn.linear_model import *
from imblearn.over_sampling import *
from imblearn.pipeline import *

from imblearn.metrics import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.decomposition import *
from sklearn.base import *
from sklearn.model_selection import * 

def plot_pca_components(data):
    """Plot the cumulative explained-variance ratio of a PCA fitted on ``data``."""
    fitted_pca = PCA()
    fitted_pca.fit(data)
    cumulative_ratio = np.cumsum(fitted_pca.explained_variance_ratio_)
    plt.plot(cumulative_ratio)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    
def check_imbalance(data,label='', x=0.7, y=30000):
    """Draw a bar chart of the class counts of ``data[label]``.

    The counts are also written onto the figure as text at position
    (``x``, ``y``) in data coordinates.
    """
    plt.subplots(figsize=(10,8))
    class_counts = data[label].value_counts()
    class_counts.plot(kind='bar')
    annotation = f'Class Imbalance Count:\n\n{class_counts.to_dict()}'
    plt.text(x=x, y=y, s=annotation, fontsize=15)
    
def encode (data):
    """Return a copy of ``data`` with its non-numeric columns one-hot encoded.

    The caller's frame is left untouched. Encoded columns keep positional
    integer labels (0..k-1) and are aligned on the index of ``data`` so that
    no row is lost when the index is not a plain 0..n-1 range. A frame
    without non-numeric columns is returned as an unchanged copy.
    """
    to_encode = data.select_dtypes(exclude='number')
    if to_encode.shape[1] == 0:
        # Nothing categorical to encode.
        return data.copy()

    try:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn releases only know the ``sparse`` spelling.
        ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

    features_cat_encode = pd.DataFrame(ohe.fit_transform(to_encode), index=data.index)

    if data.shape[1] > 1:
        numeric = data.drop(columns=to_encode.columns)
        return pd.concat([numeric, features_cat_encode], axis=1)

    print(ohe.categories_)
    return features_cat_encode


def x_y_split(data, x=None, y=None, type_="single", test_size=.10):

    """
    Split ``data`` into features and target.

    Single type divides into just x and y
    Double type divides into train and test for each of x and y

    Parameters
    ----------
    data : DataFrame holding both features and target.
    x : unused; accepted for signature compatibility.
    y : label (or list of labels) of the target column(s).
    type_ : "single" or "double".
    test_size : share of rows held out when ``type_`` is "double".

    Returns
    -------
    ``(X, y)`` for "single"; ``(X_train, X_test, y_train, y_test)`` for
    "double".

    Raises
    ------
    ValueError
        If ``y`` is missing or ``type_`` is not "single" or "double".
    """
    if y is None:
        raise ValueError("y (the target column) must be provided")
    if type_ not in ("single", "double"):
        raise ValueError(
            f"Unknown type_ {type_!r}; expected 'single' or 'double'")

    X, y = data.drop(columns=y), data[y]

    if type_ == "single":
        return X, y

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                           test_size=test_size, random_state=123)

    return X_train, X_test, y_train, y_test
    
    
def gridSearch(model, hyper_params=None, cv=None, x_train=None, y_train=None):

    """
    Performs a grid search for the best hyperparameters of the passed model.

    Parameters
    ----------
    model : estimator to tune.
    hyper_params : dict, optional
        Parameter grid handed to ``GridSearchCV``; an empty grid is used when
        omitted.
    cv : cross-validation splitter, optional
        A fresh ``StratifiedKFold()`` is used when omitted.
    x_train, y_train : training data passed to ``fit``.

    Prints the best cross-validation score and parameters and returns the
    fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``x_train`` is not provided.
    """
    if x_train is None:
        raise ValueError("x_train is required to run the grid search")
    if hyper_params is None:
        hyper_params = {}
    if cv is None:
        # Created per call rather than once at definition time.
        cv = StratifiedKFold()

    # GridSearchCV names its first argument ``estimator``.
    search = GridSearchCV(estimator=model, param_grid=hyper_params, n_jobs=-1, cv=cv)
    search.fit(X=x_train, y=y_train)
    print("Best parameter (CV score=%0.3f):\n" % search.best_score_)
    print(search.best_params_)
    return search


def plot_grid_search(search_obj, pca_obj, X_train):

    """
    Prints the best (optimised) hyperparameters of the grid search object
    and plots the optimised pca components.

    Parameters
    ----------
    search_obj : fitted grid-search object whose best estimator has a
        ``'pca'`` named step and whose grid included ``pca__n_components``.
    pca_obj : PCA instance; it is fitted on ``X_train`` here to obtain the
        explained-variance spectrum.
    X_train : training features used to fit ``pca_obj``.
    """

    print("Best parameter (CV score=%0.3f):\n" % search_obj.best_score_)
    print("Best Params:", search_obj.best_params_)
    pca_obj.fit(X_train)

    fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8, 8))
    ax0.plot(np.arange(1, pca_obj.n_components_ + 1),
             pca_obj.explained_variance_ratio_, '+', linewidth=2)
    ax0.set_ylabel('PCA explained variance ratio')

    ax0.axvline(search_obj.best_estimator_.named_steps['pca'].n_components,
                linestyle=':', label='n_components chosen')
    ax0.legend(prop=dict(size=12))

    # For each number of components, find the best classifier results
    results = pd.DataFrame(search_obj.cv_results_)
    components_col = 'param_pca__n_components'
    best_clfs = results.groupby(components_col).apply(
        lambda g: g.nlargest(1, 'mean_test_score'))

    best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score',
                   legend=False, ax=ax1)
    ax1.set_ylabel('Classification accuracy (val)')
    ax1.set_xlabel('n_components')

    plt.xlim(-1, 70)

    plt.tight_layout()
    plt.show()
    
    
class Preprocessor(BaseEstimator, TransformerMixin):
    """Prepare a DataFrame for modelling.

    The working frame is stored on ``self.data``. ``split_data_single`` adds
    ``self.features`` / ``self.target`` and ``split_data_double`` adds the
    train/test splits. Most methods update these attributes and also return
    the updated object.
    """

    # Quantile bounds shared by the ``quant_floor`` and ``trim`` treatments.
    _LOW_QUANTILE = 0.05
    _HIGH_QUANTILE = 0.95

    def __repr__(self):
        return "Used to prepare data for modelling"

    def __init__(self):
        pass

    def fit(self, data, y=None):
        """Store ``data`` as the working frame and return ``self``.

        Raises ``TypeError`` when ``data`` is not a pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError("data must be of type pandas.DataFrame")

        self.data = data

        print("Fitted")

        return self

    def _numeric_columns(self):
        """Return the names of the numeric columns of ``self.data``."""
        return self.data.select_dtypes(include='number').columns.tolist()

    def check_outliers(self, show_plot=False, save_img=os.getcwd()+'/outliers.png'):
        """Find numeric columns containing outliers using the 1.5 * IQR rule.

        The names of the affected columns are stored in ``self.outliers``.

        Parameters
        ----------
        show_plot : bool
            When True, draw a pairplot of the outlier columns, save it to
            ``save_img`` and return the plot object.
        save_img : str
            Path of the PNG written when ``show_plot`` is True.

        Returns
        -------
        The pairplot when ``show_plot`` is True; otherwise the rows that hold
        at least one outlier, restricted to the outlier columns.
        """
        # Quantiles are only defined for numeric columns, so restrict first.
        num_data = self.data.select_dtypes(include='number')
        q1 = num_data.quantile(0.25)
        q3 = num_data.quantile(0.75)
        iqr = q3 - q1
        is_outlier = (num_data < (q1 - 1.5 * iqr)) | (num_data > (q3 + 1.5 * iqr))

        result = dict(is_outlier.any())
        self.outliers = [col for col, flagged in result.items() if flagged]

        if show_plot:
            # Imported here because this module does not import seaborn.
            import seaborn as sns

            self.outlier_pair_plot = sns.pairplot(self.data[self.outliers])
            print(f'{result},\n\n Visualization of outlier columns')
            plt.savefig(fname=save_img, format='png')
            return self.outlier_pair_plot

        # Keep only rows where some numeric column is outside its IQR fence.
        rows_with_outliers = is_outlier.any(axis=1)
        return self.data.loc[rows_with_outliers, self.outliers]

    def treat_outliers(self, type_='median_replace'):
        """Treat outliers in the numeric columns of ``self.data``.

        Supported values of ``type_``:

            1. median_replace - replace values outside the 1.5 * IQR fences
               with the column median (default)

            2. quant_floor - clamp values to the 5th / 95th percentiles

            3. trim - drop rows with any numeric value outside the
               5th / 95th percentiles

            4. log_transform - natural log of positive values, 0 otherwise

            5. isf - drop rows flagged by an IsolationForest

        Returns the treated DataFrame (also stored on ``self.data``).

        Raises
        ------
        ValueError
            If ``type_`` is not one of the names above.
        """
        if type_ == "median_replace":
            for col in self._numeric_columns():
                values = self.data[col]
                median = values.quantile(0.50)
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                iqr = q3 - q1
                # Fences stay floating point: truncating them to int distorts
                # the bounds for small-scale columns.
                high = q3 + 1.5 * iqr
                low = q1 - 1.5 * iqr
                is_outlier = (values > high) | (values < low)
                self.data[col] = np.where(is_outlier, median, values)

        elif type_ == "quant_floor":
            for col in self._numeric_columns():
                values = self.data[col]
                floor = values.quantile(self._LOW_QUANTILE)
                cap = values.quantile(self._HIGH_QUANTILE)
                floored = np.where(values < floor, floor, values)
                self.data[col] = np.where(floored > cap, cap, floored)

        elif type_ == "trim":
            low = self._LOW_QUANTILE
            high = self._HIGH_QUANTILE
            numeric = self.data[self._numeric_columns()]
            if numeric.shape[1] > 0:
                # Bounds are computed once on the untrimmed data so every
                # column is judged against the same reference distribution.
                bounds = numeric.quantile([low, high])
                within = ((numeric >= bounds.loc[low])
                          & (numeric <= bounds.loc[high])).all(axis=1)
                self.data = self.data[within].copy()

        elif type_ == "log_transform":
            for col in self._numeric_columns():
                values = self.data[col].to_numpy(dtype=float)
                positive = values > 0
                # Non-positive (and missing) entries map to 0.
                logged = np.zeros(values.shape, dtype=float)
                logged[positive] = np.log(values[positive])
                self.data[col] = logged

        elif type_ == "isf":
            # Imported here because this module does not import it at the top.
            from sklearn.ensemble import IsolationForest

            iso = IsolationForest(contamination=0.1)
            yhat = iso.fit_predict(self.data.select_dtypes(exclude='object'))
            # fit_predict labels outliers with -1; keep everything else.
            self.data = self.data[yhat != -1].copy()

        else:
            raise ValueError(
                f"Unknown outlier treatment {type_!r}; expected one of "
                "'median_replace', 'quant_floor', 'trim', 'log_transform', 'isf'")

        return self.data

    def map_col_values(self, col_name="", values_dict=None):
        """Replace the values of ``col_name`` using ``values_dict``.

        Keys of ``values_dict`` are the current values and the dict values are
        their replacements (``Series.map`` semantics). Returns ``self.data``.
        """
        if values_dict is None:
            values_dict = {}

        self.data[col_name] = self.data[col_name].map(values_dict)

        return self.data

    def split_data_single(self, target_cols=None):
        """Split ``self.data`` into ``self.features`` and ``self.target``.

        ``self.features`` receives every column except ``target_cols`` and
        ``self.target`` receives ``target_cols`` only. Both are returned.
        """
        if target_cols is None:
            target_cols = []

        self.features = self.data.drop(columns=target_cols)

        self.target = pd.DataFrame(self.data[target_cols])

        return self.features, self.target

    @staticmethod
    def _make_encoder():
        """Build a dense OneHotEncoder across scikit-learn versions."""
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn releases only know the ``sparse`` spelling.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def _one_hot(self, frame):
        """Return ``frame`` with its non-numeric columns one-hot encoded.

        Encoded columns keep positional integer labels (0..k-1) and are
        aligned on the index of ``frame`` so that no row is lost when the
        index is not a plain 0..n-1 range (e.g. after dropping outliers).
        The input frame itself is not modified.
        """
        # NOTE(review): recent scikit-learn releases are believed to reject
        # frames whose column labels mix str and int - confirm before feeding
        # the result straight into an estimator.
        to_encode = frame.select_dtypes(exclude='number')
        if to_encode.shape[1] == 0:
            # Nothing categorical to encode.
            return frame

        ohe = self._make_encoder()
        encoded = pd.DataFrame(ohe.fit_transform(to_encode), index=frame.index)

        if frame.shape[1] > 1:
            numeric = frame.drop(columns=to_encode.columns)
            return pd.concat([numeric, encoded], axis=1)

        print(ohe.categories_)
        return encoded

    def encode(self, data_obj=None, use_features=True, use_target=False):
        """One-hot encode the non-numeric columns of one frame.

        Selection order:

        1. ``data_obj`` given - encode it, store the result on
           ``self.data_obj`` and return it.
        2. ``use_features`` - encode ``self.features``.
        3. ``use_target`` - encode ``self.target``.
        4. neither flag set - encode ``self.data``.

        The encoded frame replaces the stored one and is returned.
        """
        if data_obj is not None:
            print("Not None")
            self.data_obj = self._one_hot(data_obj)
            return self.data_obj

        if use_features:
            self.features = self._one_hot(self.features)
            return self.features

        if use_target:
            self.target = self._one_hot(self.target)
            return self.target

        self.data = self._one_hot(self.data)
        return self.data

    def split_data_double(self, features_=None, target_=None,
                          test_size=.10, use_native=True):
        """Split features and target into train and test partitions.

        Parameters
        ----------
        features_, target_ : DataFrame, optional
            Frames to split when ``use_native`` is False.
        test_size : float
            Share of rows held out for testing.
        use_native : bool
            When True (default) split ``self.features`` / ``self.target``.

        Returns
        -------
        ``(X_train, X_test, y_train, y_test)``, also stored on the object.

        Raises
        ------
        ValueError
            If ``use_native`` is False and the frames are missing or have a
            different number of rows.
        """
        if use_native:
            # Kept from the original flow: cut the target to the feature row
            # count (a no-op when both already have the same length).
            self.target = self.target.iloc[0:self.features.shape[0]]
            features, target = self.features, self.target
        else:
            if features_ is None or target_ is None:
                raise ValueError(
                    "features_ and target_ are required when use_native is False")

            if features_.shape[0] != target_.shape[0]:
                raise ValueError("Wrong, you are trying to pass unequal shapes\n\
                Shapes of dataframes must be equal\n\
                Try target = target.iloc[0:features.shape[0]]")

            self.features_ = features_
            self.target_ = target_
            features, target = self.features_, self.target_

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            features, target, test_size=test_size, random_state=24)

        return self.X_train, self.X_test, self.y_train, self.y_test

    def scale_data(self, scale_data=None,
                   scaler=None, use_features=True,
                   use_target=False, use_data=False):
        """Scale one frame with ``scaler.fit_transform``.

        ``scaler`` must expose ``fit_transform``; a fresh ``RobustScaler`` is
        used when none is given. The first flag set, checked in the order
        ``use_features``, ``use_target``, ``use_data``, selects the stored
        frame to scale and replace. With every flag False the frame passed as
        ``scale_data`` is scaled instead and kept on ``self.scaled_data``.

        Raises
        ------
        ValueError
            If every flag is False and ``scale_data`` was not provided.
        """
        if scaler is None:
            # Created per call rather than once at definition time.
            scaler = RobustScaler()

        if use_features:
            self.features = scaler.fit_transform(self.features)
            return self.features

        if use_target:
            self.target = scaler.fit_transform(self.target)
            return self.target

        if use_data:
            self.data = scaler.fit_transform(self.data)
            return self.data

        if scale_data is None:
            raise ValueError(
                "scale_data must be provided when no use_* flag is set")

        # Stored under a separate name so the result does not replace this
        # method on the instance.
        self.scaled_data = scaler.fit_transform(scale_data)
        return self.scaled_data

    def transform(self, X):
        """Drop IsolationForest outliers from ``X``, encode and robust-scale it.

        Ideally, a prepared trainX data ought to be passed in case of passing
        into a pipeline.

        Returns the scaled feature array. The encoded (unscaled) frame is kept
        on ``self.features``.
        """
        # NOTE(review): the "isf" step removes rows, so the output can have
        # fewer samples than X - confirm this is acceptable wherever a target
        # vector must stay aligned with the features.
        self.data = X

        self.data = self.treat_outliers(type_="isf")

        self.features = self.encode(self.data)

        scaler = RobustScaler()

        return scaler.fit_transform(self.features)

    def fit_transform(self, X, y=None):
        """Run ``transform`` on ``X`` directly; ``y`` is ignored."""
        self.X = X

        return self.transform(self.X)

Overwriting model.py


#### Module model_metrics.py

In [50]:
#%%writefile model_metrics.py
#%%writefile ../scripts/project_package/model_package/model_metrics.py

from imblearn.metrics import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.decomposition import *
from sklearn.base import *
from sklearn.ensemble import RandomForestClassifier

import pandas as pd

class metrics ():
    """Evaluation helpers for a binary classifier's test-set predictions.

    The instance stores the train/test splits and the predictions ``y_hat``.
    The reporting methods compare ``y_test`` with ``y_hat``.
    """

    def __init__(self, X_train, y_train, X_test, y_test, y_hat=None):

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.y_hat =  y_hat

    def class_report(self):
        """Print the classification report for the stored predictions."""
        full_report = classification_report(self.y_test, self.y_hat)

        print(full_report)

    def conf_matrix(self):
        """Return the binary confusion matrix as a labelled DataFrame.

        Rows are predictions and columns are actual classes, positive class
        first. The positive class is taken to be the label that sorts last.
        """
        # confusion_matrix puts actual classes on rows and predictions on
        # columns, negative class first: [[TN, FP], [FN, TP]].
        tn, fp, fn, tp = confusion_matrix(self.y_test, self.y_hat).ravel()

        conf_matrix_df = pd.DataFrame([[tp, fp], [fn, tn]],
                                      columns=['Actual_+ve', 'Actual_-ve'],
                                      index=['predicted_+ve', 'predicted_-ve'])

        return conf_matrix_df

    def accuracy_score(self):
        """Return the accuracy of the stored predictions."""
        # Inside the method body the bare name resolves to the module-level
        # metric function, not to this method.
        return  accuracy_score(self.y_test, self.y_hat)

    def classification_error(self):
        """Return the misclassification rate (1 - accuracy)."""
        return 1 - self.accuracy_score()

    def specif_sensitiv(self):

        """
        Sensitivity: When the actual value is positive, how often is the prediction correct?

        Specificity: When the actual value is negative, how often is the prediction correct?

        Returns a one-row DataFrame with both values.
        """

        conf_matrix = confusion_matrix(self.y_test, self.y_hat)

        TP = conf_matrix[1, 1]
        TN = conf_matrix[0, 0]
        FP = conf_matrix[0, 1]
        FN = conf_matrix[1, 0]

        sensitivity = TP / float(FN + TP)
        specificity = TN / float(TN + FP)

        sensitiv_specific_table = pd.DataFrame([[sensitivity, specificity]],
                                               columns=['sensitivity', 'specificity'])

        return sensitiv_specific_table

    @staticmethod
    def evaluate_classifier(clf, df_scores, X_train, y_train, X_test, y_test, clf_name=None):

        """
        Returns a dataframe of unbalanced and balanced accuracy scores of the estimators used.
        When run for the first time, pass an empty dataframe as df_scores;
        when running on more estimators, pass the previous df_scores dataframe for a
        single table of evaluation scores.

        Declared static so it can be called on the class or on an instance
        with the same arguments.

        Example given below:

                        LogisticRegressionCV
                Accuracy 	        0.908
                Balanced accuracy 	0.842

        """
        if clf_name is None:
            from imblearn.pipeline import Pipeline as ImbPipe
            from sklearn.pipeline import Pipeline as Pipe
            if isinstance(clf, (ImbPipe, Pipe)):
                # Name a pipeline after its final step.
                clf_name = clf[-1].__class__.__name__
            else:
                clf_name = clf.__class__.__name__
        acc = clf.fit(X_train, y_train).score(X_test, y_test)
        y_pred = clf.predict(X_test)
        bal_acc = balanced_accuracy_score(y_test, y_pred)
        clf_score = pd.DataFrame(
            {clf_name: [acc, bal_acc]},
            index=['Accuracy', 'Balanced accuracy']
        )
        df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3)
        return df_scores

    @staticmethod
    def plot_roc():
        """Placeholder; not implemented yet."""
        pass

Overwriting model_metrics.py


In [21]:
#%%writefile main.py
#%%writefile ../scripts/project_package/main.py


import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns 
from pandas.api.types import is_numeric_dtype 

from plot import plot_univariate, plot_bivariate
from data import WrangleData
from model import check_imbalance, plot_pca_components, encode, x_y_split

from sklearn.preprocessing import StandardScaler,RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE, _random_over_sampler
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif, from_model, SelectKBest,chi2, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest # outlier detection and re,oval
from collections import Counter

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import xgboost
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import xgboost
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from imblearn.pipeline import Pipeline as ImbPipe
import joblib


# Script entry-point guard; the body currently does nothing.
if __name__ == "__main__":
    pass



#### Module pipeline.py code

In [35]:
%%writefile pipeline.py
#%%writefile ../scripts/project_package/model_package/pipeline.py 
from sklearn.pipeline import FeatureUnion, Pipeline  
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline, make_pipeline

Overwriting pipeline.py
