In [10]:
%run utils.ipynb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report 


from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
class Models:
    """Prepare a labelled DataFrame and fit/evaluate a few sklearn classifiers.

    The output label MUST be the first column of the DataFrame; every other
    column is treated as a feature.

    On construction the object label-encodes all ``object``-dtype columns
    and performs a stratified train/test split.

    Attributes set by the constructor and its helpers:
        df: the DataFrame passed in (encoded IN PLACE, not copied).
        test_size: fraction of rows held out for testing.
        random_state: seed used for class balancing and the split.
        feature_encoded: ``{column: {original_value: encoded_int}}``.
        labelEncoder: the ``LabelEncoder`` used for the encoding.
        X_train, X_test, y_train, y_test: result of the split.
    """

    def __init__(self, df: pd.DataFrame, test_size: float = 0.2, balance: bool = False):
        """Encode non-numeric columns and split the data.

        Output label **MUST ALWAYS** be first column of dataframe.

        Args:
            df: input data; its ``object`` columns are overwritten in place
                with integer codes.
            test_size: fraction of rows used for the test split.
            balance: when true, down-sample every class to the size of the
                smallest class before splitting.
        """
        self.df = df
        self.test_size = test_size
        self.random_state = 111

        object_columns = [col for col in self.df.columns if self.df[col].dtype == 'object']
        self.encode_to_labels(object_columns)

        if balance:
            self.split_stratify(self.df.columns[0])
        else:
            self.split_stratify()

    def split_stratify(self, balance_by=None):
        """Train/test split stratified by the output label (first column).

        Args:
            balance_by: optional column name. When given, every group of
                that column is down-sampled to the size of the smallest
                group before splitting, so all classes have equally many
                samples.

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
        """
        # ``is not None`` rather than truthiness: a column may legitimately
        # be named 0 (default name of the first column of an unnamed frame).
        if balance_by is not None:
            groups = self.df.groupby(balance_by)
            per_class = int(groups.size().min())
            # Iterate the groups instead of ``groupby.apply`` so the grouping
            # column is always kept (the label must stay in column 0), and
            # seed the sampling so the balanced split is reproducible.
            sampled = [
                group.sample(n=per_class, random_state=self.random_state)
                for _, group in groups
            ]
            data = pd.concat(sampled).reset_index(drop=True)
            banner = "** Balanced stratified per class train_test split"
        else:
            data = self.df
            banner = "** Unbalanced stratified per class train_test split"

        target = data.iloc[:, 0]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            data.iloc[:, 1:],
            target,
            test_size=self.test_size,
            stratify=target,
            random_state=self.random_state,
        )
        print(banner)
        print(self.X_train.shape, self.X_test.shape, self.y_train.shape, self.y_test.shape)

    def encode_to_labels(self, feature_list):
        """Label-encode, in place, every column of ``self.df`` named in ``feature_list``.

        Nothing is returned. The mapping ``{original_value: encoded_int}``
        of each encoded column is stored in ``self.feature_encoded`` and the
        encoder in ``self.labelEncoder`` (after the loop it is fitted on the
        last column of ``feature_list``).
        """
        # Local import: the module-level imports visible in this file do not
        # bring ``preprocessing`` into scope.
        from sklearn.preprocessing import LabelEncoder

        self.feature_encoded = {}
        self.labelEncoder = LabelEncoder()
        for feature in feature_list:
            self.df[feature] = self.labelEncoder.fit_transform(self.df[feature])
            self.feature_encoded[feature] = {
                original: code
                for code, original in enumerate(self.labelEncoder.classes_)
            }
        print("** Non-numeric columns encoded")

    def accuracy_score(self, y_pred):
        """Return the accuracy of ``y_pred`` against ``self.y_test``, rounded to 2 decimals."""
        # Built-in round() works for both Python floats and NumPy scalars,
        # whereas ``.round`` only exists on the latter.
        return round(float(metrics.accuracy_score(self.y_test, y_pred)), 2)

    def confusion_matrix(self, y_pred):
        """Return the confusion matrix of ``y_pred`` against ``self.y_test``."""
        return metrics.confusion_matrix(self.y_test, y_pred)

    def NB(self):
        """Fit Gaussian Naive Bayes on the train split.

        Returns:
            ``(fitted_model, predictions_for_X_test)``.
        """
        gnb = GaussianNB()
        gnb.fit(self.X_train, self.y_train)
        return gnb, gnb.predict(self.X_test)

    def LR(self):
        """Fit logistic regression (default settings) on the train split.

        Returns:
            ``(fitted_model, predictions_for_X_test)``.
        """
        log_reg = LogisticRegression()
        log_reg.fit(self.X_train, self.y_train)
        return log_reg, log_reg.predict(self.X_test)

    def RF(self):
        """Fit a 100-tree random forest on the train split.

        Prints the average node count and depth of the trees and the
        feature importances in descending order.

        Returns:
            ``(fitted_model, predictions_for_X_test)``.
        """
        RSEED = 50
        rf = RandomForestClassifier(n_estimators=100,
                                    random_state=RSEED,
                                    max_features='sqrt',
                                    n_jobs=-1, verbose=1)

        rf.fit(self.X_train, self.y_train)
        y_pred = rf.predict(self.X_test)

        # Stats about the trees in random forest
        n_nodes = [tree.tree_.node_count for tree in rf.estimators_]
        max_depths = [tree.tree_.max_depth for tree in rf.estimators_]

        # Integer division gives the truncated mean without needing numpy,
        # which the visible module-level imports do not provide.
        print(f'Average number of nodes {sum(n_nodes) // len(n_nodes)}')
        print(f'Average maximum depth {sum(max_depths) // len(max_depths)}')

        feature_imp = pd.Series(rf.feature_importances_, index=self.X_train.columns).sort_values(ascending=False)
        print(feature_imp)
        return rf, y_pred

    def class_score(self, y_pred, model_name):
        """Get truth score for each class prediction.

        For every class present in ``self.y_test`` one row is produced with:
        the class value (``user_id``), its original label (``name``),
        ``model_name``, the number of test samples of that class
        (``test_count``), how many of them were predicted correctly
        (``predicted``) and the resulting percentage (``truth``).

        Args:
            y_pred: predictions aligned positionally with ``self.y_test``.
            model_name: free-form name written into every row.

        Returns:
            A DataFrame with the columns listed above, one row per class,
            ordered by class value.

        Raises:
            ValueError: if ``y_pred`` and ``self.y_test`` differ in length.
        """
        columns = ['user_id', 'name', 'model_name', 'test_count', 'predicted', 'truth']

        actual = self.y_test.to_numpy()
        guessed = pd.Series(y_pred).to_numpy()
        if len(actual) != len(guessed):
            raise ValueError(
                f"y_pred has {len(guessed)} entries but y_test has {len(actual)}"
            )

        # The label is the first column by contract, so decode with that
        # column's mapping; fall back to the raw value when it was never
        # encoded (e.g. an already-numeric label column).
        label_column = self.df.columns[0]
        decode = {code: original
                  for original, code in self.feature_encoded.get(label_column, {}).items()}

        rows = []
        # Count per class directly instead of indexing confusion-matrix rows
        # by position: the matrix also contains rows for classes that only
        # occur in y_pred, which would shift the row <-> class association.
        for cls in sorted(self.y_test.unique()):
            cls = cls.item() if hasattr(cls, 'item') else cls
            of_class = actual == cls
            test_count = int(of_class.sum())
            predicted = int((guessed[of_class] == cls).sum())
            truth = 100 * round(predicted / test_count, 2) if test_count else float('nan')
            rows.append([cls, decode.get(cls, cls), model_name, test_count, predicted, truth])

        return pd.DataFrame(rows, columns=columns)

    def classification_report(self, y_pred):
        """Return sklearn's text classification report for ``y_pred`` vs ``self.y_test``."""
        return metrics.classification_report(self.y_test, y_pred)
    