# In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, auc
def create_bins(bins, y):
    """Relabel the values of ``y`` in place as ordinal class indices.

    Supported layouts (half-open intervals, lower bound inclusive):

    * ``bins == 2``: ``< 4`` -> 0, ``>= 4`` -> 1
    * ``bins == 3``: ``< 4`` -> 0, ``[4, 8)`` -> 1, ``>= 8`` -> 2
    * ``bins == 5``: ``< 2`` -> 0, ``[2, 4)`` -> 1, ``[4, 6)`` -> 2,
      ``[6, 8)`` -> 3, ``>= 8`` -> 4

    Any other ``bins`` value leaves ``y`` untouched.

    Args:
        bins: Number of classes to produce.
        y: Object supporting element-wise comparison and boolean-mask
            assignment (e.g. a numpy array or pandas Series). It is
            modified in place.

    Returns:
        The same ``y`` object that was passed in.
    """
    if bins == 2:
        cut_points = (4,)
    elif bins == 3:
        cut_points = (4, 8)
    elif bins == 5:
        cut_points = (2, 4, 6, 8)
    else:
        # Unsupported bin counts are passed through unchanged.
        return y

    # Build every mask from the untouched values before writing any label,
    # so a freshly written label can never be mistaken for a raw value.
    masks = [y < cut_points[0]]
    for lower, upper in zip(cut_points, cut_points[1:]):
        masks.append((y >= lower) & (y < upper))
    masks.append(y >= cut_points[-1])

    for label, mask in enumerate(masks):
        y[mask] = label
    return y

def input_label_split(train_set, test_set):
    """Split a train and a test DataFrame into inputs and labels.

    The last column of each frame is returned as its label Series. The
    input columns are the leading columns of each frame; how many are
    taken is derived from the *training* frame (all but one of its
    columns) and reused for the test frame.

    Args:
        train_set: Training DataFrame.
        test_set: Test DataFrame.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    # Width of the input part, measured on the training frame only.
    feature_stop = train_set.shape[1] - 1

    train_inputs = train_set.iloc[:, :feature_stop]
    train_labels = train_set.iloc[:, -1]
    test_inputs = test_set.iloc[:, :feature_stop]
    test_labels = test_set.iloc[:, -1]
    return train_inputs, train_labels, test_inputs, test_labels

# In [4]:
def remove_negatives_from_column(data, column_name):
    """Return the rows of ``data`` whose ``column_name`` value is >= 0."""
    is_non_negative = data[column_name] >= 0
    return data[is_non_negative]

def remove_null_from_column(data, column_name):
    """Return the rows of ``data`` whose ``column_name`` value is not null."""
    has_value = data[column_name].notna()
    return data[has_value]

def add_pay_per_hr_column(data, column_name):
    """Add ``column_name`` to ``data`` as ``pay / (work_time / 3600)``.

    The frame is modified in place and also returned. It must contain
    ``'pay'`` and ``'work_time'`` columns.
    """
    # Dividing by 3600 suggests work_time is in seconds -- TODO confirm units.
    scaled_work_time = data['work_time'] / 3600
    data[column_name] = data['pay'] / scaled_work_time
    return data

def filter_data_by_column_names(data, columns_to_keep):
    """Return the columns of ``data`` named in ``columns_to_keep``.

    Columns appear in the order given by ``columns_to_keep``. Each name is
    resolved to its position with ``data.columns.get_loc``, so an unknown
    name raises ``KeyError``.
    """
    positions = [data.columns.get_loc(name) for name in columns_to_keep]
    return data.iloc[:, positions]

def filter_data_by_column(data, column_name, value, logic):
    """Return the rows of ``data`` whose ``column_name`` compares to ``value``.

    Args:
        data: DataFrame to filter.
        column_name: Column whose values are compared.
        value: Right-hand side of the comparison.
        logic: One of ``'less_than'``, ``'less_or_equal'``,
            ``'greater_than'``, ``'greater_equal'`` (also accepted as
            ``'greater_or_equal'``) or ``'equal'``.

    Returns:
        The rows of ``data`` for which the comparison holds.

    Raises:
        ValueError: If ``logic`` is not one of the supported keywords.
            Previously this case silently returned ``None``.
    """
    comparisons = {
        'less_than': lambda column: column < value,
        'less_or_equal': lambda column: column <= value,
        'greater_than': lambda column: column > value,
        'greater_equal': lambda column: column >= value,
        # Alias so the spelling mirrors 'less_or_equal'.
        'greater_or_equal': lambda column: column >= value,
        'equal': lambda column: column == value,
    }
    if logic not in comparisons:
        raise ValueError(
            "Unknown logic %r; expected one of: %s"
            % (logic, ', '.join(sorted(comparisons)))
        )
    keep = comparisons[logic](data[column_name])
    return data[keep]

def log_of_column(data, column_name):
    """Return a copy of ``data`` with ``column_name`` replaced by its log10.

    The input frame is left unmodified.
    """
    transformed = data.copy()
    log_values = np.log10(transformed[column_name])
    transformed[column_name] = log_values
    return transformed

def floor_column_values(data, column_name):
    """Floor the values of ``column_name`` and store them as integers.

    The frame is modified in place and also returned.

    Two defects of the earlier version are fixed here: it always operated
    on the ``'work_time'`` column regardless of ``column_name``, and it
    discarded the result of ``np.floor``, so the integer cast truncated
    toward zero (``-1.5 -> -1``) instead of flooring (``-1.5 -> -2``).

    Args:
        data: DataFrame containing ``column_name``.
        column_name: Numeric column to floor.

    Returns:
        The same ``data`` object with ``column_name`` floored and cast
        to ``int``.
    """
    data[column_name] = np.floor(data[column_name]).astype('int')
    return data

def one_hot_encoding(data, column_name):
    """One-hot encode ``column_name`` and return a new all-float DataFrame.

    The distinct values of the column are sorted; the indicator for the
    first (smallest) one is dropped, and the remaining indicators are named
    ``<column_name>_1`` ... ``<column_name>_{k-1}``. The indicator columns
    come first, followed by every other column of ``data`` in its original
    order. The result has a fresh default index.

    This replaces the earlier implementation, which used
    ``OneHotEncoder(categorical_features=...)``. That argument no longer
    exists in current scikit-learn releases. The earlier version also
    labelled the trailing columns with ``data.columns[1:]``, which is only
    right when the encoded column is the first one.

    Args:
        data: DataFrame whose columns are all numeric apart from, possibly,
            ``column_name``.
        column_name: Column to encode; must be a string and must not
            contain missing values.

    Returns:
        A new DataFrame of floats with the layout described above.

    Raises:
        ValueError: If ``column_name`` contains missing values.
    """
    column = data[column_name]
    if column.isnull().any():
        raise ValueError(
            "Column %r contains missing values and cannot be one-hot encoded"
            % (column_name,)
        )

    values = column.values
    categories = np.unique(values)  # sorted distinct values

    # Skip the first category: its indicator is implied by the others.
    kept_categories = categories[1:]
    indicators = (values[:, None] == kept_categories[None, :]).astype(float)
    indicator_names = [
        column_name + '_' + str(idx) for idx in range(1, len(categories))
    ]

    remaining = data.drop([column_name], axis=1)
    encoded = np.hstack([indicators, remaining.values.astype(float)])
    return pd.DataFrame(
        data=encoded, columns=indicator_names + list(remaining.columns)
    )

def plot_roc_2_class(y_test_array, rfc_y_pred):
    """Plot the ROC curve of a two-class prediction and show its AUC.

    The curve is computed with ``roc_curve`` and its area with ``auc``.
    It is drawn in blue with the AUC in the legend, alongside a red dashed
    diagonal from (0, 0) to (1, 1). The figure is displayed with
    ``plt.show()``.

    An earlier version computed a second "micro-average" curve from the
    ravelled inputs and never used the result; that dead computation has
    been removed, so the inputs no longer need a ``.ravel()`` method.

    NOTE(review): ``plt`` is not imported in the visible part of this file;
    presumably it is ``matplotlib.pyplot`` imported elsewhere -- confirm.

    Args:
        y_test_array: True binary labels, passed to ``roc_curve`` as
            ``y_true``.
        rfc_y_pred: Predicted labels or scores, passed to ``roc_curve`` as
            ``y_score``.
    """
    fpr, tpr, _ = roc_curve(y_test_array, rfc_y_pred)
    roc_auc = auc(fpr, tpr)

    # Plot ROC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()