# Importing Required Packages

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [43]:
import math
import statistics
# Show every column when a wide DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns',None)

In [44]:
# Plot defaults for the whole notebook: seaborn 'darkgrid' style and a 14 x 9 default figure size.
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (14,9)

In [45]:
# Prints the recommended order in which the helper functions of this notebook should be called.
# NOTE(review): the message mentions `normalize_df` and `split_dfs`; the functions defined below are
# named `normalize` and `split_df` - confirm which names are intended before changing the text.
print("Order of functions for Machine Learning:\n\n1. impute_cols\n\n2. Optional: remove_high_corr_cols and/or combine_dummy_variable_columns\n\n3. convert_cat\n\n4. find_cols_to_ignore (to be used only with normalize_df function)\n\n5. shuffled_split_dfs / split_dfs (returns a list of two dataframes, train_df and test_df)\n\n6. normalize (need to do for both train_df and test_df separately so there is no data leakage from test_df into train_df)\n\n7. stoch_grad_desc (input train_df as the dataset parameter)\n\n8. make_predictions (input test_df as the dataset parameter)\n\n9. check_accuracy")

Order of functions for Machine Learning:

1. impute_cols

2. Optional: remove_high_corr_cols and/or combine_dummy_variable_columns

3. convert_cat

4. find_cols_to_ignore (to be used only with normalize_df function)

5. shuffled_split_dfs / split_dfs (returns a list of two dataframes, train_df and test_df)

6. normalize (need to do for both train_df and test_df separately so there is no data leakage from test_df into train_df)

7. stoch_grad_desc (input train_df as the dataset parameter)

8. make_predictions (input test_df as the dataset parameter)

9. check_accuracy


# Common Functions Across Algorithms

In [46]:
def create_random_df(model,nrows,ncols,nclasses = None,target_multiplier = 1):
    """
    Builds a Pandas DataFrame filled with uniform random numbers for experimenting with a model.

    The frame has nrows rows and ncols columns in total: 'feat_1' ... 'feat_<ncols - 1>' followed by 'Target'.
    Parameters:
    1. model: name of the algorithm (case-insensitive).
       - 'lin', 'linear regression', 'reg', 'regression': Target is the random column multiplied by target_multiplier.
       - 'log', 'logistic regression', 'logreg': Target is a random class drawn from {0, 1} (nclasses is forced to 2).
       - anything else: Target is a random class drawn from range(nclasses).
    2. nrows: number of rows.
    3. ncols: total number of columns, including the Target column.
    4. nclasses: number of classes for non-regression, non-logistic models. Default is None.
    5. target_multiplier: scale applied to the Target column for regression models. Default is 1.
    """
    regression_aliases = ['lin','linear regression','reg','regression']
    logistic_aliases = ['log','logistic regression','logreg']

    column_names = ['feat_%s' % number for number in range(1,ncols)]
    column_names.append('Target')
    df = pd.DataFrame(np.random.rand(nrows,ncols),columns = column_names)

    if model.lower() in regression_aliases:
        df['Target'] *= target_multiplier
        return df

    if model.lower() in logistic_aliases:
        nclasses = 2

    def draw_class(_ignored_value):
        # One independent draw per row; the existing Target value is only a placeholder.
        return np.random.choice(range(nclasses))

    df['Target'] = df['Target'].apply(draw_class)
    return df

In [47]:
def find_cols_to_ignore(df,thresh_cat = 0.02,cols_to_ignore = None):
    """
    Finds columns to ignore for normalization
    ONLY USE FOR normalize FUNCTION, NOT FOR impute_cols FUNCTION

    A column is ignored when it is
      - already listed in cols_to_ignore, or
      - not of dtype int32 / int64 / float32 / float64, or
      - numeric but looks categorical: (unique non-null values / non-null values) < thresh_cat.

    Parameters:
    1. df: a Pandas DataFrame
    2. thresh_cat: ratio below which a numeric column is treated as categorical. Default is 0.02
    3. cols_to_ignore: column name (string) or list of column names that must always be ignored. Default is None.
       The object passed in is never modified.

    Returns a new list of column names without duplicates: the caller-supplied names first,
    followed by the detected columns in DataFrame order.
    """
    numeric_dtype_names = ('int32','int64','float32','float64')

    # Work on a private copy so the caller's list is not extended as a side effect.
    if cols_to_ignore is None:
        ignored = []
    elif isinstance(cols_to_ignore, str):
        ignored = [cols_to_ignore]
    else:
        ignored = list(cols_to_ignore)

    for col in df.columns:
        if col in ignored:
            continue
        if df[col].dtype.name not in numeric_dtype_names:
            ignored.append(col)
            continue
        num_total = df[col].count()
        if num_total == 0:
            # All-null numeric column: the ratio is undefined, so the column is left alone.
            continue
        if (df[col].nunique() / num_total) < thresh_cat:
            ignored.append(col)

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(ignored))
            

In [48]:
def impute_cols(df,k=None,null_cols = 'auto_select',thresh_null_cat = 0.02,cols_to_ignore = None):
    """
    This function takes in a dataframe, imputes all the columns which have nulls, and returns a fully cleansed dataframe.
    The input dataframe is not modified.

    Categorical columns are imputed with a K-nearest-neighbours vote over the numeric columns;
    the remaining ("discrete") numeric columns are filled with their rounded mean.

    Parameters:
    1. df: a Pandas DataFrame
    2. k: this parameter is the k number of neighbors used in the KNN algorithm to classify null values in a categorical column.
       If k is None, then k is calculated for each categorical column as the rounded square root of the number of
       labeled rows available for that particular column. Default is None.
    3. null_cols: default is 'auto_select' meaning the function automatically imputes all columns with null values
       (most nulls first). Can specify one column as a string or multiple columns as a list instead.
    4. thresh_null_cat: a numeric column with nulls is treated as categorical when
       (number of unique non-null values) / (number of non-null, non-zero values) <= thresh_null_cat;
       otherwise it is treated as discrete. Non-numeric columns are always categorical. Default is 0.02
    5. cols_to_ignore: column name or list of column names that must not be used as KNN features.
       Columns being imputed as categorical and non-numerical columns are always excluded. Default is None.

    Notes:
    - At least one usable numeric feature column is required to impute a categorical column; otherwise ValueError is raised.
    - Distances are computed on the raw (un-normalised) feature values, so features with a large scale dominate.
    - Points are imputed one after another and every imputed point joins the neighbour pool of the following points.
    - Feature values that are missing on the row being imputed are skipped in the distance (np.nansum).
    - Vote ties are resolved in favour of the label of the nearest neighbour among the tied labels.
    """
    from time import time

    func_start_time = time()
    # dtype names treated as numeric when deciding whether a column with nulls is categorical
    numeric_dtype_names = ('int32','int64','float32','float64')
    # dtypes that may be used as KNN features
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    def _split_runtime(run_time):
        # Breaks a duration in seconds into (hours, minutes, seconds) for the progress messages.
        mins = run_time // 60
        secs = run_time % 60
        hours = mins // 60
        mins = mins % 60
        return hours, mins, secs

    ### Columns to impute
    if isinstance(null_cols, str) and null_cols == 'auto_select':
        null_counts = df.isna().sum()
        null_cols = list(null_counts[null_counts != 0].sort_values(ascending = False).index)
    elif type(null_cols) != list:
        null_cols = [null_cols]

    ### Separate the columns into two lists: 1. Categorical column names 2. Discrete column names
    null_cat_cols = []
    null_discrete_cols = []
    for col in null_cols:
        if df[col].dtype.name not in numeric_dtype_names:
            null_cat_cols.append(col)
            continue
        num_unique = df[col].nunique()
        count_non_nulls_non_zeros = df[col].count() - (df[col] == 0).sum()
        if count_non_nulls_non_zeros > 0 and (num_unique / count_non_nulls_non_zeros) <= thresh_null_cat:
            null_cat_cols.append(col)
        else:
            null_discrete_cols.append(col)

    ### Columns that may never be used as KNN features
    if cols_to_ignore is None:
        ignored = set()
    elif isinstance(cols_to_ignore, str):
        ignored = {cols_to_ignore}
    else:
        ignored = set(cols_to_ignore)
    ignored.update(null_cat_cols)

    df_clean = df.copy()

    ### Impute the categorical columns first using the KNN algorithm
    for col in null_cat_cols:
        start_time = time()

        numeric_cols = list(df_clean.select_dtypes(include=numerics).columns)
        feature_cols = [c for c in numeric_cols if c != col and c not in ignored]
        if not feature_cols:
            raise ValueError("impute_cols needs at least one numeric feature column to impute '%s'" % col)

        features = df_clean[feature_cols].to_numpy(dtype=float)
        unlabeled_mask = df_clean[col].isna().to_numpy()
        # A row can vote only if its label and all of its features are known.
        labeled_mask = ~unlabeled_mask & ~np.isnan(features).any(axis=1)

        known_labels = list(df_clean.loc[labeled_mask, col])
        new_points = features[unlabeled_mask]
        n_labeled = len(known_labels)
        n_unlabeled = len(new_points)
        if n_unlabeled > 0 and n_labeled == 0:
            raise ValueError("impute_cols found no fully populated labeled rows to impute '%s'" % col)

        # k is resolved per column so that one column's size never leaks into the next one.
        col_k = k if k is not None else int(np.round(n_labeled ** .5))
        col_k = max(int(col_k), 1)

        # Labeled rows first, then the new points in dataframe order; point i may use rows [0, n_labeled + i).
        pool = np.vstack([features[labeled_mask], new_points])
        predictions = []
        for offset in range(n_unlabeled):
            candidates = pool[:n_labeled + offset]
            sq_dists = np.nansum(np.square(candidates - new_points[offset]), axis=1)
            # Stable sort keeps the earlier row first when distances tie.
            nearest = np.argsort(sq_dists, kind='stable')[:col_k]
            neighbour_labels = [known_labels[ind] for ind in nearest]
            # max() returns the first label reaching the highest count, i.e. the nearest one among ties.
            prediction = max(neighbour_labels, key=neighbour_labels.count)
            known_labels.append(prediction)
            predictions.append(prediction)

        if predictions:
            df_clean.loc[unlabeled_mask, col] = predictions

        hours, mins, secs = _split_runtime(time() - start_time)
        print("%s column took %s hr(s), %s min(s), %s sec(s) to be imputed" % (col,hours,mins,secs))

    ### Now that the categorical columns are cleaned, impute the discrete columns with their rounded mean
    for col in null_discrete_cols:
        col_mean = np.round(df_clean[col].mean())
        df_clean[col] = df_clean[col].fillna(value=col_mean)

    hours, mins, secs = _split_runtime(time() - func_start_time)
    print("Imputing took %s hr(s), %s min(s), %s sec(s) long." % (hours,mins,secs))

    return df_clean
            


In [49]:
def num_unique_values_in_cols(df,cols = None):
    """
    Counts the distinct non-null values of the requested columns.

    df is a Pandas DataFrame. cols is a list of column names, a single column name given as a string,
    or None (default) to use every column of df.
    Returns a dictionary mapping each column name to its number of unique non-null values.
    """
    if type(cols) == str:
        selected = [cols]
    elif cols == None:
        selected = list(df.columns)
    else:
        selected = cols

    # value_counts() leaves nulls out, so counting its entries gives the number of distinct non-null values.
    return {name: df[name].value_counts().count() for name in selected}
            

In [50]:
def remove_outliers(df_in,outlier_cols = None,thresh_cat = 0.02,remove = True):
    """
    Treats outliers of df_in with the 1.5 * IQR rule and returns a new dataframe (df_in is not modified).

    Parameters:
    1. df_in: a Pandas DataFrame
    2. outlier_cols: column name or list of column names to treat. Default is None, meaning all columns.
    3. thresh_cat: a numeric column is treated only when (unique values / non-null values) > thresh_cat;
       other numeric columns are considered categorical and skipped, as are non-numeric columns. Default is 0.02
    4. remove: if True, rows holding a value outside [q1 - 1.5 * IQR, q3 + 1.5 * IQR] are dropped.
       Otherwise such values are replaced by the nearest bound. Default is True.

    The quartiles of every column are taken from the dataframe as it was before any row was dropped.
    """
    numeric_dtype_names = ['int32','int64','float32','float64']

    def looks_continuous(col):
        # Numeric dtype with enough distinct values not to be considered categorical.
        if df[col].dtype.name not in numeric_dtype_names:
            return False
        return (df[col].value_counts().count() / df[col].count()) > thresh_cat

    def clamp(val,low,high):
        # Pulls a value back onto the closest bound; values inside the range (and NaN) pass through.
        if val < low:
            return low
        if val > high:
            return high
        return val

    df = df_in.copy()
    if type(outlier_cols) == str:
        outlier_cols = [outlier_cols]
    elif outlier_cols == None:
        outlier_cols = list(df.columns)

    desc = df.describe()
    outlier_cols_iter = [col for col in outlier_cols if looks_continuous(col)]

    for col in outlier_cols_iter:
        q1, q3 = desc.loc['25%',col], desc.loc['75%',col]
        iqr = q3 - q1
        low = q1 - (1.5 * iqr)
        high = q3 + (1.5 * iqr)
        if remove == True:
            in_range = (df[col] >= low) & (df[col] <= high)
            df = df[in_range]
        else:
            df[col] = df[col].apply(clamp,args = (low,high))

    return df

In [51]:
def remove_high_corr_cols(dataframe,cols_to_use = None,cols_to_avoid = None,num_corr_cols_tolerated = None,max_corr_tolerated = 0.8,num_iterations = 5,type_corr = 'r_squared'):
    """remove_high_corr_cols removes columns in dataframe that are highly correlated with other columns.
    This is to reduce multicollinearity between features which can cause overfitting in a regression model.
    The input dataframe is not modified.

    Parameters:
    1. dataframe: Pandas DataFrame
    2. cols_to_use: a list of columns (or a single column name) considered when looking for multicollinearity.
    IMPORTANT: Should be a list of all (numeric) features and should exclude the dependent variable.
    Default = None, meaning every column that is not in cols_to_avoid.
    3. cols_to_avoid: a list of columns (or a single column name) that are excluded from the correlation check
    and always kept. IMPORTANT: Should include the dependent variable. Default = None
    4. num_corr_cols_tolerated: Maximum number of highly correlated partners a feature may have to survive the first pass.
    Default = None, meaning ceil(number of non-avoided columns / 10). Values <= num_iterations are raised to num_iterations + 1.
    5. max_corr_tolerated: Maximum R squared value tolerated between features. Default = 0.8.
    Set to 5 if type_corr is 'vif' and a value <= 1 was given. vif = (1 / (1-R^2)). If R^2 = 0.8, vif = 5.
    6. num_iterations: Controls the number of passes. Default = 5
    7. type_corr: 'r_squared' (Pearson coefficient squared) or 'vif' / 'variance inflation factor' /
    'variance_inflation_factor' (case-insensitive). Any other value falls back to 'r_squared'. Default = 'r_squared'

    Pass schedule: one pass with num_corr_cols_tolerated; if num_iterations > 1 the tolerance is lowered once by
    ceil(num_corr_cols_tolerated / num_iterations) and (num_iterations - 2) passes are run with it; one final pass with
    a tolerance of 0. Every pass works on the features kept by the previous pass.
    In every pass, a kept feature with exactly one highly correlated partner causes that partner to be dropped, so one
    member of a correlated pair survives.
    NOTE(review): features that still have a highly correlated partner in the final pass are all dropped, so a group
    of three or more mutually correlated features can disappear completely - confirm that this is intended.

    Returns a dataframe holding the kept features followed by the cols_to_avoid columns.
    """
    use_vif = type_corr.lower() in ['vif','variance inflation factor','variance_inflation_factor']
    if use_vif:
        if max_corr_tolerated <= 1:
            max_corr_tolerated = 5
    elif max_corr_tolerated > 1:
        max_corr_tolerated = 0.8

    ### Normalise the column arguments into private lists (a single name may be given as a string)
    if cols_to_avoid is None:
        avoided = []
    elif isinstance(cols_to_avoid, str):
        avoided = [cols_to_avoid]
    else:
        avoided = list(cols_to_avoid)

    non_avoided = [col for col in dataframe.columns if col not in avoided]
    if cols_to_use is None:
        features = list(non_avoided)
    elif isinstance(cols_to_use, str):
        features = [] if cols_to_use in avoided else [cols_to_use]
    else:
        features = [col for col in cols_to_use if col not in avoided]

    ### Resolve the tolerance before it is compared (None cannot be compared with an integer)
    if num_corr_cols_tolerated is None:
        num_corr_cols_tolerated = int(np.ceil(len(non_avoided)/10))
    if num_corr_cols_tolerated <= num_iterations:
        num_corr_cols_tolerated = num_iterations + 1

    def _one_pass(current,tolerated):
        ### Returns the features of `current` that survive one pass with the given tolerance
        if len(current) < 2:
            return list(current)
        # Clipping guards against r^2 marginally above 1 from rounding, which would make the VIF negative.
        r_squared = (dataframe[current].corr()**2).clip(upper = 1.0)
        scores = (1 / (1 - r_squared)) if use_vif else r_squared

        ### partners maps each feature to the list of features it is highly correlated with
        partners = {}
        for col in current:
            others = scores[col].drop(col)
            partners[col] = list(others[others > max_corr_tolerated].index)

        kept = [col for col in current if len(partners[col]) <= tolerated]

        # IMPORTANT: Eliminating only one of the features if a feature has only one other highly correlated feature
        for col in current:
            if len(partners[col]) == 1 and col in kept:
                partner = partners[col][0]
                if partner in kept:
                    kept.remove(partner)
        return kept

    ### First pass, intermediate passes with the lowered tolerance, and a final pass with a tolerance of 0
    features = _one_pass(features,num_corr_cols_tolerated)
    if num_iterations > 1 and num_corr_cols_tolerated > num_iterations:
        num_corr_cols_tolerated -= np.ceil(num_corr_cols_tolerated/num_iterations)
        for _ in range(num_iterations-2):
            features = _one_pass(features,num_corr_cols_tolerated)
    features = _one_pass(features,0)

    # Plain column selection keeps the rows exactly as they are, even when the index has duplicates.
    return dataframe[features + avoided].copy()

In [52]:
def convert_cat(df, cat_cols=None,output_col=None,reset_index = False):
    """
    This function converts categorical columns into numerical dummy (indicator) columns and returns a new dataframe.
    Neither df nor the cat_cols list passed in is modified.

    There are 4 parameters: df, cat_cols, output_col, and reset_index.

    1. df is the dataframe which needs to have categorical variables converted to numerical variables
    2. cat_cols is a list (or a single column name) of the categorical columns that need to be converted.
       Default is None, meaning every column of object dtype.
    3. output_col is the name of the output or response variable. It is never converted, and it is moved to the
       last position of the returned dataframe. Default is None.
    4. reset_index: if True, the index is reset (and dropped) before the dummy columns are joined on it. Default is False.

    For each converted column the first category is dropped (drop_first = True) and the dummy columns are named
    '<column>__<category>'. Columns are converted in the order given, so the column order of the result is deterministic.
    """
    if cat_cols is None:
        candidates = [col for col in df.columns if df[col].dtype in ['O']]
    elif isinstance(cat_cols, str):
        candidates = [cat_cols]
    else:
        candidates = list(cat_cols)

    # Ordered de-duplication on a private list: no mutation of the caller's list, no set-ordering randomness.
    columns_to_convert = [col for col in dict.fromkeys(candidates) if output_col is None or col != output_col]

    df_out = df.copy()
    if reset_index:
        df_out = df_out.reset_index(drop = True)

    for col in columns_to_convert:
        dummy_col = pd.get_dummies(df_out[col],prefix = col+'_',drop_first = True)
        df_out = df_out.drop(col,axis=1).join(dummy_col)

    if output_col is not None:
        if output_col not in df_out.columns:
            raise ValueError("output_col %r is not a column of the dataframe" % (output_col,))
        loc_df_out = list(df_out.columns).index(output_col)
        ordered = list(df_out.columns[:loc_df_out]) + list(df_out.columns[loc_df_out + 1:]) + [output_col]
        df_out = df_out[ordered]
    return df_out

In [53]:
def combine_dummy_variable_columns(data,dummy_var_cols,name_agg_col,remove_common_word = False,common_word = None):
    """
    This function combines the information stored in dummy variables into one column. It does the opposite function of the
    convert_cat function and returns a copy of the dataframe with the newly added aggregate categorical variable column.
    Parameters:
    1. data: Pandas DataFrame

    2. dummy_var_cols: a list of dummy variables column names that are related to a common category.
       For every row the label of the first column (in list order) whose value equals 1 is used.
       Rows in which none of the columns equals 1 (e.g. the category dropped by drop_first) get None.

    3. name_agg_col: This is the name you want to give for the aggregate column.

    4. remove_common_word: This is a boolean variable which if set to True, means that a common word will be eliminated from the
       final values in the name_agg_col column of resultant dataframe. Default: False

    5. common_word: this variable is the common_word that occurs in each of the dummy variable column names,
       either at the beginning or at the end of the name.
       ex: common_word = education_ if dummy_var_cols = ['education_None','education_Bachelors','education_PhD'].
       When it is None or empty, or does not occur in a column name, the full column name is used. Default: None"""

    def _label_for(col_name):
        # Value written to the aggregate column for rows whose active dummy is col_name.
        if not remove_common_word or not common_word or not isinstance(col_name, str):
            return col_name
        pieces = col_name.split(common_word)
        # Name starts with the common word -> keep what follows it; otherwise keep what precedes it.
        return pieces[-1] if pieces[0] == '' else pieces[0]

    dummy_var_cols = list(dummy_var_cols)
    labels = [_label_for(col_name) for col_name in dummy_var_cols]

    def _active_label(row_values):
        for label, value in zip(labels, row_values):
            if value == 1:
                return label
        return None

    df = data.copy()
    df[name_agg_col] = [_active_label(row_values) for row_values in df[dummy_var_cols].to_numpy()]
    return df

In [54]:
def normalize(df,cols_to_ignore = None,target_col = None):
    """
    This function takes in a dataframe as a parameter and returns a copy of it with the features
    normalized between 0 and 1 using rescaling (min-max normalization): (x - min) / (max - min).

    Parameters:
    1. df: a Pandas DataFrame
    2. cols_to_ignore: column name or list of column names that must be left untouched. Default is None.
       The object passed in is never modified.
    3. target_col: name of the target column; it is left untouched as well. Default is None.

    Columns whose dtype is not int32 / int64 / float32 / float64 are always left untouched.
    A column holding a single distinct value (max == min) is mapped to 0.0 instead of dividing by zero.
    Null values stay null.
    """
    numeric_dtype_names = ('int32','int64','float32','float64')

    # Private copy: the caller's list must not grow as a side effect of normalizing.
    if cols_to_ignore is None:
        ignored = []
    elif isinstance(cols_to_ignore, str):
        ignored = [cols_to_ignore]
    else:
        ignored = list(cols_to_ignore)
    if target_col is not None and target_col not in ignored:
        ignored.append(target_col)

    df_copy = df.copy()
    for col in df.columns:
        if col in ignored or df[col].dtype.name not in numeric_dtype_names:
            continue
        col_min = df[col].min()
        value_range = df[col].max() - col_min
        shifted = df[col].astype(float) - col_min
        # For a constant column `shifted` is already all zeros, which is the value such a column is given.
        df_copy[col] = shifted if value_range == 0 else shifted / value_range

    return df_copy

In [55]:
def split_df(df,test_size = 0.3,df_to_return = 'df_train'):
    """
    This function takes in a Pandas DataFrame and returns a
    dataframe that is a subset of that Pandas DataFrame (rows are NOT shuffled).

    There are 3 parameters: df, test_size, and df_to_return

    1. df needs to be a Pandas DataFrame and is the superset dataframe to be divided.
    2. test_size is the proportion of the dataframe you want to be the testing dataset.
       test_size is set to 0.3 by default. The training part holds the first floor(rows * (1 - test_size)) rows.
    3. df_to_return needs to be specified as either 'df_train' / 'train' or 'df_test' / 'test'
       to return the correct subset dataframe. df_to_return is set to 'df_train' by default.
       Any other value raises ValueError.
    """
    # len(df) is the true number of rows; df.count() would leave out nulls of the first column.
    split_num = int(len(df) * (1-test_size) // 1)
    if df_to_return in ['df_train','train']:
        return df.iloc[:split_num,:]
    if df_to_return in ['df_test','test']:
        return df.iloc[split_num:,:]
    raise ValueError("df_to_return must be 'df_train'/'train' or 'df_test'/'test', got %r" % (df_to_return,))

In [56]:
def shuffled_split_dfs(df,test_size = 0.3,reset_index = True,random_state = None):
    """
    This function takes in a Pandas DataFrame, shuffles its rows and returns a list of 2
    dataframes.  The first dataframe is the train and the second is the test df.
    The input dataframe is not modified.

    There are 4 parameters: df, test_size, reset_index, and random_state

    1. df needs to be a Pandas DataFrame and is the superset dataframe to be divided.
    2. test_size is the proportion of the dataframe you want to be the testing dataset.
    test_size is set to 0.3 by default. The train df holds the first floor(rows * (1 - test_size)) shuffled rows.
    3. reset_index determines whether the index of the shuffled dataframe should be reset before splitting. Default is True
    4. random_state is passed to DataFrame.sample to make the shuffle reproducible. Default is None (not reproducible).
    """
    shuffled = df.sample(frac = 1,random_state = random_state)
    if reset_index == True:
        shuffled = shuffled.reset_index(drop = True)
    # len() is the true number of rows; DataFrame.count() would leave out nulls of the first column.
    split_num = int(len(shuffled) * (1-test_size) //1)
    df_train = shuffled.iloc[:split_num,:]
    df_test = shuffled.iloc[split_num:,:]
    return ([df_train,df_test])

In [57]:
def check_accuracy(df = None,pred_df = None, test_df = None, algo = 'lin',target_class = None):
    """
    This function reports the accuracy of a model's predictions.

    There are 5 parameters: df, pred_df, test_df, algo, and target_class

    1. df needs to be a Pandas DataFrame holding the predictions (used for algo 'lin' and 'log').
       - 'lin': the error is computed as (second-to-last column) - (last column).
         presumably these are the observed target and the prediction - verify against the caller.
       - 'log': df must contain the columns 'Crisp' and 'Correct?' next to the target column.
    2. pred_df is the prediction dataframe used for the knn algorithm
    3. test_df is the test dataframe used for the knn algorithm
    4. algo is set to 'lin' by default but can also be specified as 'log' or 'knn'.
       Any other value raises ValueError.
    5. target_class is the output variable (column name). For 'log' it defaults to the fourth column from the end of df.

    Returns:
    - 'lin': a dictionary with ME, MSE, MAE, RMSE (square root of MSE) and r_sqd.
    - 'log': the mean of the 'Correct?' column; a confusion matrix and a classification report are printed.
    - 'knn': None; the accuracy is printed (plus a confusion matrix and a classification report when the
      predictions hold at most two distinct values). Values are rounded to integers before comparison.
    """
    if algo == 'lin':
        observed = df.iloc[:,-2]
        error = observed - df.iloc[:,-1]

        ME = error.mean()
        MAE = error.abs().mean()
        MSE = (error ** 2).mean()
        RMSE = MSE ** 0.5
        SSres = (error ** 2).sum()
        SStot = ((observed - observed.mean())**2).sum()
        r_sqd = 1 - (SSres / SStot)

        return {'ME':ME,'MSE':MSE,'MAE':MAE,'RMSE':RMSE,'r_sqd' : r_sqd}

    elif algo == 'log':
        from sklearn.metrics import confusion_matrix,classification_report
        if target_class is None:
            target_class = df.columns[-4]
        print(confusion_matrix(df[target_class].values,df['Crisp'].values))
        print(classification_report(df[target_class].values,df['Crisp'].values))

        return df['Correct?'].sum() / len(df)

    elif algo == 'knn':
        # Copies keep the rounding below from writing into (a view of) the caller's dataframes.
        pred = pred_df[[target_class]].copy()
        test = test_df[[target_class]].copy()
        pred[target_class] = pred[target_class].apply(lambda val: int(round(val)))
        test[target_class] = test[target_class].apply(lambda val: int(round(val)))

        if pred_df[target_class].nunique() in [1,2]:
            from sklearn.metrics import confusion_matrix,classification_report
            print(confusion_matrix(test,pred))
            print(classification_report(test,pred))
        print("Accuracy = %s percent" %  (100 * sum(pred[target_class] == test[target_class]) / len(pred)))

    else:
        raise ValueError("algo must be 'lin', 'log' or 'knn', got %r" % (algo,))

### Visualization Functions

In [74]:
def barplot_absolute(dataset,dep_col,ind_col,hue_col = None, is_aggregated_bool = False, aggregate_func = None, magnitude = None,ind_cols_order = None, plot_palette = 'mako', plt_figsize_x = 22, plt_figsize_y = 14, ymin = 0,y_max_multiplier = 1.3,num_decimals = 1, annotate_percentages = False, perc_annot_height_mult = 0.05, perc_annot_num_decimals = 1, plot_title_text = None, plot_title_fsize = 30, xlabel_text = None, xlabel_fsize = 22, ylabel_text = None, ylabel_fsize = 22, x_tick_fsize = 15, y_tick_fsize = 18, xtick_rotation = 65, ytick_rotation = 0, annot_fsize = 12, legend_fsize = 15, hlinewidth = 1, hlinecolor1 = 'black', hlinecolor2 = 'black'):
    """
    Draws an annotated seaborn bar plot of dep_col (y-axis) against ind_col (x-axis) and shows it.

    1. dataset is a Pandas DataFrame; it is copied, so the caller's frame is not modified.
    2. dep_col / ind_col are the column names for the bar heights and the bar categories.
       hue_col is an optional column name passed to seaborn as hue (bars are not dodged).
    3. is_aggregated_bool: when False (default) the data is grouped by ind_col first, using
       aggregate_func: 'sum'/'add'/'addition'/'summation', 'count', 'std'/'standard deviation'/'stdev';
       anything else, including the default None, means the mean.
    4. magnitude rescales the y values: 'single'/'ones'/'single digit' (or 1), 'thousand(s)' (or 1000),
       'million(s)' (or 1000000).  When None it is chosen from the largest plotted value.
    5. ind_cols_order is an optional explicit order for the x categories.
    6. annotate_percentages adds each bar's share of the summed per-category means above the bar.
    7. The remaining parameters control the figure size, palette, y-limits, number formatting,
       label/title text, font sizes, tick rotation and the two horizontal average lines.

    Two horizontal lines are drawn: the mean of dep_col over all input rows (before any
    aggregation) and the mean of the plotted per-category values.  Returns None.
    """
    plt_df = dataset.copy()
    # Mean over the raw rows, taken before aggregation so every data point has equal weight
    overall_average = plt_df[dep_col].mean()

    ### Need to aggregate data if it is not already so that there are not vertical lines in the barplot columns
    if is_aggregated_bool == False:
        # aggregate_func defaults to None; treat that (and any non-string) as "mean"
        agg_name = aggregate_func.lower() if isinstance(aggregate_func, str) else ''
        grouped = plt_df.groupby(ind_col)
        if agg_name in ['sum','add','addition','summation']:
            plt_df = grouped.sum().reset_index()
        elif agg_name in ['count']:
            plt_df = grouped.count().reset_index()
        elif agg_name in ['std','standard deviation','stdev']:
            plt_df = grouped.std().reset_index()
        else:
            plt_df = grouped.mean().reset_index()

    mag_dict = {1:'', 1000:' in Thousands', 1000000:' in Millions'}
    if magnitude is not None:
        if isinstance(magnitude, str):
            mag_name = magnitude.lower()
            if mag_name in ['single','ones','single digit']:
                magnitude = 1
            elif mag_name in ['thousand','thousands']:
                magnitude = 1000
            elif mag_name in ['million','millions']:
                magnitude = 1000000
        if magnitude not in mag_dict:
            raise ValueError("magnitude must be None, 1, 1000, 1000000 or one of 'single', 'thousands', 'millions'; got %r" % (magnitude,))
    else:
        # Values of a billion or more fall back to millions instead of leaving magnitude unset
        dep_max = plt_df[dep_col].max()
        if dep_max >= 1000000:
            magnitude = 1000000
        elif dep_max >= 1000:
            magnitude = 1000
        else:
            magnitude = 1

    # Defining the plot size
    plt.figure(figsize=(plt_figsize_x, plt_figsize_y))

    ### Changing the values of the dependent column based on magnitude variable so large values can be more easily readable.
    plt_df[dep_col] = (plt_df[dep_col]/magnitude).round(decimals = 6)
    # The overall-average line shares the rescaled y-axis, so it has to be rescaled as well
    overall_average = overall_average / magnitude

    barplot_kwargs = dict(palette = plot_palette, x = ind_col, y = dep_col, data = plt_df)
    if hue_col is not None:
        barplot_kwargs.update(hue = hue_col, dodge = False)
    if ind_cols_order is not None:
        barplot_kwargs['order'] = ind_cols_order
    plots = sns.barplot(**barplot_kwargs)

    if annotate_percentages == True:
        # Loop-invariant denominator and vertical offset for the percentage annotations
        perc_total = plt_df.groupby(ind_col)[dep_col].mean().sum()
        perc_offset = perc_annot_height_mult*plt_df[dep_col].max()

    # Iterating over the bars one-by-one
    for bar in plots.patches:
        bar_center_x = bar.get_x() + bar.get_width() / 2
        bar_height = bar.get_height()

        # Value label centred just above the bar; xytext leaves 8 points of free space
        plots.annotate(format(bar_height, '.%sf' % num_decimals),
                       (bar_center_x, bar_height), ha='center', va='center',
                       size= annot_fsize, xytext=(0, 8),
                       textcoords='offset points')

        if annotate_percentages == True:
            plots.annotate("(%s%%)" % (format(100*bar_height/perc_total, '.%sf' % perc_annot_num_decimals)),
                           (bar_center_x, bar_height + perc_offset), ha='center', va='center',
                           size= annot_fsize, xytext=(0, 8),
                           textcoords='offset points')

    # Setting the label for x-axis
    if xlabel_text is None:
        plt.xlabel("%s" % (ind_col[0].upper()+ind_col[1:]), size= xlabel_fsize)
    else:
        plt.xlabel(xlabel_text, size = xlabel_fsize)
    plt.xticks(rotation = xtick_rotation,size = x_tick_fsize)

    # Setting the label for y-axis
    dep_col_title = ' '.join([word[0].upper() + word[1:] for word in dep_col.split('_')])
    if ylabel_text is None:
        plt.ylabel("%s%s" % (dep_col_title,mag_dict[magnitude]), size= ylabel_fsize)
    else:
        plt.ylabel("%s%s" % (ylabel_text, mag_dict[magnitude]), size = ylabel_fsize)
    plt.yticks(rotation = ytick_rotation, size = y_tick_fsize)
    plt.ylim(ymin,plt_df[dep_col].max()*y_max_multiplier)

    ### Creating horizontal lines, one with the overall average of all data points, and one with the average giving each category's average equal weightage
    ind_col_text_clean = ' '.join(ind_col.split('_'))
    category_average = plt_df[dep_col].mean()

    plt.axhline(y = overall_average, linewidth = hlinewidth, color = hlinecolor1,label = "Overall Average (%s)" % \
                (format((overall_average),'.%sf' % num_decimals)))

    plt.axhline(y = category_average, linewidth = hlinewidth, color = hlinecolor2, linestyle = '--',label = "Average of %s Categories (%s)" % \
                (ind_col_text_clean[0].upper()+ind_col_text_clean[1:], format((category_average),'.%sf' % num_decimals)))

    plt.legend(loc = 'upper left',fontsize = legend_fsize)

    # Setting the title for the graph (font size must be passed by keyword;
    # the second positional argument of plt.title is fontdict)
    if plot_title_text is None:
        plt.title("%s by %s" % (dep_col_title,(ind_col[0].upper() + ind_col[1:])), size = plot_title_fsize)
    else:
        plt.title(plot_title_text, size = plot_title_fsize)
    plt.tight_layout()
    # Finally showing the plot
    plt.show()

In [59]:
def barplot_percent(dataset,dep_col,ind_col,total_col,hue_col = None):
    """
    Draws an annotated seaborn bar plot of a percentage column and shows it.

    1. dataset is a Pandas DataFrame; it is copied and its index is reset into columns.
    2. dep_col is the column plotted on the y-axis (labelled as a percentage in the legend).
    3. ind_col is the column plotted on the x-axis.
    4. total_col is the column used as the weight of each row when computing the overall average.
    5. hue_col is an optional column passed to seaborn as hue (bars are not dodged).

    Two horizontal lines are drawn: the plain mean of dep_col (black) and the
    total_col-weighted mean of dep_col (blue).  Returns None.
    """
    plt_df = dataset.copy()
    # Defining the plot size
    plt.figure(figsize=(26, 14))
    plt_df = plt_df.reset_index()

    # Defining the values for x-axis, y-axis
    # and from which dataframe the values are to be picked
    if hue_col is None:
        plots = sns.barplot(data = plt_df,x=ind_col, y=dep_col,palette = 'mako')
    else:
        plots = sns.barplot(data = plt_df,x=ind_col, y=dep_col,hue = hue_col,palette = 'mako',dodge = False)

    # Iterating over the bars one-by-one, writing each bar's height just above it
    for bar in plots.patches:
        plots.annotate(format(bar.get_height(), '.2f'),
                       (bar.get_x() + bar.get_width() / 2,
                        bar.get_height()), ha='center', va='center',
                       size=12, xytext=(0, 8),
                       textcoords='offset points')

    ind_col_title = ind_col[0].upper() + ind_col[1:]
    dep_col_title = ' '.join([word[0].upper() + word[1:] for word in dep_col.split('_')])

    # Setting the label for x-axis
    plt.xlabel("%s" % ind_col_title, size=24)
    plt.xticks(rotation = 65,size = 12)

    # Setting the label for y-axis
    plt.ylabel(dep_col_title, size=24)
    plt.yticks(size = 18)
    plt.ylim(0,plt_df[dep_col].max()*1.3)

    # Plain average: every row counts the same
    category_average = plt_df[dep_col].mean()
    # Weighted average: every row counts in proportion to total_col.  Computed once, column-wise,
    # instead of a row-wise apply that relied on positional Series access (cols[0], cols[1])
    overall_average = (plt_df[total_col] * plt_df[dep_col]).sum() / plt_df[total_col].sum()

    plt.axhline(y = category_average,color = 'black',label = "%s Average\n(%s percent)" % (ind_col_title,format(category_average, '.2f')))
    plt.axhline(y = overall_average,color = 'blue',label = "Overall Average\n(%s percent)" % format(overall_average, '.2f'))
    plt.legend(loc = 'upper left',fontsize = 15)

    # Setting the title for the graph
    plt.title("%s by %s" % (dep_col_title,ind_col_title),size = 30)
    plt.tight_layout()
    # Finally showing the plot
    plt.show()

In [None]:
def create_pie_plot(dataset, ind_col, plot_title, legend_title, dep_col = None, agg_func_name = 'count', annotate_percentages = False, is_dollar_value = False, plt_figsize_x = 8, plt_figsize_y = 8, plot_title_fsize = 18, legend_text_fsize = 15, legend_title_fsize = 15, annot_fsize = 20, annot_text_color = 'w',plot_colors_list = ['xkcd:aquamarine','darkblue','xkcd:navy blue','xkcd:blue','teal','xkcd:lightblue','lightblue','xkcd:turquoise','xkcd:azure','xkcd:teal']):
    """
    Draws a pie chart with one wedge per distinct value of ind_col and shows it.

    1. dataset is a Pandas DataFrame; it is copied before grouping.
    2. ind_col is the column whose values define the wedges.
    3. plot_title / legend_title are the texts for the title and the legend heading.
    4. dep_col is the column that is aggregated; required for the mean and sum aggregations.
    5. agg_func_name: 'mean'/'avg'/'average', 'sum'/'summation'/'addition'; anything else counts
       the non-null values of the first column other than ind_col (rows per group if there is none).
    6. annotate_percentages: when True each wedge shows "pct%" above the absolute value,
       otherwise only the absolute value (rounded to an integer) is shown.
    7. is_dollar_value formats the absolute value as "$1,234".
    8. plt_figsize_x / plt_figsize_y set the figure size (they were previously ignored in favour
       of a fixed 10x10 figure).
    9. The remaining parameters set font sizes, annotation colour and wedge colours.

    Raises ValueError when a mean or sum is requested without dep_col.  Returns None.
    """
    agg_name = agg_func_name.lower()
    needs_dep_col = agg_name in ['mean','avg','average','sum','summation','addition']
    if needs_dep_col and dep_col is None:
        raise ValueError("dep_col must be given when agg_func_name is %r" % (agg_func_name,))

    df = dataset.copy()
    if agg_name in ['mean','avg','average']:
        # Select dep_col before aggregating so unrelated non-numeric columns are never averaged
        wedge_values = df.groupby(ind_col)[dep_col].mean()
    elif agg_name in ['sum','summation','addition']:
        wedge_values = df.groupby(ind_col)[dep_col].sum()
    else:
        # ind_col becomes the group index, so it cannot be the column that is counted
        countable_cols = [col for col in df.columns if col != ind_col]
        if countable_cols:
            wedge_values = df.groupby(ind_col)[countable_cols[0]].count()
        else:
            wedge_values = df.groupby(ind_col).size()

    fig, ax = plt.subplots(figsize=(plt_figsize_x, plt_figsize_y), subplot_kw=dict(aspect="equal"))

    def wedge_label(pct):
        # matplotlib hands autopct the wedge's percentage; recover the absolute value from it
        absolute = int(np.round(pct/100.*np.sum(wedge_values)))
        if is_dollar_value == False:
            absolute_text = "({:d})".format(absolute)
        else:
            absolute_text = "({:s})".format("${:,}".format(absolute))
        if annotate_percentages == True:
            return "{:.1f}%\n".format(pct) + absolute_text
        return absolute_text

    wedges, texts, autotexts = ax.pie(wedge_values, autopct=wedge_label, textprops=dict(color=annot_text_color),
                                      colors = plot_colors_list)

    pie_legend = ax.legend(wedges, wedge_values.index,
              title=legend_title,
              loc="best",
              bbox_to_anchor=(1, 0, 0.5, 1),fontsize = legend_text_fsize)

    plt.setp(pie_legend.get_title(),fontsize = legend_title_fsize)
    plt.setp(autotexts, size=annot_fsize, weight="bold")

    ax.set_title(plot_title,fontsize = plot_title_fsize,loc = 'center')

    plt.show()

# Linear Regression and Logistic Regression

In [60]:
def stoch_grad_desc(dataset,output_col,cols_to_ignore = None,alpha = 0.1,epoch = 10,algo = 'lin',initial_coeffs = 1, bias_coeff = None):
    """
    This function returns a list of the coefficients for the specified algorithm.
    Currently, this function only performs Linear and Logistic Regression.

    The 8 parameters are: dataset, output_col, cols_to_ignore, alpha, epoch, algo, initial_coeffs, and bias_coeff

    IMPORTANT: DATAFRAME CANNOT HAVE NULL VALUES AND ALL USED COLUMNS MUST BE NUMERIC.  FEATURES SHOULD BE NORMALIZED.
    Rows are processed by position, so the index does not need to be 0, 1, 2, ...

    1. dataset needs to be a pandas DataFrame. No Default.  It is not modified.
    2. output_col is the target column name as a string.  No Default
    3. cols_to_ignore is a list of columns that the model will ignore including unwanted features and categorical features.
       Default is None object
    4. alpha is the alpha value (learning rate) used in stochastic gradient descent.  Default is 0.1.
    5. epoch is the number of iterations through each row in the dataset algorithm will perform.  Default is 10.
    6. algo is the specific algorithm to be used.  algo is 'lin' by default for Linear Regression but can also be specified
       as 'log' for Logistic Regression
    7. initial_coeffs is the value user wants all the coeffs to be initialized to.  Default is 1.
    8. bias_coeff is the y_intercept or the bias coefficient.  It is the value of y when all features have a value of 0.  It is set at None
       meaning there is no set bias coefficient but it can be set to any floating point or integer number, in which case
       the first coefficient is held at that value for the whole run (for both algorithms).

    After every row the coefficients are updated as:
        Linear:    coeffs = coeffs + alpha * (y - pred) * x
        Logistic:  coeffs = coeffs + alpha * (y - pred) * pred * (1 - pred) * x
    where x is the row's feature vector with a leading 1 for the bias term.

    Returns a list of coefficients: the bias coefficient first, then one per feature in column order
    (output_col and cols_to_ignore excluded).  Raises ValueError for an unknown algo or a missing output_col.
    The run time is printed before returning.
    """
    # Importing time module to time program
    from math import exp
    from time import time
    start_time = time()

    if algo not in ('lin','log'):
        raise ValueError("algo must be 'lin' or 'log'; got %r" % (algo,))
    if output_col not in list(dataset.columns):
        raise ValueError("output_col %r is not a column of dataset" % (output_col,))

    if cols_to_ignore is not None:
        model_df = dataset.drop(cols_to_ignore,axis = 1)
    else:
        model_df = dataset

    # Plain arrays avoid one pandas label lookup per cell on every row of every epoch
    feature_values = model_df.drop(output_col,axis = 1).to_numpy(dtype = float)
    targets = model_df[output_col].to_numpy(dtype = float)
    count_rows = feature_values.shape[0]

    # X0 column of ones so that coeffs[0] acts as the bias coefficient
    design_rows = np.column_stack([np.ones(count_rows),feature_values])
    coeffs = (np.ones(design_rows.shape[1]) * initial_coeffs).astype(float)

    # Same acceptance rule as before (numbers only, bool excluded), extended to numpy scalars
    has_fixed_bias = isinstance(bias_coeff,(int,float,np.integer,np.floating)) and not isinstance(bias_coeff,bool)
    if has_fixed_bias:
        # Pin the bias before the first prediction so that it is used from the very first row
        coeffs[0] = bias_coeff

    for ep in range(epoch):
        # Since this is stochastic gradient descent, the coefficients are updated after each row iteration
        for x_row, y in zip(design_rows,targets):
            output = float(np.dot(coeffs,x_row))

            if algo == 'lin':
                pred = output
                step = alpha * (y - pred)
            else:
                try:
                    pred = 1 / (1 + exp(-output))
                except OverflowError:
                    # exp(-output) is too large to represent, so the sigmoid is 0 to machine precision
                    pred = 0.0
                step = alpha * (y - pred) * pred * (1 - pred)

            coeffs += step * x_row

            if has_fixed_bias:
                coeffs[0] = bias_coeff

    ### This is to print how long the program took to run
    end_time = time()
    run_time = end_time - start_time

    mins = run_time // 60
    secs = run_time % 60
    hours = mins // 60
    mins = mins % 60

    print("Program took %s hr(s), %s min(s), %s sec(s) to run" % (hours,mins,secs))

    # Returns a list of all the coeffs (length should be number of features + 1 due to B0)
    return list(coeffs)

In [61]:
def make_predictions(dataset, coeff_list, output_col,cols_to_ignore = None,algo = 'lin',log_reg_thresh = 0.5):
    """
    This function takes in a pandas DataFrame and a list that contains
    coefficients for the specified algorithm used in the stoch_grad_desc function
    and returns a new DataFrame with the same rows and index plus a new column, 'Prediction'.

    There are 6 parameters: dataset, coeff_list, output_col, cols_to_ignore, algo, and log_reg_thresh

    1. dataset is the dataframe used to make predictions.  dataset needs to be a Pandas DataFrame.  It is not modified.
    2. coeff_list should be the list that was the result of running the stoch_grad_desc function:
       the bias coefficient first, then one coefficient per feature column in order.
    3. output_col is the target column name; it is moved to sit directly before 'Prediction'.
    4. cols_to_ignore is a column name or list of column names excluded from the model.  These columns are
       placed at the front of the result, preceded by a column holding the original index values
       (named 'index' for an unnamed index) -- kept for backward compatibility.
    5. algo is the specific algorithm to be used.  algo is set to 'lin' by default but can be set to 'log'
    6. log_reg_thresh is the cut-off point for a positive prediction for the logistic regression model

    For algo = 'log' two more columns are appended: 'Crisp' (1 when Prediction >= log_reg_thresh, else 0)
    and 'Correct?' (whether the column three places from the end, normally output_col, equals 'Crisp').

    Raises ValueError for an unknown algo, a missing output_col, or too few coefficients.
    """
    if algo not in ('lin','log'):
        raise ValueError("algo must be 'lin' or 'log'; got %r" % (algo,))

    loc_output_col = list(dataset.columns).index(output_col)
    dataset = dataset[list(dataset.columns[:loc_output_col]) + list(dataset.columns[loc_output_col+1:]) + list(dataset.columns[loc_output_col:loc_output_col + 1])]
    dataset_index = dataset.index

    if cols_to_ignore is not None:
        ignored_labels = cols_to_ignore if isinstance(cols_to_ignore,list) else [cols_to_ignore]
        # reset_index() without drop keeps the original index values as a leading column
        df_ignored_cols = dataset.loc[:,ignored_labels].reset_index()
        model_df = dataset.drop(cols_to_ignore,axis = 1)
    else:
        df_ignored_cols = None
        model_df = dataset

    # reset_index returns a new frame, so adding columns below never touches the caller's data
    dataset_out = model_df.reset_index(drop = True)

    # Every column except the last one (the output column) is a feature
    feature_values = dataset_out.iloc[:,:-1].to_numpy(dtype = float)
    num_coeffs_needed = feature_values.shape[1] + 1
    if len(coeff_list) < num_coeffs_needed:
        raise ValueError("coeff_list has %s coefficient(s) but %s are needed (bias + one per feature)" % (len(coeff_list),num_coeffs_needed))

    # Accumulate bias + c1*x1 + c2*x2 + ... one column at a time for all rows at once
    linear_output = np.zeros(feature_values.shape[0]) + coeff_list[0]
    for position in range(feature_values.shape[1]):
        linear_output = linear_output + coeff_list[position + 1] * feature_values[:,position]

    if algo == 'lin':
        dataset_out['Prediction'] = linear_output
    else:
        # An overflowing exp() yields inf, which makes the sigmoid 0.0 instead of raising
        with np.errstate(over = 'ignore'):
            dataset_out['Prediction'] = 1 / (1 + np.exp(-linear_output))

        dataset_out['Crisp'] = (dataset_out['Prediction'] >= log_reg_thresh).astype('int64')
        dataset_out['Correct?'] = dataset_out.iloc[:,-3] == dataset_out['Crisp']

    if df_ignored_cols is not None:
        dataset_out = pd.concat([df_ignored_cols,dataset_out],axis = 1)

    # Put the caller's original index back on the result
    dataset_out = dataset_out.set_index(dataset_index)
    return dataset_out

# K Nearest Neighbors

In [62]:
def knn(df_train,new_pts_list,dep_col,k = 7,only_pred_df = True):
    """
    This function performs a K Nearest Neighbors Algorithm with df_train being the labeled dataframe and new_pts_list being the
    unlabeled points.

    1. df_train is a Pandas DataFrame holding the feature columns and the label column.  It is not modified.
    2. new_pts_list is either a DataFrame or a list of points (lists, tuples or arrays).  Each point holds the
       feature values only, in the same order as the feature columns of df_train.
    3. dep_col is the target_variable (label column) name.
    4. k is the number of neighbors.  Default is 7.
    5. only_pred_df: if True (default) only the newly labelled rows are returned.  If False, the training rows
       followed by the newly labelled rows are returned.

    Points are labelled one at a time, and each labelled point is added to the labelled data before the next
    point is processed.  Distances are squared Euclidean distances over all feature columns.  The label is the
    most frequent label among the k nearest rows; ties go to the label seen first in nearest-first order.

    In the returned DataFrame dep_col is the last column and, once a point has been added, the index is 0, 1, 2, ...
    The run time is printed before returning.  Raises ValueError when k < 1 or when points are given but
    df_train has no rows.
    """
    from time import time
    start_time = time()

    if k < 1:
        raise ValueError("k must be at least 1; got %r" % (k,))

    loc_dep_col = list(df_train.columns).index(dep_col)
    df_in = df_train[list(df_train.columns)[0:loc_dep_col] + list(df_train.columns)[loc_dep_col + 1:] + list(df_train.columns)[loc_dep_col:loc_dep_col + 1]]

    # Normalise the new points to a list of plain lists, whatever container they arrived in
    if isinstance(new_pts_list,pd.DataFrame):
        new_pts_list = new_pts_list.to_numpy().tolist()
    else:
        new_pts_list = [list(point) for point in new_pts_list]
    nrows_new_pts = len(new_pts_list)

    if nrows_new_pts > 0 and len(df_in) == 0:
        raise ValueError("df_train must contain at least one labelled row")

    for new_point in new_pts_list:
        # 'DK' ("don't know") is a placeholder label until the neighbours have voted
        new_pt_df = pd.DataFrame(data = [new_point + ['DK']],columns = df_in.columns)
        df_out = pd.concat([df_in,new_pt_df],ignore_index = True)

        feature_df = df_out.drop(dep_col,axis = 1)
        new_pt_ind = len(feature_df) - 1

        labelled_feats = feature_df.iloc[:new_pt_ind].to_numpy(dtype = float)
        new_pt_feats = feature_df.iloc[new_pt_ind:].to_numpy(dtype = float)
        dist_list = np.square(labelled_feats - new_pt_feats).sum(axis = 1).tolist()

        # Stable sort: equally distant rows keep their original order
        closest_ind_list = sorted(range(len(dist_list)),key = dist_list.__getitem__)[:k]
        labels_list = list(df_out.loc[closest_ind_list,dep_col].values)

        # max() returns the first label with the highest count, i.e. the nearest one among tied labels
        df_out.loc[new_pt_ind,dep_col] = max(labels_list,key = labels_list.count)
        df_in = df_out

    end_time = time()
    run_time = end_time - start_time

    mins = run_time // 60
    secs = run_time % 60
    hours = mins // 60
    mins = mins % 60

    print("Program took %s hr(s), %s min(s), %s sec(s) to run" % (hours,mins,secs))

    if only_pred_df == True:
        # Positional slice from an explicit start so that zero new points gives an empty frame
        return df_in.iloc[len(df_in) - nrows_new_pts:]
    return df_in

In [63]:
# def knn_1pt(df,new_point,output_col,k = 3):
#     import statistics
#     df1 = df.copy()
#     new_pt_df = pd.DataFrame(data = [new_point + ['DK']],columns=df.columns)
#     df_out = df1.append(new_pt_df,ignore_index = True)
    
#     df2 = df_out.drop(output_col,axis=1)
#     count_rows = df2.count()[0]
#     new_pt_ind = count_rows - 1
#     df2['sum_sqrd_diffs'] = 0
#     for row_num in range(0,count_rows):
#         sum_sqrd_diffs = 0
#         for col_num in range(0,len(df2.columns)):
#             sum_sqrd_diffs += (df2.iloc[new_pt_ind,col_num] - df2.iloc[row_num,col_num])**2
#         df_out.loc[row_num,'sum_sqrd_diffs'] = sum_sqrd_diffs 
       
#     df_out.loc[new_pt_ind,output_col] = statistics.mode(df_out.iloc[:new_pt_ind,:].sort_values('sum_sqrd_diffs').head(k)[output_col])
#     #return df_out.sort_values('sum_sqrd_diffs').head(10)
#     return df_out
              
    

In [64]:
# def knn_old(df_train,new_pts_list,dep_col,k = 7,only_pred_df = True):
#     from time import time
#     start_time = time()
#     import statistics
    
#     nrows_new_pts = new_pts_list.shape[0]
#     loc_dep_col = list(df_train.columns).index(dep_col)
#     df_in = df_train[list(df_train.columns)[0:loc_dep_col] + list(df_train.columns)[loc_dep_col + 1:] + list(df_train.columns)[loc_dep_col:loc_dep_col + 1]] 
    
#     if type(new_pts_list) == type(df_in):
#         l_list_new_pts = []
        
#         for rn in range(new_pts_list.count()[0]):
#             l_list_new_pts.append(list(new_pts_list.iloc[rn,:]))
#         new_pts_list = l_list_new_pts

#     def knn_1pt(df,new_point,output_col = dep_col,k = k):
        
#         df1 = df.copy()
#         new_pt_df = pd.DataFrame(data = [new_point + ['DK']],columns=df1.columns)
#         df_out = df1.append(new_pt_df,ignore_index = True)

#         df2 = df_out.drop(output_col,axis=1)
#         count_rows = df2.count()[0]
#         new_pt_ind = count_rows - 1
#         df2['sum_sqrd_diffs'] = 0
#         for row_num in range(0,count_rows):
#             sum_sqrd_diffs = 0
#             for col_num in range(0,len(df2.columns)):
#                 sum_sqrd_diffs += (df2.iloc[new_pt_ind,col_num] - df2.iloc[row_num,col_num])**2
#             df_out.loc[row_num,'sum_sqrd_diffs'] = sum_sqrd_diffs 
            
#         if k == 1:
#             df_out.loc[new_pt_ind,output_col] = statistics.mode(df_out.iloc[:new_pt_ind,:].sort_values('sum_sqrd_diffs').head(k)[output_col])
#             df_in = df_out.drop('sum_sqrd_diffs',axis = 1).copy()
#         else:
#             try:
#                 mode = statistics.mode(df_out.sort_values('sum_sqrd_diffs')[0:k][output_col])
#             except:
#                 closest = list(df_out.sort_values('sum_sqrd_diffs')[0:k][output_col].values)
#                 copy = closest[:]
#                 closest.sort(key = lambda x:copy.count(x))
#                 closest.reverse()
#                 mode = closest[0]
#             df_out.loc[new_pt_ind,output_col] = mode
#             df_in = df_out.drop('sum_sqrd_diffs',axis = 1).copy()
#         return df_in
     
#     for np in new_pts_list:
#         df_in =  knn_1pt(new_point = np,df=df_in).copy()
    
#     end_time = time()
#     run_time = end_time - start_time

#     mins = run_time // 60
#     secs = run_time % 60
#     hours = mins // 60
#     mins = mins % 60
    
#     print("Program took %s hr(s), %s min(s), %s sec(s) to run" % (hours,mins,secs))    
    
#     if only_pred_df == True:
#         return df_in[-nrows_new_pts:]
#     return df_in

# K-Means Clustering

In [65]:
def kmeans(df,k=3,epoch = 1):
    """
    This function performs a KMeans-style clustering on the input dataframe, df.  k is the number of clusters.
    epoch is the number of iterations through the dataset the algorithm will perform.
    Defaults:
    k = 3
    epoch = 1

    How it works:
    1. k distinct rows are picked at random (random module) as the starting centroids; cluster numbers
       0 .. k-1 follow the order in which they were picked.
    2. Rows are visited in order.  Each row is assigned to the centroid with the smallest squared Euclidean
       distance (the lowest cluster number wins a tie), and that centroid is immediately moved halfway
       towards the row: centroid = (centroid + row) / 2.
    3. Step 2 is repeated epoch times; the assignments of the last pass are kept.

    Every column other than 'Cluster' is used as a feature, so they must all be numeric and free of nulls.

    IMPORTANT: df is modified in place -- a 'Cluster' column (float) is added or overwritten -- and the same
    df object is returned.  With epoch = 0 the column is left as NaN.

    Raises ValueError when k is smaller than 1 or larger than the number of rows.
    """
    import random

    cluster_col = 'Cluster'
    num_rows = len(df)
    if k < 1 or k > num_rows:
        raise ValueError("k must be between 1 and the number of rows (%s); got %r" % (num_rows,k))

    df[cluster_col] = np.nan

    feature_cols = [col for col in df.columns if col != cluster_col]
    points = df[feature_cols].to_numpy(dtype = float)

    # k distinct row positions in random order; sampling without replacement cannot loop forever
    centroid_positions = random.sample(range(num_rows),k)
    centroids = points[centroid_positions].copy()

    assignments = np.full(num_rows,np.nan)
    for iteration in range(epoch):
        for position in range(num_rows):
            point = points[position]
            sum_sqd_diffs = ((centroids - point) ** 2).sum(axis = 1)
            # argmin returns the first minimum, so the lowest cluster number wins a tie
            closest = int(np.argmin(sum_sqd_diffs))
            assignments[position] = closest
            # Online update: the winning centroid moves to the midpoint between itself and the point
            centroids[closest] = (centroids[closest] + point) / 2

    df[cluster_col] = assignments
    return df

# Classification and Regression Trees

In [66]:
def cart(df):
    """
    Finds the best single split point for a binary (0/1) target using a Gini score.

    Every value observed in the first two columns of df is tried as a threshold. Rows whose
    feature value is strictly less than the threshold form the left group; all other rows
    (including rows where the comparison is False because of a missing value) form the right group.

    The score of a split is the sum, over both groups and both classes, of p * (1 - p), where p is
    the share of the class inside the group. Groups are not weighted by size. An empty group
    contributes 0, so the split at a feature's minimum value gets a finite score instead of NaN.

    Parameters:
        df: Pandas DataFrame. The first two columns are the candidate features and the last
            column is the target, which is expected to be coded as 0 and 1.

    Returns:
        A list of tuples (row_position, feature_name, threshold_value, gini_split), one for every
        candidate split that reaches the lowest score. row_position is the 0-based position of
        the row that supplied the threshold (identical to the index label for a default
        RangeIndex). An empty list is returned when df has no rows.
    """
    feat_1 = df.columns[0]
    feat_2 = df.columns[1]
    target = df.columns[-1]

    def class_impurity(class_count,group_size):
        # p * (1 - p) for one class inside one group; an empty group has nothing to be impure about.
        if group_size == 0:
            return 0.0
        proportion = class_count/group_size
        return proportion * (1 - proportion)

    def gini(feat,row,dataset = df):
        # Score of splitting `dataset` on `feat` at the value found in the row at position `row`.
        feat_values = dataset[feat]
        in_left = feat_values < feat_values.iloc[row]
        in_right = ~in_left

        is_class_0 = dataset[target] == 0
        is_class_1 = dataset[target] == 1

        count_left = int(in_left.sum())
        count_right = int(in_right.sum())

        count_0_left = int((in_left & is_class_0).sum())
        count_1_left = int((in_left & is_class_1).sum())
        count_0_right = int((in_right & is_class_0).sum())
        count_1_right = int((in_right & is_class_1).sum())

        g_split = class_impurity(count_0_left,count_left) + \
                  class_impurity(count_0_right,count_right) + \
                  class_impurity(count_1_left,count_left) + \
                  class_impurity(count_1_right,count_right)
        return g_split

    # One candidate per (feature, row): the row's own feature value is the threshold.
    candidates = []
    for feat in (feat_1,feat_2):
        for i in range(len(df)):
            gini_split = gini(dataset = df,feat = feat,row = i)
            candidates.append((i,feat,df[feat].iloc[i],gini_split))

    if not candidates:
        return []

    gini_min = min(candidate[-1] for candidate in candidates)
    l_final = [candidate for candidate in candidates if candidate[-1] == gini_min]

    return l_final

### Seaborn Palettes

In [67]:
# Palette / colormap names that can be passed as a seaborn `palette` argument.
# Each `*_r` entry is listed alongside its base name.
seaborn_palettes = [
    'Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r',
    'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r',
    'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r',
    'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r',
    'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r',
    'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn',
    'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r',
    'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r',
    'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r',
    'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r',
    'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r',
    'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r',
    'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r',
    'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r',
    'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r',
    'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'cubehelix', 'cubehelix_r',
    'flag', 'flag_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r',
    'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r',
    'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2',
    'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r',
    'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r',
    'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r',
    'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r',
    'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r',
    'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r',
    'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r',
    'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'twilight', 'twilight_r',
    'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r',
    'winter', 'winter_r',
]

In [1]:
# Quick reference of the helpers defined in this notebook. Each entry is one signature; the
# entries are joined with ",\n\n" so every signature but the last ends with a comma and is
# followed by a blank line.
print("Available functions:\n\n\n" + ",\n\n".join([
    "create_random_df(model,nrows,ncols,nclasses = None,target_multiplier = 1)",
    "find_cols_to_ignore(df,thresh_cat = 0.02,cols_to_ignore = None)",
    "impute_cols(df,k=None,null_cols = 'auto_select',thresh_null_cat = 0.02,cols_to_ignore = None)",
    "num_unique_values_in_cols(df,cols = None)",
    "remove_outliers(df_in,outlier_cols = None,thresh_cat = 0.02,remove = True)",
    "remove_high_corr_cols(dataframe,cols_to_use = None,cols_to_avoid = None,num_corr_cols_tolerated = None,\n\tmax_corr_tolerated = 0.8,num_iterations = 5,type_corr = 'r_squared')",
    "convert_cat(df, cat_cols = None,output_col=None,reset_index = False)",
    "combine_dummy_variable_columns(data,dummy_var_cols,name_agg_col,remove_common_word = False,common_word = None)",
    "normalize(df,cols_to_ignore = None,target_col = None)",
    "split_df(df,test_size = 0.3,df_to_return = 'df_train')",
    "shuffled_split_dfs(df,test_size = 0.3)",
    "check_accuracy(df = None,pred_df = None, test_df = None, algo = 'lin',target_class = None)",
    "barplot_absolute(dataset,dep_col,ind_col,hue_col = None, is_aggregated_bool = False, aggregate_func = None, magnitude = None,ind_cols_order = None, plot_palette = 'mako', plt_figsize_x = 22, plt_figsize_y = 14, ymin = 0,y_max_multiplier = 1.3,num_decimals = 1, annotate_percentages = False, perc_annot_height_mult = 0.05, perc_annot_num_decimals = 1, plot_title_text = None, plot_title_fsize = 30, xlabel_text = None, xlabel_fsize = 22, ylabel_text = None, ylabel_fsize = 22, x_tick_fsize = 15, y_tick_fsize = 18, xtick_rotation = 65, ytick_rotation = 0, annot_fsize = 12, legend_fsize = 15, hlinewidth = 1, hlinecolor1 = 'black', hlinecolor2 = 'black')",
    "barplot_percent(dataset,dep_col,ind_col,total_col,hue_col = None)",
    "stoch_grad_desc(dataset,output_col,cols_to_ignore = None,alpha = 0.1,epoch = 10,algo = 'lin',initial_coeffs = 1, bias_coeff = None)",
    "make_predictions(dataset, coeff_list, output_col,cols_to_ignore = None,algo = 'lin',log_reg_thresh = 0.5)",
    "knn(df_train,new_pts_list,dep_col,k = 7,only_pred_df = True)",
    "kmeans(df,k=3,epoch = 1)",
    "cart(df)",
]) + "\n\n\nAdditional Info:\n\nseaborn_palettes is a list of all seaborn palette names")

Available functions:


create_random_df(model,nrows,ncols,nclasses,target_multiplier = 1),

find_cols_to_ignore(df,thresh_cat = 0.02,cols_to_ignore = None),

impute_cols(df,k=None,null_cols = 'auto_select',thresh_null_cat = 0.02,cols_to_ignore = None),

num_unique_values_in_cols(df,cols = None),

remove_outliers(df_in,outlier_cols = None,thresh_cat = 0.02,remove = True),

def remove_high_corr_cols(dataframe,cols_to_use = None,cols_to_avoid = None,num_corr_cols_tolerated = None,
	max_corr_tolerated = 0.8,num_iterations = 5,type_corr = 'r_squared'),

convert_cat(df, cat_cols = None,output_col=None,reset_index = False),

combine_dummy_variable_columns(data,dummy_var_cols,name_agg_col,remove_common_word = False,common_word = None),

normalize(df,cols_to_ignore = None,target_col = None),

split_df(df,test_size = 0.3,df_to_return = 'df_train'),

shuffled_split_dfs(df,test_size = 0.3),

check_accuracy(df = None,pred_df = None, test_df = None, algo = 'lin',target_class = None),

barplot_absolu