In [None]:

#########################################################################
#                                                                       #
#               <<  FUNCTIONS IN THIS FILE      >>                      #
#                                                                       #
# Short print: p()                                                      #
# Load CSV: load_csv()                                                  #
# Plot data: plot_data()                                                #
# Correlation Matrix: corr_matrix()                                     #
# Top correlated pairs: top_pairs()                                     #
# Random forest model: random_forest()                                  #
# Prediction vs Observation: pred_obs()                                 #
# Interaction term: interaction()                                       #
#                                                                       #
########################################################################


###############################################################################

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.preprocessing import LabelEncoder as LE
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')

###############################################################################

# Short print function
def p(*args):
    """Print every argument on one line, each one followed by a single space.

    Arguments are converted with ``str``; the printed line therefore ends
    with a trailing space (or is empty when no arguments are given).
    """
    print(''.join(str(item) + ' ' for item in args))


# Load CSV files
def load_csv(filename):
    """Read a comma-separated file into a pandas DataFrame.

    A confirmation line is printed on success. If anything goes wrong the
    exception and an error message are printed and ``None`` is returned
    instead of a DataFrame.
    """
    try:
        frame = pd.read_csv(filename, delimiter=',')
        print('Cargado con éxito (', filename, ')')
    except Exception as err:
        print(err)
        print('No se pudo cargar el archivo!')
        return None
    return frame


# Plot data with a better resolution
# inputs are: x, y, pltype, fig size, legend, title, xlabel, ylabel, color of plot, trend_line, regression degree*, color of reg line*
# * values are optional (only used if trend_line is True)
# currently accepted plot types: 'line' plot, 'scatter' plot, 'dashed' plot, 'bar' plot
def plot_data(*args):
    """Draw one x/y plot with enlarged fonts and display it.

    Positional arguments, in order:
        x, y: data to plot.
        pltype: 'scatter', 'line' (alias 'linear'), 'dashed' or 'bar'.
            Any other value draws no data series.
        size: (width, height) of the figure.
        leg: legend label; an empty string suppresses the legend.
        title, xlabel, ylabel: title and axis label texts.
        color_plot: colour of the data series.
        trend_line: True to overlay a polynomial fit of y on x.
        reg_degree (optional): degree of the fit, defaults to 1.
        reg_color (optional): colour of the fit line, defaults to 'purple'.

    Returns:
        None. When fewer than the ten mandatory arguments are given a
        message is printed and nothing is drawn.
    """
    # The first ten arguments are mandatory; stop here instead of failing
    # later on an unbound local.
    if len(args) < 10:
        print('Número incorrecto de inputs')
        return None
    (x, y, pltype, size, leg, title,
     xlabel, ylabel, color_plot, trend_line) = args[:10]

    # Strict comparison kept on purpose: only True (or 1) enables the fit.
    draw_trend = trend_line == True  # noqa: E712
    reg_degree = args[10] if len(args) > 10 else 1
    reg_color = args[11] if len(args) > 11 else 'purple'

    # plot settings
    fig, ax = plt.subplots(1)
    fig.set_figwidth(size[0])
    fig.set_figheight(size[1])

    # The header comment of this helper documents 'linear', so accept it as
    # an alias of 'line'.
    is_line = pltype in ('line', 'linear')

    # select plot type
    if pltype == 'scatter':
        ax.scatter(x, y, label=leg, color=color_plot)
    elif is_line:
        ax.plot(x, y, label=leg, linestyle='-', color=color_plot)
    elif pltype == 'dashed':
        ax.plot(x, y, label=leg, linestyle='--', color=color_plot)
    elif pltype == 'bar':
        ax.bar(x, y, label=leg, color=color_plot)

    # trend line
    if draw_trend:
        fit = np.poly1d(np.polyfit(x, y, deg=reg_degree))
        # Evaluate the fit on sorted x so the curve is drawn left to right;
        # in data order an unsorted x produces a zig-zag for degree > 1.
        x_sorted = np.sort(np.asarray(x))
        ax.plot(x_sorted, fit(x_sorted), linestyle='-.', color=reg_color)

    # plot display
    ax.set_xlabel(xlabel, fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    if leg != '':
        ax.legend(fontsize=16)
    ax.set_title(title, fontsize=24)

    # tick size (tick_params replaces the per-tick `tick.label` access,
    # which is not available in recent Matplotlib releases)
    if pltype == 'scatter' or is_line:
        ax.tick_params(axis='both', which='major', labelsize=14)

    plt.show()
    return None



# Correlation Matrix
# name is type of correlation: 'pearson', 'spearman', 'ppscore'
def corr_matrix(df, name):
    """Draw an annotated heatmap of pairwise column scores.

    Args:
        df: DataFrame to analyse.
        name: 'pearson' or 'spearman' for a correlation matrix computed on
            the numeric/boolean columns, or 'ppscore' for the predictive
            power score matrix of all columns. Any other value draws
            nothing.

    Returns:
        None. The heatmap is drawn with seaborn; this function does not
        call ``plt.show()`` itself.
    """
    if name in ('pearson', 'spearman'):
        # Restrict to numeric/bool columns explicitly: recent pandas raises
        # on object columns instead of silently skipping them.
        numeric = df.select_dtypes(include=['number', 'bool'])
        scores = numeric.corr(method=name)
    elif name == 'ppscore':
        table = pps.matrix(df)
        scores = table[['x', 'y', 'ppscore']].pivot(
            columns='x', index='y', values='ppscore')
    else:
        return None

    # NOTE: the colour scale starts at 0, so negative correlations all get
    # the lightest colour (their value is still shown by the annotation).
    sns.heatmap(scores, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
    return None


# Get top correlated pairs
# name is the type of correlation: 'pearson', 'spearman', 'ppscore'
# n is how many correlated pairs should be shown
def top_pairs(df, name, n):
    """Print the n most strongly related column pairs.

    Args:
        df: DataFrame to analyse.
        name: 'pearson' or 'spearman' to rank pairs of numeric/boolean
            columns by absolute correlation, or 'ppscore' to rank ordered
            (x, y) pairs by predictive power score. Any other value prints
            nothing.
        n: number of pairs to print.

    Returns:
        None; the ranking is printed.
    """
    if name == 'pearson' or name == 'spearman':
        # Correlate numeric/bool columns only: recent pandas raises on
        # object columns instead of skipping them.
        numeric = df.select_dtypes(include=['number', 'bool'])
        corr = numeric.corr(method=name)

        # Build the redundant labels (diagonal plus mirrored half) from the
        # columns actually present in the matrix, so that non-numeric
        # columns of df cannot cause a missing-label error in drop().
        cols = corr.columns
        redundant = [(cols[i], cols[j])
                     for i in range(len(cols))
                     for j in range(i + 1)]

        corr_pairs = corr.abs().unstack()
        corr_pairs = corr_pairs.drop(labels=redundant).sort_values(ascending=False)
        print(corr_pairs.iloc[:n])
    elif name == 'ppscore':
        scores = pps.matrix(df)[['x', 'y', 'ppscore']]
        # Remove self-pairs explicitly; dropping "the first len(columns)
        # rows after sorting" is wrong whenever another pair ties with them.
        scores = scores[scores['x'] != scores['y']]
        ranked = scores.sort_values(by='ppscore', ascending=False)
        print(ranked.iloc[:n].reset_index(drop=True))


# Random Forest Model
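# regression: True for a regression model, False for a classification model
# target is the name of the column to predict
# n_est is the number of estimators (can be a list or an integer)
# m_dep is the maximum depth of the trees (can be a list or an integer)
# split is the fraction of rows used for training; the remaining rows are used for validation (between 0.01 and 0.99)
# criteria is the training criterion: 'mse' or 'mae' for regression, 'gini' or 'entropy' for classification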
def _as_param_list(value):
    """Return *value* as a list, wrapping a single setting in a one-item list."""
    if isinstance(value, (list, tuple, range, np.ndarray)):
        return list(value)
    return [value]


def _encode_for_forest(df, target):
    """Return an encoded copy of *df* with object columns encoded and NaN rows dropped."""
    data = df.copy()  # never touch the caller's frame
    hot_cols, label_cols = [], []
    for col, dtype in data.dtypes.items():
        if dtype != 'object':
            continue
        if col == target:
            # The target must stay a single column, so it is never one-hot
            # encoded; label encoding keeps it usable for the MAE score.
            label_cols.append(col)
        elif len(data[col].unique()) < 20:
            hot_cols.append(col)
        else:
            label_cols.append(col)

    # label encoding: a fresh encoder per column, fitted on that column only
    for col in label_cols:
        data[col] = LE().fit_transform(data[col])

    # one hot encoding
    # NOTE: one shared prefix means equal category values coming from two
    # different columns produce identically named dummy columns.
    data = pd.get_dummies(data, columns=hot_cols, prefix='enc_')

    # drop rows that still contain missing values
    return data.dropna()


def random_forest(df, regression, target, n_est, m_dep, split, *criteria):
    """Grid-search a random forest and return the model with the lowest MAE.

    The data is encoded on a private copy (``df`` itself is not modified):
    object columns with fewer than 20 distinct values are one-hot encoded,
    the remaining object columns (and an object-typed target) are label
    encoded, and rows that still contain missing values are dropped. The
    first ``split`` fraction of the rows is used for training and the rest
    for validation; rows are taken in their existing order, not shuffled.
    Every (n_estimators, max_depth) combination is fitted and scored with
    the mean absolute error on the validation rows.

    Args:
        df: input DataFrame containing the features and the target column.
        regression: truthy for a regressor, falsy for a classifier.
        target: name of the column to predict.
        n_est: number of estimators; a single value or a list/tuple/range/
            array of values to try.
        m_dep: maximum tree depth; a single value or a list/tuple/range/
            array of values to try.
        split: fraction of rows used for training.
        *criteria: optional training criterion (only the first value is
            used). Regression accepts 'mse', 'mae', 'squared_error' and
            'absolute_error'; classification accepts 'gini' and 'entropy'.
            The name is passed to the estimator unchanged, so it has to be
            one the installed scikit-learn version understands. When
            omitted (or None) the estimator's own default is used.

    Returns:
        The fitted model with the lowest validation MAE, or None when the
        criterion is not valid for the chosen model type (a message is
        printed in that case).
    """
    # training criterion: validate first so nothing is computed for a call
    # that cannot succeed
    reg_criteria = {'mse', 'mae', 'squared_error', 'absolute_error'}
    cla_criteria = {'gini', 'entropy'}
    criterion = None
    if criteria and criteria[0] is not None:
        criterion = str(criteria[0])
        allowed = reg_criteria if regression else cla_criteria
        if criterion not in allowed:
            print('Inadequate criteria for chosen model')
            print('Regression: mse,mae  |  Classifier: gini,entropy')
            return None

    # categorical encoding and removal of incomplete rows
    data = _encode_for_forest(df, target)

    # data splitting: rows [0, cut) train the model, rows [cut, end) validate it
    cut = round(data.shape[0] * split)
    x_cols = list(data.columns)
    x_cols.remove(target)
    train_X, train_y = data[x_cols][:cut], data[target][:cut]
    test_X, test_y = data[x_cols][cut:], data[target][cut:]

    # accept single values as well as collections
    n_est = _as_param_list(n_est)
    m_dep = _as_param_list(m_dep)

    # model training
    estimator = RFR if regression else RFC
    extra = {} if criterion is None else {'criterion': criterion}
    best_mae = float('inf')
    best_model = None
    best_pair = [0, 0]
    for n in tqdm(n_est):
        for m in m_dep:
            model = estimator(n_estimators=n, max_depth=m, **extra)
            model.fit(train_X, train_y)
            mae = MAE(test_y, model.predict(test_X))
            if mae < best_mae:
                best_model = model
                best_mae = mae
                best_pair = [n, m]

    # model results
    print('No. of estimators: ', best_pair[0], ' | Max depth of tree: ', best_pair[1])
    print('MAE: ' , best_mae)

    return best_model


# Compare model prediction with observed value
def pred_obs(model, df, target, row):
    """Print the model prediction for one row next to the observed value.

    Args:
        model: fitted object exposing ``predict``.
        df: DataFrame holding the feature columns and the target column.
        target: name of the target column; every other column of ``df`` is
            passed to the model as a feature.
        row: positional index of the row to compare.

    Returns:
        None; prediction, observation, absolute difference and percent
        error are printed.
    """
    # every column except the target is a feature
    feature_cols = list(df.columns)
    feature_cols.remove(target)

    # Select by position and keep a one-row DataFrame so that predict()
    # receives 2-D input (a tuple wrapping the frame is not accepted).
    x_obs = df[feature_cols].iloc[[row]]
    y_obs = df[target].iloc[row]

    # predict() returns an array (possibly nested); take its first scalar
    y_pred = np.asarray(model.predict(x_obs)).ravel()[0]

    print('Prediction: ', y_pred, ' | Observed: ', y_obs, ' | Difference: ', abs(y_pred - y_obs))
    # NOTE: the percent error is relative to the observed value and is
    # therefore not finite when the observed value is 0.
    print('Percent Error  (%): ', round((y_pred - y_obs)/y_obs*100,2))


# Add interaction terms
# termlist is the list of columns names to include
# agg is the interaction type: 'sum', 'sub', 'mul', 'div'
# name is the name of the new interaction column
def interaction(df, termlist, agg, name):
    """Add a column that combines several existing columns.

    Args:
        df: DataFrame to extend; it is modified in place.
        termlist: names of the columns to combine, in order.
        agg: 'sum' (a + b + ...), 'sub' (a - b - ...), 'mul' (a * b * ...)
            or 'div' (a / b / ...). Any other value leaves ``df`` unchanged.
        name: name of the new column.

    Returns:
        The same DataFrame object. If the columns cannot be combined
        (missing column, incompatible types, empty ``termlist`` for
        'sub'/'div') a message is printed and no column is added.
    """
    from functools import reduce
    import operator

    # agg -> (binary operator, start value, start from first column?, message)
    # The messages are kept exactly as they were (two of them end with a period).
    recipes = {
        'sum': (operator.add, 0, False, 'No se pueden combinar los términos'),
        'sub': (operator.sub, 0, True, 'No se pueden combinar los términos'),
        'mul': (operator.mul, 1, False, 'No se pueden combinar los términos.'),
        'div': (operator.truediv, 0, True, 'No se pueden combinar los términos.'),
    }
    recipe = recipes.get(agg) if isinstance(agg, str) else None
    if recipe is None:
        return df

    combine, start, seed_with_first, failure_msg = recipe
    try:
        columns = [df[term] for term in termlist]
        if seed_with_first:
            # 'sub' and 'div' start from the first column and apply the
            # operator with each remaining column in turn
            start = start + columns[0]
            columns = columns[1:]
        df[name] = reduce(combine, columns, start)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still propagate
        print(failure_msg)

    return df