In [None]:
import os
import io
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow import keras

from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats
from sklearn.metrics import precision_recall_curve, roc_curve

from google.colab import files

def connect_kaggle():
  !pip install -q kaggle
  files.upload()
  !mkdir ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json
  !kaggle datasets list

def download_kaggle(name):
  """Run ``kaggle competitions download -c <name>`` through the shell.

  Args:
    name: Competition identifier passed to the ``-c`` option.
  """
  import shlex  # local import: only needed to build the command line

  # shlex.quote escapes quotes/spaces/metacharacters, so an unusual name can
  # neither break the command nor inject extra shell commands.
  os.system(f"kaggle competitions download -c {shlex.quote(str(name))}")

def submit_kaggle(title, filename, message="No message specified"):
    """Run ``kaggle competitions submit`` through the shell.

    Args:
        title: Competition identifier passed to ``-c``.
        filename: Path of the submission file passed to ``-f``.
        message: Submission message passed to ``-m``.
    """
    import shlex  # local import: only needed to build the command line

    # Each argument is quoted individually so that apostrophes or spaces
    # (e.g. a message such as "Don't overfit") cannot break the command.
    command = " ".join([
        "kaggle competitions submit",
        "-c", shlex.quote(str(title)),
        "-f", shlex.quote(str(filename)),
        "-m", shlex.quote(str(message)),
    ])
    os.system(command)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied first.
        fig_extension: File extension, also used as the ``format`` argument.
        resolution: Value forwarded as ``dpi`` to ``plt.savefig``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

def plot_precision_recall_vs_threshold(true_label, pred, ax=None):
    """Plot precision and recall against the decision threshold.

    Args:
        true_label: Ground-truth labels, forwarded to ``precision_recall_curve``.
        pred: Scores, forwarded to ``precision_recall_curve``.
        ax: Axes to draw on. When omitted the current pyplot axes are used.

    Returns:
        The axes that were drawn on.
    """
    precisions, recalls, thresholds = precision_recall_curve(true_label, pred)
    if ax is None:
        # plt.plot() returns a list of Line2D objects, not an Axes, so the
        # axes must be fetched explicitly before calling Axes methods on it.
        ax = plt.gca()

    # precisions/recalls carry one more entry than thresholds; the trailing
    # entry is dropped so the arrays line up.
    ax.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    ax.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    ax.set_xlabel("Threshold", fontsize=16)
    ax.legend(loc="upper left", fontsize=16)
    ax.set_ylim([0, 1])

    return ax

def plot_roc_curve(true_label, pred_score, label=None, ax=None):
    """Plot a ROC curve together with the chance diagonal.

    Args:
        true_label: Ground-truth labels, forwarded to ``roc_curve``.
        pred_score: Scores, forwarded to ``roc_curve``.
        label: Optional legend label for the curve. A legend is only drawn
            when a label is given.
        ax: Axes to draw on. When omitted the current pyplot axes are used.

    Returns:
        The axes that were drawn on.
    """
    fpr, tpr, _ = roc_curve(true_label, pred_score)
    if ax is None:
        ax = plt.gca()

    ax.plot(fpr, tpr, linewidth=2, label=label)
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylabel('True Positive Rate', fontsize=16)
    if label is not None:
        ax.legend(loc="lower right", fontsize=16)
    return ax

def display_score(score):
  """Print the scores, then their mean and standard deviation.

  Args:
    score: Object exposing ``mean()`` and ``std()`` methods (for example a
      NumPy array of per-fold scores).
  """
  print("Individual score: ", score)
  # Each statistic is computed lazily, right before its line is printed.
  for caption, statistic in (("Mean: ", "mean"), ("Standard deviation: ", "std")):
    print(caption, getattr(score, statistic)())

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, \
                            recall_score, f1_score, precision_recall_curve, \
                            roc_curve, roc_auc_score

def evaluate_model(model, X_train, y_train, num_label, cv=3, proba=False):
  """Cross-validate a classifier, print its metrics and plot diagnostics.

  Prints recall, precision, F1 (and ROC AUC when ``proba`` is true) computed
  from cross-validated predictions, prints the cross-validation scores, and
  draws a confusion-matrix heatmap. When ``proba`` is true the
  precision/recall-vs-threshold and ROC plots are added to the same figure.

  Args:
    model: Estimator passed to the scikit-learn cross-validation helpers.
    X_train: Training features.
    y_train: Training labels.
    num_label: Number of classes; used to label the confusion-matrix axes
      ``0 .. num_label - 1``.
    cv: Cross-validation setting forwarded to scikit-learn.
    proba: When true, scores are additionally collected with
      ``method='decision_function'``, so the model must provide that method.
  """
  # cross_val_score is not among the names imported at the top of this cell;
  # importing it here keeps the function self-contained.
  from sklearn.model_selection import cross_val_score

  pred = cross_val_predict(model, X_train, y_train, cv=cv)
  class_ids = list(range(num_label))
  df_conf = pd.DataFrame(confusion_matrix(y_train, pred),
                         columns=class_ids, index=class_ids)

  if proba:
    pred_proba = cross_val_predict(model, X_train, y_train, cv=cv, method='decision_function')

  print("Metrics ")
  print("-------")
  print("Recall: ", recall_score(y_train, pred))
  print("Precision: ", precision_score(y_train, pred))
  print("F1_score: ", f1_score(y_train, pred))
  print("ROC_AUC Score: ", roc_auc_score(y_train, pred_proba) if proba else "")

  print("Cross validation score")
  print("----------------------")
  display_score(cross_val_score(model, X_train, y_train, cv=cv))

  # Top-left cell of a 2x2 grid.
  ax1 = plt.subplot(2, 2, 1)
  sns.heatmap(df_conf, annot=True, ax=ax1)
  ax1.set_xlabel("Model prediction")
  ax1.set_ylabel("True prediction")
  ax1.set_title("Confusion matrix")

  if proba:
    # Top-right cell of the 2x2 grid.
    ax2 = plt.subplot(2, 2, 2)
    plot_precision_recall_vs_threshold(y_train, pred_proba, ax=ax2)

    # Bottom half of the figure (second row of a 2x1 grid).
    ax3 = plt.subplot(2, 1, 2)
    plot_roc_curve(y_train, pred_proba, label="ROC_curve", ax=ax3)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

def preprocess_data(data, drop=[], cat_col=[], label=[], test_size = 0.2, random_state=None):
  '''
  Standardize the numerical data,
  and split the data into training and testing dataset.

  Args:
    data: Input DataFrame.
    drop: Column(s) removed before anything else happens.
    cat_col: Column name(s) that must NOT be standardized; they are passed
      through unchanged.
    label: Target column name(s). A single string or a list is accepted.
    test_size: Forwarded to ``train_test_split``.
    random_state: Forwarded to ``train_test_split``; set it to make the
      split reproducible.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``. The feature parts are the
    output of the ``ColumnTransformer``: scaled numeric columns first (in
    their original DataFrame order), then the passthrough columns.
  '''
  data = data.drop(drop, axis=1)

  # Treat a bare string as one column name; set("target") would otherwise
  # be split into single characters.
  label_cols = [label] if isinstance(label, str) else list(label)
  cat_cols = [cat_col] if isinstance(cat_col, str) else list(cat_col)
  excluded = set(cat_cols) | set(label_cols)

  # Keep the DataFrame's own column order. Building the list from a set made
  # the order of the scaled output columns vary from run to run.
  num_col = [col for col in data.columns if col not in excluded]

  X_train, X_test, y_train, y_test = train_test_split(data.drop(label, axis=1),
                                                      data[label],
                                                      test_size=test_size,
                                                      random_state=random_state)
  std_scaler = StandardScaler()
  pipeline = ColumnTransformer([
                       ("std_scaler", std_scaler, num_col)
  ], remainder="passthrough")
  # Fit on the training split only, then apply the same scaling to the test split.
  X_train = pipeline.fit_transform(X_train)
  X_test = pipeline.transform(X_test)
  return (X_train, X_test, y_train, y_test)

In [None]:
class Fill_Na(BaseEstimator, TransformerMixin):
  """Stateless transformer that replaces missing values with a constant.

  Args:
    filler: Replacement value for missing entries.

  Note:
    For NumPy input the work is done by ``np.nan_to_num``, which by default
    also replaces +/-infinity with very large finite numbers; the pandas
    path (``fillna``) leaves infinities untouched.
  """

  def __init__(self, filler):
    self.filler = filler

  def fit(self, X, y=None):
    """Nothing to learn; returns ``self`` so the step can be chained."""
    return self

  def transform(self, X):
    """Return ``X`` with missing values replaced by ``self.filler``.

    Args:
      X: ``np.ndarray``, ``pd.DataFrame`` or ``pd.Series`` (subclasses of
        these are accepted as well).

    Raises:
      TypeError: If ``X`` is none of the supported container types.
    """
    # isinstance (rather than an exact type comparison) so that subclasses
    # are not rejected.
    if isinstance(X, np.ndarray):
      return np.nan_to_num(X, nan=self.filler)

    if isinstance(X, (pd.DataFrame, pd.Series)):
      return X.fillna(self.filler)

    raise TypeError("Unknown type of X")


class Drop_Col(BaseEstimator, TransformerMixin):
  """Stateless transformer that removes columns from a DataFrame.

  Args:
    col_list: Column label(s) handed to ``DataFrame.drop(..., axis=1)``.
  """

  def __init__(self, col_list):
    self.col_list = col_list

  def fit(self, X, y=None):
    """Nothing to learn; returns ``self`` so the step can be chained."""
    return self

  def transform(self, X):
    """Return a copy of ``X`` without the configured columns.

    Raises:
      TypeError: If ``X`` is not a ``pd.DataFrame`` (or subclass).
    """
    # isinstance so that DataFrame subclasses are accepted too.
    if not isinstance(X, pd.DataFrame):
      raise TypeError(f"Drop Col should take a DataFrame argument, not {type(X)}")

    return X.drop(self.col_list, axis=1)

class Impute(BaseEstimator, TransformerMixin):
  """Impute one DataFrame column with a scikit-learn ``SimpleImputer``.

  Args:
    col: Name of the column to impute.
    strategy: ``SimpleImputer`` strategy.
    reference: Optional DataFrame. When given, the imputer is fitted on
      ``reference[col]`` instead of on the data passed to ``fit``.
  """

  def __init__(self, col, strategy, reference=None):
    # SimpleImputer is not imported at the top of the notebook; importing it
    # here keeps the class usable on a fresh kernel.
    from sklearn.impute import SimpleImputer

    self.col = col
    self.strategy = strategy
    self.reference = reference
    self.imputer = SimpleImputer(strategy=self.strategy)

  def fit(self, X, y=None):
    """Fit the imputer on ``reference`` when provided, otherwise on ``X``."""
    source = X if self.reference is None else self.reference
    self.imputer.fit(source[[self.col]])
    return self

  def transform(self, X):
    """Return a copy of ``X`` whose ``col`` column has been imputed.

    Raises:
      TypeError: If ``X`` is not a ``pd.DataFrame`` (or subclass).
    """
    if not isinstance(X, pd.DataFrame):
      raise TypeError(f"Impute should take a DataFrame argument, not {type(X)}")

    # Work on a copy so the caller's DataFrame is not modified in place.
    X = X.copy()
    # Flatten the single-column imputer output into a 1-D column.
    X[self.col] = np.asarray(self.imputer.transform(X[[self.col]])).ravel()
    return X

In [None]:
import os
import tarfile
import zipfile

def unzip_tar(file_path, target='.'):
  """Extract every member of a tar archive into ``target``.

  Args:
    file_path: Path of the tar archive (compression is auto-detected by
      ``tarfile.open``).
    target: Destination directory.
  """
  # The context manager closes the archive even when extraction fails.
  # NOTE(review): extractall() trusts member paths; only use on archives
  # from a trusted source.
  with tarfile.open(file_path) as archive:
    archive.extractall(path=target)

def unzip(file_path, target='.'):
  """Extract every member of a zip archive into ``target``.

  Args:
    file_path: Path of the zip archive.
    target: Destination directory.
  """
  archive = zipfile.ZipFile(file_path, mode='r')
  try:
    archive.extractall(path=target)
  finally:
    # Always release the file handle, even if extraction raised.
    archive.close()

In [None]:
import time

def generate_dir(name="Untitled", date=False):
  """Build a run directory path underneath ``TENSORBOARD_DIR``.

  Args:
    name: Sub-directory name used when ``date`` is false.
    date: When true, ``name`` is ignored and a timestamp of the form
      ``run_YYYY_MM_DD-HH_MM_SS`` is used instead.

  Returns:
    The joined path (nothing is created on disk).
  """
  leaf = time.strftime("run_%Y_%m_%d-%H_%M_%S") if date else name
  return os.path.join(TENSORBOARD_DIR, leaf)


In [None]:
def upload_files(dataframe=False, xlsx=False):
  """Open the Colab upload dialog and return what was uploaded.

  Args:
    dataframe: When true, every uploaded file is parsed into a DataFrame.
    xlsx: Only used together with ``dataframe=True``: parse the files with
      ``pd.read_excel`` instead of ``pd.read_csv``.

  Returns:
    A list of DataFrames (one per uploaded file) when ``dataframe`` is true,
    otherwise the raw mapping returned by ``files.upload()``.
  """
  uploaded = files.upload()
  if not dataframe:
    return uploaded

  # Honour the xlsx flag; CSV stays the default reader.
  reader = pd.read_excel if xlsx else pd.read_csv
  return [reader(io.BytesIO(content)) for content in uploaded.values()]

def locate_outlier(df, columns, zscore_threshold, any=True, exclude=False):
  """Select rows by the absolute z-score of the given columns.

  Args:
    df: Input DataFrame.
    columns: List of column labels whose z-scores are inspected.
    zscore_threshold: Cut-off applied to ``abs(zscore)``.
    any: ``True`` -> a row qualifies when at least one column satisfies the
      test; ``False`` -> every column must satisfy it. (The parameter name
      shadows the builtin inside this function.)
    exclude: ``False`` -> the test is ``abs(z) > threshold`` and the result
      gains an ``Outlier_field`` column holding the concatenated names of
      the columns that exceeded the threshold in that row.
      ``True`` -> the test is ``abs(z) < threshold`` and the matching rows
      are returned unchanged.

  Returns:
    A DataFrame with the selected rows (original index preserved).
  """
  # Compute the z-scores once and work on a plain array so that row masks
  # are positional and independent of df's index.
  abs_z = np.abs(np.asarray(stats.zscore(df[columns])))

  if exclude:
    within = abs_z < zscore_threshold
    keep = within.any(axis=1) if any else within.all(axis=1)
    return df[keep]

  flagged = abs_z > zscore_threshold
  rows = flagged.any(axis=1) if any else flagged.all(axis=1)
  selected = df[rows]

  # Build the annotation only for the selected rows and give it their index.
  # A fresh RangeIndex (or annotating every row with *any* outlier while
  # selecting rows with *all* outliers) would not line up with `selected`.
  names = [str(col) for col in columns]
  outlier_field = pd.Series(
      ["".join(name for name, hit in zip(names, row) if hit)
       for row in flagged[rows]],
      index=selected.index,
      name="Outlier_field",
      dtype=object,
  )
  return pd.concat([selected, outlier_field], axis=1)


In [None]:
import ipywidgets as widget

class BasicUI(object):
  """Small ipywidgets panel for inspecting DataFrames and pandas display options.

  Args:
    tracked_df: Options for the DataFrame dropdown. Each selected value is
      looked up in ``globals()``, so these are expected to be the names of
      module-level DataFrame variables.
  """

  def __init__(self, tracked_df):
    self._tracked_df = tracked_df
    self._df_select = widget.Dropdown(description="DataFrame", options=tracked_df)
    self._view_option = widget.RadioButtons(description="View", options=["general", "info", "describe", "value_counts"])
    # -1 means "use the pandas default" for the three option widgets below.
    self._max_row_option = widget.IntText(description="Max rows", value=-1)
    self._max_column_option = widget.IntText(description="Max columns", value=-1)
    self._max_colwidth_option = widget.IntText(description="Max col width", value=-1)

  def df_info(self, df, option):
    """Show the selected view of the DataFrame named ``df``.

    Args:
      df: Name of a global variable holding the DataFrame.
      option: One of ``"general"``, ``"info"``, ``"describe"``,
        ``"value_counts"``; any other value shows nothing.
    """
    df = globals()[df]
    if option == "info":
      # DataFrame.info() prints its report and returns None, so it must not
      # be wrapped in display() (that would add a stray "None").
      df.info()

    elif option == "describe":
      display(df.describe())

    elif option == "value_counts":
      display(df.apply(lambda x: x.value_counts()).unstack().dropna().T)

    elif option == "general":
      display(df)

  @staticmethod
  def _apply_option(option_name, n):
    """Set a pandas option to ``n``, or reset it to its default when ``n == -1``."""
    if n == -1:
      pd.reset_option(option_name)

    else:
      pd.set_option(option_name, n)

  # The option names below are fully qualified: abbreviated patterns such as
  # "max_rows" can match several pandas options and are then rejected.
  def set_max_rows(self, n):
    """Set ``display.max_rows`` (``-1`` restores the default)."""
    self._apply_option("display.max_rows", n)

  def set_max_columns(self, n):
    """Set ``display.max_columns`` (``-1`` restores the default)."""
    self._apply_option("display.max_columns", n)

  def set_max_colwidth(self, n):
    """Set ``display.max_colwidth`` (``-1`` restores the default)."""
    self._apply_option("display.max_colwidth", n)

  def run(self):
    """Build the interactive widgets and display them in a two-tab layout."""
    general_utilities = widget.interactive(self.df_info, df=self._df_select, option=self._view_option)
    max_row = widget.interactive(self.set_max_rows, n=self._max_row_option)
    max_col = widget.interactive(self.set_max_columns, n=self._max_column_option)
    max_colwidth = widget.interactive(self.set_max_colwidth, n=self._max_colwidth_option)
    general_option = widget.VBox([max_col, max_row, max_colwidth])

    main_option = widget.Tab([general_utilities, general_option],
                               titles=("Utilities", "Option"))
    display(main_option)

In [None]:
def plotter(f):
  """Decorator that runs a plotting function inside a fresh, titled figure.

  The wrapped function gains two keyword-only arguments which are consumed
  by the wrapper and NOT forwarded to ``f``:

    figsize: Size of the new figure.
    title: Text placed as the figure's suptitle after ``f`` has drawn.

  The wrapper returns ``None``; whatever ``f`` returns is discarded.
  """
  import functools  # local import: only needed while decorating

  # functools.wraps keeps f's name and docstring on the wrapper, so help()
  # and tracebacks still refer to the real plotting function.
  @functools.wraps(f)
  def plotter_function(*args, figsize=(12, 12), title='Big title', **kwargs):
    plt.figure(figsize=figsize, tight_layout=True)
    f(*args, **kwargs)
    figure = plt.gcf()
    # y > 1 places the title slightly above the top of the figure area.
    figure.suptitle(title, fontsize=16, y=1.05)
  return plotter_function

In [None]:
@plotter
def plot_each_col(data, col_list, n_col, plot_type, x=None, **kwargs):
  """Draw one subplot per column in a grid that is ``n_col`` wide.

  Because of the ``@plotter`` decorator the call also accepts ``figsize``
  and ``title`` keyword arguments, which configure the enclosing figure.

  Args:
    data: DataFrame passed to seaborn as ``data``.
    col_list: Columns to plot, one subplot each.
    n_col: Number of subplot columns in the grid.
    plot_type: ``"hist"``, ``"bar"``, ``"count"``, ``"box"`` or ``"line"``.
      ``"hist"`` and ``"bar"`` additionally use the module-level ``LABEL``
      (as hue and as y respectively).
    x: For ``"line"`` only: the x variable; the DataFrame index is used
      when omitted.
    **kwargs: Forwarded to the seaborn plotting function.

  Raises:
    ValueError: If ``plot_type`` is not one of the supported values.
  """
  # Ceiling division: `len // n_col + 1` reserved an empty extra row
  # whenever the number of columns was an exact multiple of n_col.
  n_row = -(-len(col_list) // n_col)
  for i, col in enumerate(col_list):
    ax = plt.subplot(n_row, n_col, i + 1)
    if plot_type == "hist":
      sns.histplot(data=data, x=col, hue=LABEL, multiple="stack", ax=ax, **kwargs)

    elif plot_type == "bar":
      sns.barplot(data=data, x=col, y=LABEL, ax=ax, **kwargs)

    elif plot_type == "count":
      sns.countplot(data=data, x=col, ax=ax, **kwargs)

    elif plot_type == "box":
      sns.boxplot(data=data, x=col, ax=ax, **kwargs)

    elif plot_type == "line":
      # `is not None` rather than truthiness, so a falsy column label (e.g. 0)
      # or an array-like x is still honoured.
      x_values = x if x is not None else data.index
      sns.lineplot(data=data, x=x_values, y=col, ax=ax, **kwargs)

    else:
      raise ValueError(f"Invalid plot_type argument: {plot_type}")

    ax.set_title(f"Distribution of {col}")