# Load Data

In [None]:
import os
import pandas as pd

# NOTE: PROJECT_ID is assigned in the "Util" cell of this notebook, so that
# cell has to run before this one.
DATA_PATH = os.path.join("datasets", PROJECT_ID)


def load_data(data_path=DATA_PATH):
    """Read ``train.csv`` from *data_path* into a pandas DataFrame.

    Args:
        data_path: Directory that contains ``train.csv``. Defaults to
            ``DATA_PATH``.

    Returns:
        The DataFrame returned by ``pandas.read_csv``.
    """
    # Build the path from the function's own argument; the earlier code
    # referenced an undefined name (``titanic_data_path``) and therefore
    # raised NameError on every call.
    csv_path = os.path.join(data_path, "train.csv")
    return pd.read_csv(csv_path)
os.path

# Utilities

In [None]:
import numpy as np
import os

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set matplotlib's default ``labelsize``: 14 for the 'axes' group and 12 for
# the 'xtick' and 'ytick' groups.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save associated data/images
# PROJECT_ROOT_DIR is the base directory under which the "images" folder lives.
PROJECT_ROOT_DIR = "."
# PROJECT_ID is the per-project sub-folder name; it is also read by DATA_PATH
# in the "Load Data" cell.
# NOTE(review): "__" looks like a placeholder to be replaced per project - confirm.
PROJECT_ID = "__"
# Directory that save_fig() writes figure files into.
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    The file is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``. The
    target directory is created first if it does not exist yet.

    Args:
        fig_id: File name (without extension) for the figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the
            output format.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    # savefig does not create missing directories, so make sure the target
    # directory exists instead of failing on a fresh checkout.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
def display_scores(scores):
    """Print *scores* followed by its mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as a
            NumPy array.
    """
    # Each entry pairs the printed label with the way its value is obtained;
    # values are computed lazily so the lines come out one after the other.
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Dataframe Transformations

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

### Single Column Function Transformation

In [None]:
class DataframeColumnFunctionTransformer:
    """Transformer-style wrapper around a ``func(input_df, column)`` callable.

    Args:
        func: Callable invoked by ``transform`` as ``func(input_df, column)``.
        column: Value handed to ``func`` as its second argument.
    """

    def __init__(self, func, column=None):
        self.column = column
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``func(input_df, column)``; extra keyword arguments are ignored."""
        apply_func, target_column = self.func, self.column
        return apply_func(input_df, target_column)
    
def binarize(input_df, column):
    """Replace *column* with booleans marking its non-missing entries.

    The frame is modified in place and the same object is returned.
    """
    is_present = input_df[column].notna()
    input_df[column] = is_present
    return input_df

def bool_to_int(input_df, column):
    """Convert every value of *column* with ``int``.

    The frame is modified in place and the same object is returned.
    """
    as_integers = input_df[column].map(int)
    input_df[column] = as_integers
    return input_df

### Multiple Column Function Transformation

In [None]:
class DataframeMultipleColumnFunctionTransformer():
    """Transformer-style wrapper around a ``func(input_df, columns)`` callable.

    Args:
        func: Callable invoked by ``transform`` as ``func(input_df, columns)``.
        columns: Column collection handed to ``func``. When omitted, each
            instance gets its own fresh empty list.
    """

    def __init__(self, func, columns=None):
        self.func = func
        # A literal ``[]`` default would be one list shared by every instance
        # created without ``columns``; build a new one per instance instead.
        self.columns = [] if columns is None else columns

    def transform(self, input_df, **transform_params):
        """Return ``func(input_df, columns)``; extra keyword arguments are ignored."""
        return self.func(input_df, self.columns)

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; all arguments are ignored."""
        return self

def features_drop(df, columns):
    """Return a copy of *df* without the given column label(s)."""
    remaining = df.drop(columns=columns)
    return remaining

# Text Data Transformations

### To Word Vector

In [None]:
import re
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn an iterable of text documents into per-document word counts.

    Every document passes through the enabled normalisation steps, is split
    on whitespace and counted with ``collections.Counter``.

    URL replacement and stemming rely on two module-level helpers that this
    class does not create itself: ``url_extractor`` (used through
    ``find_urls(text)``) and ``stemmer`` (used through ``stem(word)``). If a
    helper is undefined or ``None``, the corresponding step is skipped.

    Args:
        strip_headers: Stored on the instance; not used by ``transform``.
        lower_case: Lower-case each document first.
        remove_punctuation: Replace runs of non-word characters with a space.
        replace_urls: Replace every URL found by ``url_extractor`` with
            ``" URL "``.
        replace_numbers: Replace numbers with the token ``NUMBER``.
        stemming: Merge counts of words that share the same stem according
            to ``stemmer``.
    """

    # Digits, an optional fractional part and an optional exponent. Both
    # optional parts are independent, so "3.14" is ONE number (the previous
    # pattern only accepted a fraction when an exponent followed it and
    # turned "3.14" into "NUMBER.NUMBER").
    _NUMBER_RE = re.compile(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?')
    _NON_WORD_RE = re.compile(r'\W+')

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a NumPy array holding one ``Counter`` per document in *X*."""
        # Resolve the optional helpers once per call. Going through
        # ``globals()`` means a helper that was never defined behaves like
        # one set to ``None`` instead of raising NameError.
        url_finder = globals().get("url_extractor") if self.replace_urls else None
        word_stemmer = globals().get("stemmer") if self.stemming else None

        X_transformed = []
        for text in X:
            if self.lower_case:
                text = text.lower()
            if url_finder is not None:
                # Longest first, so a URL that is a substring of a longer one
                # cannot break the longer one apart.
                unique_urls = sorted(set(url_finder.find_urls(text)), key=len, reverse=True)
                for url in unique_urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = self._NUMBER_RE.sub('NUMBER', text)
            if self.remove_punctuation:
                text = self._NON_WORD_RE.sub(' ', text)
            word_counts = Counter(text.split())
            if word_stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word_counts[word_stemmer.stem(word)] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [None]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert per-document word-count mappings into a sparse count matrix.

    ``fit`` keeps the ``vocabulary_size`` words with the largest total count
    (each document contributes at most 10 per word) and numbers them from 1.
    ``transform`` writes each word's count into its column; words outside the
    vocabulary all go to column 0.

    Args:
        vocabulary_size: Number of words to keep in the vocabulary.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn ``most_common_`` and ``vocabulary_`` from *X*; return ``self``."""
        capped_totals = Counter()
        for document_counts in X:
            # Cap each document's contribution so one document cannot dominate.
            capped_totals.update(
                {word: min(count, 10) for word, count in document_counts.items()}
            )
        ranked = capped_totals.most_common()
        self.most_common_ = ranked[:self.vocabulary_size]
        self.vocabulary_ = {
            word: position
            for position, (word, _) in enumerate(self.most_common_, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        row_indices, col_indices, values = [], [], []
        for row_index, document_counts in enumerate(X):
            row_indices.extend([row_index] * len(document_counts))
            # Unknown words fall back to column 0.
            col_indices.extend(self.vocabulary_.get(word, 0) for word in document_counts)
            values.extend(document_counts.values())
        n_columns = self.vocabulary_size + 1
        return csr_matrix((values, (row_indices, col_indices)), shape=(len(X), n_columns))

# Numerical Utilities

In [None]:
import numpy as np

def argmedian(x):
    """Return the index of the element at sorted position ``len(x) // 2``.

    For an odd-length input this is the median's index; for an even-length
    input it is the index of the upper of the two middle elements.
    """
    middle = len(x) // 2
    partition_order = np.argpartition(x, middle)
    return partition_order[middle]

In [None]:
import re

