In [2]:
import random
import torch
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm

In [3]:
def seed_everything(seed=13):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Parameters
    ----------
    seed : int, default 13
        Value passed to every generator.
    """
    random.seed(seed)
    # PYTHONHASHSEED is intentionally not set here: it is only read when the
    # interpreter starts, so assigning it from inside a running kernel would
    # not change str/bytes hashing of the current process.
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; the call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner (benchmark=True) may pick different convolution
    # algorithms from run to run, which defeats the determinism requested
    # above, so it must stay disabled.
    torch.backends.cudnn.benchmark = False

In [4]:
def make_variables(df, target_name, test=None):
    """Populate the module-level feature bookkeeping variables from ``df``.

    The function returns nothing; it (re)binds these globals:

    * ``target`` -- ``target_name`` as given.
    * ``numerical_feats`` -- column labels of ``df`` whose dtype is not
      object / category / string / boolean.
    * ``categorical_feats`` -- column labels of ``df`` whose dtype is
      object / category / string / boolean.
    * ``empties`` -- dict whose keys are the columns containing at least one
      missing value in ``df`` (and in ``test`` when given); every value is
      ``None``.
    * ``numerical_pairs`` -- all 2-combinations of ``numerical_feats``.
    * ``target_pairs`` -- ``(col, target)`` for every numerical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame the feature lists are derived from.
    target_name : hashable
        Label of the target column.
    test : pandas.DataFrame, optional
        Additional frame whose columns with missing values are merged into
        ``empties``.
    """
    global numerical_feats, categorical_feats, numerical_pairs, target_pairs
    global target, empties
    target = target_name

    non_numeric_dtypes = ['object', 'category', 'string', 'boolean']
    numerical_feats = df.select_dtypes(exclude=non_numeric_dtypes).columns
    categorical_feats = df.select_dtypes(include=non_numeric_dtypes).columns

    # Collect the columns with missing values in column order (train first,
    # then test). dict.fromkeys drops duplicates while keeping first-seen
    # order, so the key order of `empties` is reproducible between kernel
    # restarts instead of depending on set iteration order.
    columns_with_nans = list(df.columns[df.isna().any(axis=0)])
    if test is not None:
        columns_with_nans.extend(test.columns[test.isna().any(axis=0)])
    empties = dict.fromkeys(columns_with_nans)

    numerical_pairs = list(itertools.combinations(numerical_feats, 2))
    # NOTE(review): if the target column is present in `df` and numerical, it
    # is part of `numerical_feats`, so this list contains (target, target)
    # and `numerical_pairs` contains pairs with the target -- confirm that
    # this is intended.
    target_pairs = [(col, target) for col in numerical_feats]

In [5]:
def get_optimized_column(col):
    """Return ``col`` stored in the narrowest NumPy dtype that loses nothing.

    Non-numeric columns, columns with pandas extension dtypes, empty columns
    and columns that cannot be narrowed are returned unchanged. A column is
    never widened and its signedness is never changed.

    * Integer columns are narrowed to the smallest integer type of the same
      signedness whose range contains both the column minimum and maximum.
    * Floating-point columns are narrowed only if every value (NaN included)
      survives a round trip through the narrower type exactly.

    Parameters
    ----------
    col : pandas.Series
        Column to optimise.

    Returns
    -------
    pandas.Series
        ``col`` itself when no narrowing is possible, otherwise a converted
        copy.
    """
    try:
        if not np.issubdtype(col.dtypes, np.number):
            return col
    except TypeError:
        # np.issubdtype does not understand pandas extension dtypes.
        return col

    current = col.dtypes
    if col.empty:
        # min()/max() of an empty column are NaN; there is nothing to check.
        return col

    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }
    if current.kind not in candidates_by_kind:
        # e.g. complex numbers: numeric, but not handled here.
        return col

    # Only strictly smaller types are worth converting to.
    narrower = [dtype for dtype in candidates_by_kind[current.kind]
                if np.dtype(dtype).itemsize < current.itemsize]

    if current.kind == 'f':
        values = col.to_numpy()
        for dtype in narrower:
            # Overflow to +/-inf is expected for unsuitable candidates and is
            # caught by the equality check below, so silence the warning.
            with np.errstate(over='ignore', invalid='ignore'):
                round_trip = values.astype(dtype).astype(current)
            if np.array_equal(round_trip, values, equal_nan=True):
                return col.astype(dtype)
        return col

    # Both ends of the value range must fit, otherwise negative (or large)
    # values would silently wrap around on conversion.
    lowest, highest = int(col.min()), int(col.max())
    for dtype in narrower:
        bounds = np.iinfo(dtype)
        if bounds.min <= lowest and highest <= bounds.max:
            return col.astype(dtype)
    return col


def reduce_mem_usage(df, verbose=True):
    """Shrink the columns of ``df`` in place via ``get_optimized_column``.

    Every column is replaced by the result of ``get_optimized_column``.
    Nothing is returned; ``df`` itself is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reassigned.
    verbose : bool, default True
        When true, wrap the column loop in a ``tqdm`` progress bar and print
        the resulting memory footprint and the relative reduction.
    """
    bytes_per_mb = 1024**2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # The progress bar is the only difference between the two modes.
    column_names = tqdm(df.columns) if verbose else df.columns
    for name in column_names:
        df[name] = get_optimized_column(df[name])

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            mem_after, saved_pct))