In [0]:
!pip install --upgrade pip -q
!pip install progressbar -q
!pip install memory_profiler -q
!pip install --upgrade pandas>=1.2 -q

In [0]:
# Load the memory_profiler IPython extension; the `%%memit` cells below rely on it
%load_ext memory_profiler

In [0]:
import os
import random
import shlex
import subprocess
import tarfile
import urllib
import urllib.request
import warnings
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sns
from scipy.sparse import csr_matrix

from sklearn import linear_model
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import KBinsDiscretizer

## Download the Criteo Display Advertising Challenge dataset

In [0]:
# ProgressBar borrowed from https://stackoverflow.com/a/53643011/2015762
class ProgressBar():
    """Callable progress reporter, used as the third argument of `urlretrieve`.

    It is called with `(block_num, block_size, total_size)` and mirrors the
    transfer progress on a `progressbar.ProgressBar`.
    """

    def __init__(self):
        # The underlying bar is created on the first call, once the total size is known.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Update the bar with the amount transferred so far, finishing it at the end."""
        if not self.pbar:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        bytes_so_far = block_num * block_size
        if bytes_so_far >= total_size:
            # The last block may overshoot the total size: close the bar instead of updating it.
            self.pbar.finish()
            return
        self.pbar.update(bytes_so_far)


def download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path):
    """Download the compressed dataset to `compressed_dataset_path`, showing a progress bar.

    The archive is first written to `compressed_dataset_path + ".part"` and only moved to
    its final location once the transfer has completed. An interrupted download therefore
    never leaves a truncated file at `compressed_dataset_path`, which a later
    `os.path.exists(compressed_dataset_path)` check would mistake for a valid archive.

    Args:
        dataset_url: URL of the archive to download.
        dataset_folder_path: folder to create (if missing) before downloading.
        compressed_dataset_path: final path of the downloaded archive.

    Raises:
        Whatever `urllib.request.urlretrieve` raises; the partial file is removed first.
    """
    os.makedirs(dataset_folder_path, exist_ok=True)
    partial_download_path = compressed_dataset_path + ".part"
    try:
        urllib.request.urlretrieve(dataset_url, partial_download_path, ProgressBar())
    except BaseException:
        # Also covers a kernel interrupt (KeyboardInterrupt): clean up, then re-raise.
        if os.path.exists(partial_download_path):
            os.remove(partial_download_path)
        raise
    os.replace(partial_download_path, compressed_dataset_path)

def extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path):
    """Extract `readme.txt` and `train.txt` from the archive into `dataset_folder_path`.

    `train.txt` (the dataset with labels) is then renamed to `dataset_path`.
    """
    extracted_train_path = os.path.join(dataset_folder_path, 'train.txt')
    with tarfile.open(compressed_dataset_path, "r") as archive:
        for member_name in ('readme.txt', 'train.txt'):
            archive.extract(member_name, dataset_folder_path)
        os.rename(extracted_train_path, dataset_path)

In [0]:
# Location of the compressed dataset to download
dataset_url = "https://criteostorage.blob.core.windows.net/criteo-research-datasets/kaggle-display-advertising-challenge-dataset.tar.gz"
# Local folder holding every dataset file; the relative path is resolved against the current working directory
dataset_folder_path = os.path.abspath('sync/data/criteo_dataset')
# Where the downloaded archive is stored
compressed_dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.tar.gz")
# Where the extracted `train.txt` ends up after being renamed by `extract_dataset`
dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.txt")

In [0]:
# Download the archive only if it is not already on disk, so the cell can be re-run cheaply
if not os.path.exists(compressed_dataset_path):
    download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path)

# Same guard for the extraction step
if not os.path.exists(dataset_path):
    extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path)

If it takes too much time, download this smaller dataset instead

In [0]:
# toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")
# toy_dataset_url = 'https://www.dropbox.com/s/dle2t3szhljfevh/criteo_toy_dataset.txt?dl=1'
# urllib.request.urlretrieve(toy_dataset_url, toy_dataset_path, ProgressBar())

Quick look at the files we have downloaded.

Within an IPython notebook, we can execute a bash command by prefixing the line with `!`, and insert a Python variable into it with `{}`.

In [0]:
!ls -alh {dataset_folder_path}

In [0]:
!cat {dataset_folder_path}/readme.txt

In [0]:
# Column names, in file order: the label, then int_feat_1..int_feat_13, then cat_feat_1..cat_feat_26
label_columns = ['label']
integer_features = [f'int_feat_{i}' for i in range(1, 14)]
categorical_features = [f'cat_feat_{i}' for i in range(1, 27)]
columns = label_columns + integer_features + categorical_features

In [0]:
pd.read_csv(dataset_path, nrows=10, header=None, sep='\t', names=columns)

## Reading data with memory constraints

We first create a toy dataset with "only" 1 million rows (out of 45 million).

In [0]:
toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")

In [0]:
!head -n 1000000 {dataset_path} > {toy_dataset_path}

Let's say we want to perform a basic operation: estimate the proportion of positive samples within the data.

### Basic approach

In [0]:
def compute_positive_label_proportion(dataset_path, columns):
    """Return the mean of the `label` column of the tab-separated file at `dataset_path`.

    The file has no header row; `columns` gives the column names. Only the `label`
    column is loaded.
    """
    labels = pd.read_csv(
        dataset_path,
        sep="\t",
        header=None,
        names=columns,
        usecols=['label'],
    )['label']
    return labels.mean()

Let's measure its memory footprint with the `%%memit` magic function

In [0]:
%%memit
positive_label_proportion = compute_positive_label_proportion(toy_dataset_path, columns)
print('positive_label_proportion', positive_label_proportion)

What would happen if you ran the same function on a dataset 45 times bigger?

You can give it a try with `compute_positive_label_proportion(dataset_path, columns)`... at your own risk.

### Specifying column types
We can help pandas by specifying the column types to be used, so that it does not need to infer them. Do so with the `dtype` parameter of `pd.read_csv`: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.

In [0]:
# Explicit pandas dtype for every column, so that `read_csv` does not have to infer them.
col_types = OrderedDict()
# Python OrderedDict is a dict subclass that maintains the items insertion order.
for col_name in columns:
    # The branches are mutually exclusive; an unknown column is an error rather than
    # silently inheriting the dtype chosen for the previous column.
    if col_name in label_columns:
        col_type = 'bool'
    elif col_name in integer_features:
        # NOTE(review): float rather than int, presumably because these columns contain
        # missing values (they are filled with -1 later on) — confirm against the data.
        col_type = 'float32'
    elif col_name in categorical_features:
        col_type = 'str'
    else:
        raise ValueError(f"No dtype defined for column {col_name!r}")
    col_types[col_name] = col_type

def compute_positive_label_proportion_with_dtype(dataset_path, columns, col_types):
    """Return the mean of the `label` column, reading the file with explicit dtypes.

    All columns are loaded; `col_types` maps each column name to the dtype given to `read_csv`.
    """
    frame = pd.read_csv(dataset_path, dtype=col_types, names=columns, header=None, sep="\t")
    labels = frame['label']
    return labels.mean()

In [0]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype(toy_dataset_path, columns, col_types)
print('positive_label_proportion', positive_label_proportion)

### Reading data by chunks
We can control the amount of memory we need by loading only a small chunk of the data and processing it before moving to the next chunk.

See documentation at https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#iterating-through-files-chunk-by-chunk

```
reader = pd.read_csv(..., chunksize=10, nrows=100)
for chunk in reader:
    print(chunk)
```

In [0]:
# By specifying a chunksize to read_csv, the return value will be an iterable object of type TextFileReader:

def compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, chunksize):
    """Return the proportion of positive labels, reading the file `chunksize` rows at a time.

    Only one chunk is held in memory at any time: each chunk contributes its number of
    positive labels and its number of rows to two running totals.

    Args:
        dataset_path: path of the tab-separated file (no header row).
        columns: column names, in file order.
        col_types: mapping column name -> dtype passed to `read_csv`.
        chunksize: number of rows per chunk.

    Returns:
        Sum of the labels divided by the number of rows, or NaN if no row was read.
    """
    sum_labels = 0
    sum_rows = 0
    # The context manager (pandas >= 1.2) closes the underlying file even if a chunk fails to parse.
    with pd.read_csv(
        dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunksize,
    ) as reader:
        for chunk in reader:
            sum_labels += chunk['label'].sum()  # number of positive labels in this chunk
            sum_rows += len(chunk)
    if sum_rows == 0:
        # No rows: return NaN instead of raising ZeroDivisionError.
        return float('nan')
    return sum_labels / sum_rows

In [0]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(toy_dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

This can now be applied to the full dataset with no memory issue.

In [0]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

## Training and evaluation

### Split train and test datasets
Since the datasets contain one line per example, we can split them into train and test by simply iterating over the lines. For each line in the original dataset: write it to the test data set with a probability p and write it to the train dataset with a probability 1 - p.

In [0]:
def split_train_test(full_dataset_path, train_dataset_path, test_dataset_path, test_ratio, seed=302984, print_every=None):
    """Split a line-oriented dataset into a train file and a test file.

    Each input line is written to the test file with probability `test_ratio` and to the
    train file otherwise. The input is streamed line by line, so it never has to fit in memory.

    Args:
        full_dataset_path: path of the dataset to split (one example per line).
        train_dataset_path: output path for the train lines (overwritten).
        test_dataset_path: output path for the test lines (overwritten).
        test_ratio: probability of sending a line to the test file.
        seed: seed of the random generator, making the split reproducible.
        print_every: if not None, print a progress message every `print_every` lines.
    """
    # A dedicated generator yields the same sequence as seeding the `random` module,
    # without altering the global random state of the notebook.
    rng = random.Random(seed)
    # Initialised up front so the final message also works when the input file is empty.
    n_lines = 0
    with open(full_dataset_path, 'r') as input_file, \
            open(train_dataset_path, 'w') as train_file, \
            open(test_dataset_path, 'w') as test_file:
        for n_lines, line in enumerate(input_file, start=1):
            if rng.uniform(0, 1) <= test_ratio:
                test_file.write(line)
            else:
                train_file.write(line)

            if print_every is not None and n_lines % print_every == 0:
                print(f"Processed {n_lines} lines")
        print(f"Processed {n_lines} lines")
        
# Output files of the train/test split, stored next to the full dataset
train_dataset_path = os.path.join(dataset_folder_path, "criteo_train_dataset.txt")
test_dataset_path = os.path.join(dataset_folder_path, "criteo_test_dataset.txt")

In [0]:
# Split only if one of the two output files is missing: 10% of the lines go to the test set
if not os.path.exists(train_dataset_path) or not os.path.exists(test_dataset_path):
    split_train_test(dataset_path, train_dataset_path, test_dataset_path, test_ratio=0.1, print_every=10_000_000)

In [0]:
!wc -l {test_dataset_path}

### Shuffling
The convergence guarantees of SGD rely on the fact that the observations come at random. Hence, shuffling between epochs is important.

First result of "How to shuffle a file that is too big for memory" on Google: https://stackoverflow.com/a/40814865/2015762

Note that quicker pseudo-shuffling strategies exist, but this one fits our "Big data on your laptop" setting.

In [0]:
def shuffle_file(input_path, output_path):
    """Write the lines of `input_path` to `output_path` in random order.

    Shell pipeline from the Stack Overflow answer linked above: awk prefixes every line
    with a random zero-padded 6-digit key, `sort -n` orders the lines on that key and
    `cut -c8-` strips the key and its trailing space.

    The pipeline is launched from Python rather than with `!` so that the paths defined
    earlier in the notebook can be used instead of hard-coded absolute paths: the literal
    `{...}` blocks of the awk program would clash with IPython's `{variable}` interpolation.

    Raises:
        subprocess.CalledProcessError: if the pipeline exits with a non-zero status.
    """
    awk_program = r'BEGIN{srand();} {printf "%06d %s\n", rand()*1000000, $0;}'
    command = (
        f"awk {shlex.quote(awk_program)} {shlex.quote(input_path)}"
        f" | sort -n | cut -c8- > {shlex.quote(output_path)}"
    )
    subprocess.run(command, shell=True, check=True)


test_dataset_shuffled_path = os.path.join(dataset_folder_path, "criteo_test_dataset_shuffled.txt")
shuffle_file(test_dataset_path, test_dataset_shuffled_path)
# We can run it on the train dataset too but let's skip it since it is quite long
# shuffle_file(train_dataset_path, os.path.join(dataset_folder_path, "criteo_train_dataset_shuffled.txt"))

### Training
In order to train a logistic model on chunks of data, we will use scikit-learn `SGDClassifier` (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) and train for its `log` loss with its `partial_fit` method.
We can now apply the previous data processing pipeline and add the training to obtain a trained classifier.

In [0]:
# First version: no real preprocessing, only the "ready to use" continuous features are kept
def preprocess_data(chunk, integer_features, categorical_features):
    """Return the `integer_features` columns of `chunk` with missing values replaced by -1.

    `categorical_features` is accepted but not used in this version.
    """
    continuous = chunk.loc[:, integer_features]
    return continuous.fillna(-1)

In [0]:
max_training_steps = 10_000  # maximum number of chunks used for training
chunk_size = 1_000           # number of rows per chunk
print_every = 1000           # progress message period, in chunks (None to disable)

# NOTE(review): scikit-learn >= 1.1 names this loss "log_loss" and newer releases no longer
# accept "log" — confirm against the installed scikit-learn version.
classifier = SGDClassifier(loss="log")
# 1. Read train data by chunks (the context manager closes the file when the loop stops early)
with pd.read_csv(
    train_dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunk_size,
) as reader:
    for i, chunk in enumerate(reader):
        # 2. Stop once `max_training_steps` chunks have been used. The check runs before
        #    training so that exactly `max_training_steps` chunks are fitted.
        if i >= max_training_steps:
            break
        # 3. Apply preprocess_data to return the continuous features
        features = preprocess_data(chunk, integer_features, categorical_features)
        # 4. Train classifier on this chunk with partial_fit; `classes` lists every label
        #    since a single chunk is not guaranteed to contain all of them.
        classifier.partial_fit(features, chunk["label"], classes=[0, 1])

        if print_every is not None and (i + 1) % print_every == 0:
            print(i+1)

### Testing
Let's evaluate the performances of the trained classifier. We should iterate over the test dataset and evaluate the labels predicted by the classifier with `roc_auc_score` and `log_loss`.

In [0]:
max_testing_steps = 1_000  # maximum number of chunks used for evaluation
chunk_size = 1_000         # number of rows per chunk
print_every = 100          # progress message period, in chunks (None to disable)

roc_auc_scores = []
log_losses = []
# 1. Read test data by chunks (the context manager closes the file when the loop stops early)
with pd.read_csv(
    test_dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunk_size,
) as reader:
    for i, chunk in enumerate(reader):
        # 2. Stop once `max_testing_steps` chunks have been evaluated. The check runs before
        #    the evaluation so that exactly `max_testing_steps` chunks are scored.
        if i >= max_testing_steps:
            break
        # 3. Apply preprocess_data to return the continuous features
        features = preprocess_data(chunk, integer_features, categorical_features)
        # 4. Predict the probability of the positive class with the classifier
        label_predictions = classifier.predict_proba(features)[:, 1]
        # 5. Compute AUC score and Log loss for this chunk
        roc_auc_scores.append(roc_auc_score(chunk["label"], label_predictions))
        log_losses.append(log_loss(chunk["label"], label_predictions))

        if print_every is not None and (i + 1) % print_every == 0:
            print(i+1)

# 6. Return averaged values of the metrics (unweighted mean over the chunks)
print(f"AUC = {np.mean(roc_auc_scores)}")
print(f"LogLoss = {np.mean(log_losses)}")

## Data preprocessing

### Continuous features
A smart way to deal with continuous features (integer count features included) consists in transforming them into categorical features through a quantile transformation. To do so we will use scikit-learn `KBinsDiscretizer`: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html.

It can be used as follows:
```
df = pd.DataFrame({'col_1': np.random.normal(size=1000), 'col_2': np.random.poisson(lam=1, size=1000)})
bucketizer = KBinsDiscretizer(n_bins=20, encode='ordinal')
bucketizer.fit(df)
df_bucketized = pd.DataFrame(bucketizer.transform(df), columns=[f'{col}_bucketized' for col in df.columns], index=df.index)
sns.jointplot(data=pd.concat((df, df_bucketized), axis=1), x="col_1", y="col_1_bucketized")
```

1. Create a `KBinsDiscretizer` and train it on the first chunk of the dataset
1. Update `preprocess_data` to add a bucketize step to the training pipeline. What happens if you change the `encode` parameter?
1. Do not forget to deal with missing values, you do not want to carry on NaNs. You can for example replace them with -1.

In [0]:
# Fit the bucketizer on a single chunk: `chunk` is whatever chunk the previous loop left behind
bucketizer = KBinsDiscretizer(n_bins=20) # Try with and without encode='ordinal'
# record=True captures the warnings raised while fitting instead of displaying them
with warnings.catch_warnings(record=True):
    bucketizer.fit(chunk[integer_features].fillna(-1))
    
def bucketize(df, bucketizer):
    """Replace the missing values of `df` by -1 and return `bucketizer.transform` of the result."""
    without_missing = df.fillna(-1)
    return bucketizer.transform(without_missing)

def preprocess_data(chunk, integer_features, categorical_features):
    """Return the bucketized `integer_features` columns of `chunk`.

    Uses the module-level `bucketizer`; `categorical_features` is not used in this version.
    """
    continuous_columns = chunk[integer_features]
    return bucketize(continuous_columns, bucketizer)

preprocess_data(chunk, integer_features, categorical_features)

### Categorical features
For categorical features we will implement the hashing trick by ourselves. As a quick reminder, for each row

1. Select the categorical features 
1. Create for each feature the string concatenating the feature name and the feature value
1. Apply a hash function to each of these strings and use this value to choose the feature's column index
1. Store the transformed features in a sparse matrix

In [0]:
from sklearn.utils.murmurhash import murmurhash3_bytes_s32

def hash_string(string, seed=0):
    """Return the `murmurhash3_bytes_s32` hash of the encoded `string` for the given `seed`."""
    encoded = string.encode()
    return murmurhash3_bytes_s32(encoded, seed)

#Since Python 3.0, strings are stored as Unicode, i.e. each character in the string is represented by a code point. So, each string is just a sequence of Unicode code points.
  
hash_string('my_feature=my_feature_value')
# Note, if we were using builtin function hash('my_feature=my_feature_value'), we would have had a different hash value at each run

In [0]:
hash_space = 2 ** 16 # size of the hash space: hashed indices are taken modulo this value

row = chunk.iloc[0]# first row of the chunk
def get_features_hashes(row, hash_space):
    """Return one hashed index per entry of `row`.

    Every (label, value) pair of the row is rendered as the string "label=value", hashed
    with `hash_string`, and the absolute value of the hash is reduced modulo `hash_space`.
    """
    # Same scheme as scikit-learn:
    # https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/feature_extraction/_hashing_fast.pyx#L68
    hashes = []
    for label, value in zip(row.index, row.values):
        hashes.append(abs(hash_string(f"{label}={value}")) % hash_space)
    return hashes

np.array(get_features_hashes(row, hash_space))# OK

In [0]:
def transform_with_hashing_trick(df, hash_space):
    """Encode the rows of `df` as a sparse matrix with the hashing trick.

    Args:
        df: DataFrame whose columns are all hashed (one hashed index per cell).
        hash_space: number of columns of the output matrix.

    Returns:
        A `csr_matrix` of shape `(len(df), hash_space)` and dtype float. Each cell of `df`
        adds 1 at (row position, hashed index); entries landing on the same position are
        summed when the matrix is built.
    """
    # axis=1 applies the function to each row (axis=0 would apply it to each column)
    col_indices = df.apply(lambda row: get_features_hashes(row, hash_space), axis=1)
    n_hashes_per_row = np.asarray([len(cols) for cols in col_indices], dtype=int)

    # Flatten in a single pass: `sum(list_of_lists, [])` copies the accumulated list at every
    # step, which is quadratic in the number of rows.
    flat_col_indices = [col for cols in col_indices for col in cols]
    # Row position of every hashed value: 0, 0, ..., 1, 1, ...
    flat_row_indices = np.repeat(np.arange(len(col_indices)), n_hashes_per_row)
    data = np.ones(len(flat_col_indices))
    # Fill the csr_matrix, using csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)]) constructor
    # See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
    return csr_matrix((data, (flat_row_indices, flat_col_indices)), shape=(len(df), hash_space), dtype=float)
  
transform_with_hashing_trick(chunk[categorical_features], hash_space)

Actually, the hashing trick is well known and already implemented in scikit-learn FeatureHasher: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html

It can be used as follows:
```
df = pd.DataFrame({'col_1': np.random.choice(['a', 'b', 'c'], size=100), 'col_2': np.random.poisson(size=100)})
hasher = FeatureHasher(n_features=2**16, input_type="dict")
hasher.transform((row._asdict() for row in df.itertuples(index=False)))
```
Again, you will probably want to ensure you get rid of the NaNs. What value could you set for these?

In [0]:
hasher = FeatureHasher(n_features=hash_space, input_type="dict")

def feature_hashing(df, hasher):
    """Apply `hasher.transform` to the rows of `df`, each row passed as a {column: value} dict.

    Missing values are replaced by the string "nan" beforehand.
    """
    filled = df.fillna("nan")
    rows_as_dicts = (record._asdict() for record in filled.itertuples(index=False))
    return hasher.transform(rows_as_dicts)

We can compare their speed

In [0]:
%timeit transform_with_hashing_trick(chunk[categorical_features], hash_space)

In [0]:
%timeit feature_hashing(chunk[categorical_features], hasher)

Actually we have used exactly the same implementation as sklearn.
See https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/feature_extraction/_hashing_fast.pyx#L68

In [0]:
# Compare both implementations on a tiny example: 5 rows hashed into 20 columns.
# Note that this cell overwrites the module-level `hash_space` and `hasher`.
hash_space = 20
df = chunk[categorical_features].head(5)
print('Custom implementation')
print(transform_with_hashing_trick(df, hash_space).toarray())
# NOTE(review): alternate_sign=False presumably keeps all hashed values positive so that the
# output is comparable with the custom implementation — confirm in the FeatureHasher docs.
hasher = FeatureHasher(n_features=hash_space, input_type="dict", alternate_sign=False)
print('sklearn implementation')
print(feature_hashing(df, hasher).toarray())

Let's improve our previous pipeline and apply the hashing trick to the categorical features **and** to the bucketized continuous features (have a look at `pd.concat`).

In [0]:
# Objects shared by the final pipeline: a larger hash space, a hasher and an ordinal bucketizer
hash_space = 2 ** 20

hasher = FeatureHasher(n_features=hash_space, input_type="dict")
bucketizer = KBinsDiscretizer(n_bins=20, encode='ordinal')
# The bucketizer is fitted on the chunk left behind by the previous cells;
# record=True captures the warnings raised while fitting instead of displaying them
with warnings.catch_warnings(record=True):
    bucketizer.fit(chunk[integer_features].fillna(-1))

def bucketize(df, bucketizer):
    """Return the bucketized version of `df` as a DataFrame with the same index and columns.

    Missing values are replaced by -1 before calling `bucketizer.transform`.
    """
    filled = df.fillna(-1)
    bucket_ids = bucketizer.transform(filled)
    return pd.DataFrame(bucket_ids, index=df.index, columns=df.columns)

def preprocess_data(df, integer_features, categorical_features):
    """Hash the categorical features and the bucketized integer features of `df`.

    Uses the module-level `bucketizer` and `hasher`.

    Args:
        df: chunk of the dataset to transform.
        integer_features: names of the continuous columns, bucketized before hashing.
        categorical_features: names of the categorical columns.

    Returns:
        The output of `feature_hashing` on the combined columns.
    """
    # Work on the `df` argument: reading the global `chunk` here would make the function
    # return the features of the same chunk whatever input it is given.
    bucketized_integer_features_df = bucketize(df[integer_features], bucketizer)
    # FeatureHasher treats a numeric value as the feature's magnitude and only hashes
    # "name=value" for string values: bucket ids are therefore converted to strings so that
    # each bucket becomes its own category.
    bucketized_integer_features_df = bucketized_integer_features_df.astype(int).astype(str)
    categorical_features_df = pd.concat([df[categorical_features], bucketized_integer_features_df], axis=1)
    return feature_hashing(categorical_features_df, hasher)

preprocess_data(chunk, integer_features, categorical_features)

Going further with the hashing trick:
1. How could you implement cross features ?
1. If you are too afraid of collisions, try to hash each categorical feature to several locations.

Try with FeatureHasher and your custom implementation and see what is the most efficient.