**Sprint 2: Data Wrangling**

In [None]:
#Dependencies
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
print(os.listdir("../data"))

['site_feature_names.npy', 'X_train_baseline.npz', 'baseline_logreg_submission.csv', 'y_train.npy', 'best_params_logreg_submission.csv', 'X_test_engineered.npz', 'site_dic.pkl', 'test_sessions.csv', 'X_train_engineered.npz', 'sample_submission.csv', 'best_params_xgb_submission.csv', 'X_test_baseline.npz', 'feature_selection_logreg_submission.csv', 'train_sessions.csv', 'best_params_oversampled_logreg_submission.csv']


**Load the dataset**

In [2]:
PATH_TO_DATA = ('../data')
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id')

**Basic Preprocessing**

Convert timestamps into pd.datetime

In [3]:
#list columns for easy access
sites_cols = ['site%s' % i for i in range(1, 11)]
times_cols = ['time%s' % i for i in range(1,11)]

In [4]:
#convert timestamps to pd.datetime
train_df[times_cols] = train_df[times_cols].apply(pd.to_datetime)
test_df[times_cols] = test_df[times_cols].apply(pd.to_datetime)

In [5]:
train_df = train_df.sort_values(by = 'time1')
test_df = test_df.sort_values(by = 'time1')

In [6]:
test_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
65540,21,2014-05-01 17:14:03,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
64199,23,2014-05-02 07:52:08,66.0,2014-05-02 07:54:08,63.0,2014-05-02 07:54:08,2626.0,2014-05-02 07:55:09,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
2268,979,2014-05-02 07:57:51,73.0,2014-05-02 07:59:34,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
29734,66,2014-05-02 08:05:16,69.0,2014-05-02 08:05:17,67.0,2014-05-02 08:05:17,70.0,2014-05-02 08:05:17,71.0,2014-05-02 08:05:17,68.0,2014-05-02 08:05:17,71.0,2014-05-02 08:05:18,70.0,2014-05-02 08:05:18,69.0,2014-05-02 08:05:18,67.0,2014-05-02 08:05:18
77048,167,2014-05-02 08:05:32,167.0,2014-05-02 08:05:33,359.0,2014-05-02 08:05:34,167.0,2014-05-02 08:05:34,167.0,2014-05-02 08:05:35,305.0,2014-05-02 08:09:19,306.0,2014-05-02 08:09:20,306.0,2014-05-02 08:09:22,979.0,2014-05-02 08:09:54,68.0,2014-05-02 08:12:46


**Feature Engineering**

Some features explored in the EDA showed significant differences between Alice and Intruder. Let's put them into the dataset.

In [7]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """
    Transformer that adds categorical and time-based features to each session.

    Features added:
    - period1: 1 if the session starts at 12-13h or 18-19h, else 0
    - period2: 1 if the session starts at 16-18h, else 0
    - period3: 1 if the session starts at 0-12h, 14-15h, or 19-24h, else 0
    - peak_alice_months: 1 if the session starts in November, February, or March, else 0
    - mon_tue: 1 if the session starts on Monday or Tuesday, else 0
    - wed_sat_sun: 1 if the session starts on Wednesday, Saturday, or Sunday, else 0
    - year: Numeric year value of the session start

    Returns:
        np.ndarray: Array with the new features as columns.
    """
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        time1 = X['time1']
        hour = time1.dt.hour
        # Divide the day into 3 periods. Period2 is peak Alice's active times, while in period1 and period3 intruders are more active. 
        period1 = (((hour >= 12) & (hour <= 13)) | ((hour >= 18) & (hour <= 19))).astype(int)
        period2 = ((hour >= 16) & (hour <= 18)).astype(int)
        period3 = (((hour >= 0) & (hour <= 12)) | ((hour >= 14) & (hour <= 15)) | ((hour >= 19) & (hour <= 24))).astype(int)
        month = time1.dt.month
        # Months in which Alice is most active
        peak_alice_months = ((month == 11) | (month == 2) | (month == 3)).astype(int)
        weekday = time1.dt.weekday
        # Mon and Tue are when Alice is the most active. Wed, Sat and Sun are days when intruders are more active.
        mon_tue = ((weekday == 0) | (weekday == 1)).astype(int)
        wed_sat_sun = ((weekday == 2) | (weekday == 5) | (weekday == 6)).astype(int)
        X_new = np.c_[
            period1.values, period2.values, period3.values,
            peak_alice_months.values, mon_tue.values, wed_sat_sun.values
        ]
        return X_new

In [8]:
class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Transformer that adds two features to each session:

    - session_duration: The duration of the session in seconds, transformed by raising to the power of 0.2.
      Calculated as the difference between the latest and earliest timestamps among all session times.
    - start_month: The session's start month, encoded as (year * 100 + month) and scaled.
    - start_week: The session's start week, encoded as (year * 100 + weekofyear) and scaled.

    Returns:
        np.ndarray: Array with session_duration and start_month as columns.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        times = ['time%s' % i for i in range(1, 11)]
        # Start month and Start week are in the form YYYYMM and YYYYWW. Therefore we need to scale it by dividing by 100000. 
        start_month = X['time1'].apply(lambda t: 100 * t.year + t.month).to_numpy() / 1e5
        start_week = X['time1'].apply(lambda t: 100 * t.year + t.isocalendar().week).to_numpy() / 1e5
        # Duration in seconds, to the power of 0.2 for normalization
        session_duration = ((X[times].max(axis=1) - X[times].min(axis=1)).dt.total_seconds() ** 0.2).to_numpy().ravel()
        X = np.c_[session_duration, start_month, start_week]
        return X

TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). 
Parameters:
- ngram-range: the length of ngrams to use. Here we want sequences of 1 to 5 urls.
- max_features: the maximum amount of features to include. Higher might overfit.
- tokenizer: what pattern to split by. Here we want to include the "." in urls, so the split must be defined as a whitespace.

We use this to correlate the frequency of visited sites to the user.

In [9]:
# From Yury Kashnitsky's notebook - Model validation in a competition
PATH_TO_DATA = '../data'
path_to_train = os.path.join(PATH_TO_DATA, 'train_sessions.csv')
path_to_test = os.path.join(PATH_TO_DATA, 'test_sessions.csv')
path_to_site_dict = os.path.join(PATH_TO_DATA, 'site_dic.pkl')
vectorizer_params = dict(ngram_range=(1, 5), max_features=50000, tokenizer = lambda s: s.split())

def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train,
                       index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                      index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

In [10]:
X_train, X_test, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train, path_to_test, path_to_site_dict, vectorizer_params
)

In [11]:
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)


In [12]:
vectorizer.get_feature_names_out()[:10]

array(['0.academia-assets.com', '0.docs.google.com',
       '0.docs.google.com 0.docs.google.com',
       '0.docs.google.com 0.docs.google.com 0.docs.google.com',
       '0.docs.google.com 0.docs.google.com 0.docs.google.com 0.docs.google.com',
       '0.docs.google.com 0.docs.google.com 0.drive.google.com',
       '0.docs.google.com 0.docs.google.com 0.talkgadget.google.com',
       '0.docs.google.com 0.docs.google.com apis.google.com',
       '0.docs.google.com 0.docs.google.com docs.google.com',
       '0.docs.google.com 0.docs.google.com docs.google.com 0.talkgadget.google.com'],
      dtype=object)

We make 2 feature engineer pipelines:
* feature_pipeline: Returns a 2D-array of engineered features from the given dataset.
* scaled_pipeline: Returns a 2D-array of engineered features (with scaling) from the given dataset.

In [13]:
#Initialize the pipelines
feature_pipeline = Pipeline([('feature_engineering', AttributesAdder())])

scaled_pipeline = Pipeline([
    ('scaled_feature_adder', ScaledAttributesAdder()),
    ('scaler', StandardScaler())
])

In [14]:
feature_pipeline

FeatureUnion performs the transformation processes in parallel, concatenating them together at the end.

In [15]:
no_vectorizer_pipeline = FeatureUnion(transformer_list=[
    ('feature_pipeline', feature_pipeline),
    ('scaled_pipeline', scaled_pipeline)
])

Apply the all preprocessing processes to the main dataset.

In [16]:
engineered_train = no_vectorizer_pipeline.fit_transform(train_df)
engineered_test = no_vectorizer_pipeline.transform(test_df)
X_train_full = hstack([X_train, engineered_train])
X_test_full = hstack([X_test, engineered_test])
y_train = train_df["target"].astype('int').values

We will make a pipeline without the vectorizer to analyse the other transformation processes.

In [17]:
feature_columns = [
    "period1", "period2", "period3", "peak_alice_months", "mon_tue", "wed_sat_sun"
]
scaled_columns = [
    "session_duration", "start_month", "start_week"
]

X_train_no_vectorizer = no_vectorizer_pipeline.fit_transform(train_df)
X_test_no_vectorizer = no_vectorizer_pipeline.transform(test_df)

X_train_no_tokenizer_df = pd.DataFrame(
    X_train_no_vectorizer, 
    columns=feature_columns + scaled_columns
)

In [18]:
#Check if the features were added correctly
X_train_no_tokenizer_df.describe()

Unnamed: 0,period1,period2,period3,peak_alice_months,mon_tue,wed_sat_sun,session_duration,start_month,start_week
count,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0
mean,0.179089,0.139876,0.771183,0.605661,0.351679,0.311964,1.051828e-13,-9.700872e-09,-7.050718e-09
std,0.383428,0.346859,0.420072,0.488709,0.477495,0.463296,1.000002,1.000002,1.000002
min,0.0,0.0,0.0,0.0,0.0,0.0,-2.379876,-1.744405,-2.659214
25%,0.0,0.0,1.0,0.0,0.0,0.0,-0.6786939,-1.485314,-1.180984
50%,0.0,0.0,1.0,1.0,0.0,0.0,-0.1514208,0.6345182,0.568757
75%,0.0,0.0,1.0,1.0,1.0,1.0,0.587324,0.6580719,0.6894289
max,1.0,1.0,1.0,1.0,1.0,1.0,2.781739,0.6816256,0.8704366


The ranges of features are within the expected values. Tranformation processes were applied correctly.

In [19]:
# The transformed dataset is very large. We sample a few rows to take a look.
sample = X_train[:1000].toarray()
print("NaNs in sample:", np.isnan(sample).sum())
print("Infs in sample:", np.isinf(sample).sum())
print("Sample rows:\n", sample)

# Check sparsity
print("Nonzero elements:", X_train.nnz)
print("Sparsity: {:.2f}%".format(100 * X_train.nnz / (X_train.shape[0] * X_train.shape[1])))

# Check min/max (Make sure everything is scaled)
print("Sample min:", sample.min())
print("Sample max:", sample.max())

NaNs in sample: 0
Infs in sample: 0
Sample rows:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Nonzero elements: 4112650
Sparsity: 0.03%
Sample min: 0.0
Sample max: 0.9765530230425825


The dataset looks good, without NaNs, infs or large outliers.

We extract the transformed dataset into new files. The transformed dataset is very sparse, and therefore must be stored in .npz files, which are more efficient.

In [20]:
from scipy import sparse
sparse.save_npz('../data/X_train_baseline.npz', X_train)
sparse.save_npz('../data/X_test_baseline.npz', X_test)
sparse.save_npz('../data/X_train_engineered.npz', X_train_full)
sparse.save_npz('../data/X_test_engineered.npz', X_test_full)
np.save('../data/y_train.npy', y_train)
np.save("../data/site_feature_names.npy", vectorizer.get_feature_names_out())