**Sprint 2: Data Wrangling**

In [1]:
#Dependencies
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
print(os.listdir("../data"))

['site_dic.pkl', 'sample_submission.csv', 'test_sessions.csv', 'train_sessions.csv']


**Load the dataset**

In [2]:
PATH_TO_DATA = ('../data')
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'), index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id')

**Basic Preprocessing**

Convert timestamps into pd.datetime

In [3]:
#list columns for easy access
sites_cols = ['site%s' % i for i in range(1, 11)]
times_cols = ['time%s' % i for i in range(1,11)]

In [4]:
#convert timestamps to pd.datetime
train_df[times_cols] = train_df[times_cols].apply(pd.to_datetime)
test_df[times_cols] = test_df[times_cols].apply(pd.to_datetime)

In [5]:
train_df = train_df.sort_values(by = 'time1')
test_df = test_df.sort_values(by = 'time1')

In [6]:
test_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
65540,21,2014-05-01 17:14:03,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
64199,23,2014-05-02 07:52:08,66.0,2014-05-02 07:54:08,63.0,2014-05-02 07:54:08,2626.0,2014-05-02 07:55:09,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
2268,979,2014-05-02 07:57:51,73.0,2014-05-02 07:59:34,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT
29734,66,2014-05-02 08:05:16,69.0,2014-05-02 08:05:17,67.0,2014-05-02 08:05:17,70.0,2014-05-02 08:05:17,71.0,2014-05-02 08:05:17,68.0,2014-05-02 08:05:17,71.0,2014-05-02 08:05:18,70.0,2014-05-02 08:05:18,69.0,2014-05-02 08:05:18,67.0,2014-05-02 08:05:18
77048,167,2014-05-02 08:05:32,167.0,2014-05-02 08:05:33,359.0,2014-05-02 08:05:34,167.0,2014-05-02 08:05:34,167.0,2014-05-02 08:05:35,305.0,2014-05-02 08:09:19,306.0,2014-05-02 08:09:20,306.0,2014-05-02 08:09:22,979.0,2014-05-02 08:09:54,68.0,2014-05-02 08:12:46


**Feature Engineering**

In [7]:
class DataPreparator(BaseEstimator, TransformerMixin):
    """
    Transformer that fills NaN values in the site columns with zeros and converts them to integers.
    This prepares the site columns for further processing or vectorization.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        return X[sites].fillna(0).astype('int')

We combine the site id to a single string for the CountVectorizer.

In [8]:
class ListPreparator(BaseEstimator, TransformerMixin):
    """
    Transformer that converts each row of site IDs into a whitespace-separated string.
    This format is suitable for use with CountVectorizer.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.values.tolist()
        return [" ".join([str(site) for site in row]) for row in X]

Some features explored in the EDA showed significant differences between Alice and Intruder. Let's put them into the dataset.

In [9]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """
    Transformer that adds time-based and categorical features:
    - Morning, day, evening indicators based on hour
    - Summer indicator based on month
    - Weekday and year as numeric features
    """
    def fit(self, X, y=None):
        return self 
    def transform(self, X, y=None):
        hour = X['time1'].apply(lambda ts: ts.hour)
        morning = ((hour >= 7) & (hour <= 11)).astype('int')
        day = ((hour >= 12) & (hour <= 18)).astype('int')
        evening = ((hour >= 19) & (hour <= 23)).astype('int')
        month = X['time1'].apply(lambda ts: ts.month)
        summer = ((month >= 6) & (month <= 8)).astype('int')
        weekday = X['time1'].apply(lambda ts: ts.weekday()).astype('int')
        year = X['time1'].apply(lambda ts: ts.year).astype('int')
        X = np.c_[morning.values, day.values, evening.values, summer.values, weekday.values, year.values]
        return X

In [10]:
class ScaledAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Transformer that adds session duration as a feature, normalized using a power transform.
    The duration is calculated as the difference between the max and min timestamps in a session.
    """
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        times = ['time%s' % i for i in range(1, 11)]
        session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype(int) ** 0.2
        X = np.c_[session_duration.values]
        return X

We make 2 pipelines:
* vectorizer_pipeline: prepares the dataset for tokenizer by imputing NaNs in siteID columns, and combining them into a list of strings. CountVectorizer() converts a collection of text into a matrix of token counts. ngram_range of (1,3) means that the vectorizer will extract unigrams (single site IDs), bigrams (pairs of site IDs) and trigrams (triplets of site IDs). Only the 10000 most frequent n-grams are kept as features. Returns a a 2D-array of these features.\
For example, given array = ['00' , '01', '00', '02'], the CountVectorizer will return a matrix that counts up the frequency of each element (token): \
[['00', '01', '02'], \
[2, 1, 1]] 
* feature_pipeline: Returns a 2D-array of engineered features from the given dataset.
* scaled_pipeline: Returns a 2D-array of engineered features (with scaling) from the given dataset.

In [11]:
#Initialize the pipelines
vectorizer_pipeline = Pipeline([
    ('data_prep', DataPreparator()),
    ('tokenizer_prep', ListPreparator()),
    ('vectorizer', CountVectorizer(ngram_range=(1,3), max_features=60000))
])
feature_pipeline = Pipeline([('feature_engineering', AttributesAdder())])

scaled_pipeline = Pipeline([
    ('scaled_feature_adder', ScaledAttributesAdder()),
    ('scaler', StandardScaler())
])

FeatureUnion performs the transformation processes in parallel, concatenating them together at the end.

In [12]:
full_pipeline = FeatureUnion(transformer_list=[
    ('vectorizer_pipeline', vectorizer_pipeline),
    ('feature_pipeline', feature_pipeline),
    ('scaled_pipeline', scaled_pipeline)
])

no_vectorizer_pipeline = FeatureUnion(transformer_list=[
    ('feature_pipeline', feature_pipeline),
    ('scaled_pipeline', scaled_pipeline)
])

Apply the all preprocessing processes to the main dataset.

In [13]:
X_train = full_pipeline.fit_transform(train_df)
X_test = full_pipeline.transform(test_df)

y_train = train_df["target"].astype('int').values

We will make a pipeline without the vectorizer to analyse the other transformation processes.

In [14]:
# Use the same features as in the pipeline (AttributesAdder + ScaledAttributesAdder)
# These are: morning, day, evening, summer, weekday, year, session_duration

feature_columns = [
    "morning", "day", "evening", "summer", "weekday", "year"
]
scaled_columns = [
    "session_duration"
]

X_train_no_vectorizer = no_vectorizer_pipeline.fit_transform(train_df)
X_test_no_vectorizer = no_vectorizer_pipeline.transform(test_df)

X_train_no_tokenizer_df = pd.DataFrame(
    X_train_no_vectorizer, 
    columns=feature_columns + scaled_columns
)

In [15]:
#Check if the features were added correctly
X_train_no_tokenizer_df.describe()

Unnamed: 0,morning,day,evening,summer,weekday,year,session_duration
count,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0,253561.0
mean,0.480851,0.490553,0.028597,0.00995,2.289741,2013.705495,1.145562e-16
std,0.499634,0.499912,0.16667,0.099254,1.610467,0.455821,1.000002
min,0.0,0.0,0.0,0.0,0.0,2013.0,-2.379876
25%,0.0,0.0,0.0,0.0,1.0,2013.0,-0.6786939
50%,0.0,0.0,0.0,0.0,2.0,2014.0,-0.1514208
75%,1.0,1.0,0.0,0.0,4.0,2014.0,0.587324
max,1.0,1.0,1.0,1.0,6.0,2014.0,2.781739


The ranges of features are within the expected values. Tranformation processes were applied correctly.

In [16]:
# The transformed dataset is very large. We sample a few rows to take a look.
sample = X_train[:5].toarray()
print("NaNs in sample:", np.isnan(sample).sum())
print("Infs in sample:", np.isinf(sample).sum())
print("Sample rows:\n", sample)

# Check sparsity
print("Nonzero elements:", X_train.nnz)
print("Sparsity: {:.2f}%".format(100 * X_train.nnz / (X_train.shape[0] * X_train.shape[1])))

# Check min/max (Make sure everything is scaled)
print("Sample min:", sample.min())
print("Sample max:", sample.max())

NaNs in sample: 0
Infs in sample: 0
Sample rows:
 [[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00000000e+00
   2.01300000e+03 -2.37987552e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00000000e+00
   2.01300000e+03  2.77368466e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00000000e+00
   2.01300000e+03 -8.58827028e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00000000e+00
   2.01300000e+03 -9.43872634e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  5.00000000e+00
   2.01300000e+03 -1.05572590e+00]]
Nonzero elements: 4423441
Sparsity: 0.03%
Sample min: -2.379875516659317
Sample max: 2013.0


The dataset looks good, without NaNs, infs or large outliers.

We extract the transformed dataset into new files. The transformed dataset is very sparse, and therefore must be stored in .npz files, which are more efficient.

In [20]:
from scipy import sparse
sparse.save_npz('../data/X_train_sparse.npz', X_train)
sparse.save_npz('../data/X_test_sparse.npz', X_test)
np.save('../data/y_train.npy', y_train)