In [12]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import collections

In [7]:
# Column of the sessions CSV that becomes the DataFrame index; all features
# are aggregated per distinct value of this index.
INDEX_COLUMN = 'user_id'
# Numerical column summarised per index value by extract_distribution_stats.
SECS_ELAPSED_NUMERICAL = 'secs_elapsed'
# Categorical columns whose rare values are collapsed to 'other' and whose
# remaining values are turned into per-index frequency-count features.
CATEGORICAL_FEATURES = ['action', 'action_type', 'action_detail', 'device_type']
# Path of the input CSV. NOTE: the constant name is spelled with three S's
# and is referenced with that spelling where the file is loaded.
SESSSIONS_CSV_FILE = 'input/sessions.csv'
# Path the extracted feature table is written to.
OUTPUT_TO_CSV_FILE = 'input/session_features.csv'

In [3]:
# Fraction of the total row count used by remove_rare_values_inplace: a
# categorical value seen fewer than floor(VALUE_THRESHOLD * number_of_rows)
# times is replaced by the literal 'other'.
VALUE_THRESHOLD = 0.005

In [4]:
def extract_frequency_counts(pd_frame, column_list):
    """ Extract per-index frequency counts for categorical columns.
    For every column C in column_list and every distinct value Ci of that
    column, count how many rows of each index value (each index value
    corresponds to a user) satisfy C == Ci.
    Args:
        pd_frame -- A pandas data frame; rows sharing an index value are
            counted together.
        column_list -- A list of categorical columns of pd_frame.
    Returns:
        A pandas DataFrame with one row per distinct index value and one
        integer column named '<column>=<value>' per (column, value) pair.
        Columns follow the order of column_list and, within a column, the
        order in which values first appear. An empty column_list yields a
        frame with no columns.
    """
    user_keys = pd_frame.index
    df_extracted_sessions = []
    for col in column_list:
        column = pd_frame[col]
        # unique() keeps first-appearance order, so the output column order
        # is reproducible between runs (iterating a set is not).
        for val in column.unique():
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print('Extracting frequency counts for (%s == %s)' % (col, val))
            # Summing a 0/1 mask per group is equivalent to applying
            # np.sum(group[col] == val) to every group, without running a
            # Python-level lambda once per group.
            tmp_df = (column == val).astype(int).groupby(user_keys).sum()
            tmp_df.name = '%s=%s' % (col, val)
            df_extracted_sessions.append(tmp_df)
    if not df_extracted_sessions:
        # pd.concat raises on an empty list; return an empty feature table
        # carrying the same row labels the grouped counts would have.
        group_index = pd_frame.groupby(user_keys).size().index
        return pd.DataFrame(index=group_index)
    frequency_counts = pd.concat(df_extracted_sessions, axis=1)
    return frequency_counts


def extract_distribution_stats(pd_frame, numerical_col):
    """ Extract simple distribution statistics from a numerical column.
    The column is grouped by the frame's index and summarised per group.
    Args:
        pd_frame -- A pandas data frame.
        numerical_col -- A column in pd_frame that contains numerical values.
    Returns:
        A pandas DataFrame with one row per distinct index value and the
        columns '<numerical_col>_mean', '<numerical_col>_std',
        '<numerical_col>_median' and '<numerical_col>_skew'.
    """
    grouped = pd_frame[numerical_col].groupby(pd_frame.index)
    # String aliases select pandas' own group reductions explicitly. Passing
    # np.mean / np.std / np.median relied on pandas substituting them
    # implicitly, which newer pandas deprecates. The resulting column labels
    # ('mean', 'std', 'median') are the same. scipy's skew has no alias and
    # is passed as a callable, labelled by its function name 'skew'.
    tmp_df = grouped.aggregate(['mean', 'std', 'median', stats.skew])
    tmp_df.columns = ['%s_%s' % (numerical_col, i) for i in tmp_df.columns]
    return tmp_df

In [10]:
def remove_rare_values_inplace(df_frame, column_list, threshold):
    """ Collapse rare categorical values into 'other' to speed up computation.
    The frame is modified in place; nothing is returned.
    Args:
        df_frame -- A pandas data frame.
        column_list -- A list of columns.
        threshold -- Fraction of the total row count. A value occurring
            fewer than floor(threshold * len(df_frame)) times in a column
            is replaced by the string 'other'.
    """
    insignificant_population = int(np.floor(threshold * len(df_frame)))
    for cat in column_list:
        freqs = df_frame[cat].value_counts(dropna=False)
        other = freqs[freqs < insignificant_population].index
        rare_rows = df_frame[cat].isin(other).values
        if rare_rows.any():
            # Assign through .loc on the frame itself. Calling
            # replace(..., inplace=True) on df_frame[cat] mutates a temporary
            # Series and is not guaranteed to reach the frame. The positional
            # boolean array avoids label alignment on a duplicated index.
            df_frame.loc[rare_rows, cat] = 'other'

In [8]:
# Read the sessions CSV, index the rows by INDEX_COLUMN, and replace every
# missing entry with the sentinel value -1.
sessions = (
    pd.read_csv(SESSSIONS_CSV_FILE)
    .set_index(INDEX_COLUMN)
    .fillna(-1)
)

In [13]:
# Collapse rare categorical values first so fewer (column, value) pairs need
# a frequency-count feature; this modifies `sessions` in place.
remove_rare_values_inplace(sessions, CATEGORICAL_FEATURES, VALUE_THRESHOLD)

# Build the two per-index feature tables.
frequency_counts = extract_frequency_counts(sessions, CATEGORICAL_FEATURES)
simple_stats = extract_distribution_stats(sessions, SECS_ELAPSED_NUMERICAL)

# Join the tables side by side, fill undefined statistics with the sentinel
# -1, and write the result to disk.
session_data = pd.concat([frequency_counts, simple_stats], axis=1).fillna(-1)
session_data.to_csv(OUTPUT_TO_CSV_FILE)

Extracting frequency counts for (action == show)
Extracting frequency counts for (action == similar_listings)
Extracting frequency counts for (action == index)
Extracting frequency counts for (action == search_results)
Extracting frequency counts for (action == confirm_email)
Extracting frequency counts for (action == create)
Extracting frequency counts for (action == header_userpic)
Extracting frequency counts for (action == lookup)
Extracting frequency counts for (action == collections)
Extracting frequency counts for (action == requested)
Extracting frequency counts for (action == qt2)
Extracting frequency counts for (action == personalize)
Extracting frequency counts for (action == update)
Extracting frequency counts for (action == track_page_view)
Extracting frequency counts for (action == notifications)
Extracting frequency counts for (action == active)
Extracting frequency counts for (action == similar_listings_v2)
Extracting frequency counts for (action == identity)
Extracting 