# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import re
import os
import pickle

# Data reading and cleaning

In [31]:
# Where the raw csv lives and where its pickled copy is cached.
DATA_DIR = 'data'
FILE_NAME = 'train_1'
CSV_PATH = os.path.join(DATA_DIR, FILE_NAME + '.csv')
PKL_PATH = os.path.join(DATA_DIR, FILE_NAME + '.pkl')

def load_data() -> pd.DataFrame:
    '''
    Returns the raw dataset as a dataframe.
    Reads the pickle at PKL_PATH when it exists; otherwise parses the csv at
    CSV_PATH and pickles the result so later calls can load it instead.
    '''
    cached = os.path.exists(PKL_PATH)
    print('Loading pickle...' if cached else 'Loading csv...')

    if cached:
        frame = pd.read_pickle(PKL_PATH)
    else:
        frame = pd.read_csv(CSV_PATH)
        # write the cache right away so the csv is parsed only once
        frame.to_pickle(PKL_PATH)

    print('Done!')
    return frame

In [32]:
def read_all() -> pd.DataFrame:
    '''
    Returns the dataset indexed by Page, sorted by that index, with the
    column labels cast to dates ('M8[D]').
    The treated dataframe is pickled to DATA_DIR/treated.pkl; when that file
    already exists it is loaded directly and nothing is recomputed.
    '''
    cache_path = os.path.join(DATA_DIR, 'treated.pkl')

    # Fast path: a previous call already pickled the treated dataframe.
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    treated = load_data().set_index('Page').sort_index()
    treated.columns = treated.columns.astype('M8[D]')

    print('Pickling treated data...')
    treated.to_pickle(cache_path)
    print('Done!')
    return treated

In [33]:
def read_interval(start, end) -> pd.DataFrame:
    '''
    Returns dataframe within specified values: ts[start:end]

    start: label of the first column to keep. A falsy value (e.g. None)
        leaves the lower bound open.
    end: label of the last column to keep (label slices with .loc include
        the end label). A falsy value leaves the upper bound open.

    When neither bound is given, the dataframe from read_all() is returned
    unchanged.
    '''
    df = read_all()

    # Normalise "not provided" to None so it can be used as an open slice bound.
    lower = start or None
    upper = end or None

    if lower is None and upper is None:
        return df
    # A single open-ended slice covers start-only, end-only and both-given;
    # previously a start without an end was silently ignored.
    return df.loc[:, lower:upper]

In [34]:
# No bounds given: load every date column.
df = read_interval(None, None)

Loading csv...
Done!
Pickling treated data...
Done!


In [35]:
df.head()

Unnamed: 0_level_0,2015-07-01 00:00:00,2015-07-02 00:00:00,2015-07-03 00:00:00,2015-07-04 00:00:00,2015-07-05 00:00:00,2015-07-06 00:00:00,2015-07-07 00:00:00,2015-07-08 00:00:00,2015-07-09 00:00:00,2015-07-10 00:00:00,...,2016-12-22 00:00:00,2016-12-23 00:00:00,2016-12-24 00:00:00,2016-12-25 00:00:00,2016-12-26 00:00:00,2016-12-27 00:00:00,2016-12-28 00:00:00,2016-12-29 00:00:00,2016-12-30 00:00:00,2016-12-31 00:00:00
Page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!vote_en.wikipedia.org_all-access_all-agents,3.0,4.0,7.0,4.0,4.0,2.0,3.0,7.0,2.0,,...,3.0,1.0,6.0,3.0,1.0,1.0,4.0,3.0,1.0,1.0
!vote_en.wikipedia.org_all-access_spider,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,2.0,,...,2.0,0.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0
!vote_en.wikipedia.org_desktop_all-agents,3.0,4.0,7.0,4.0,4.0,2.0,3.0,7.0,2.0,,...,3.0,1.0,6.0,3.0,1.0,1.0,3.0,3.0,1.0,0.0
"""Awaken,_My_Love!""_en.wikipedia.org_all-access_all-agents",,,,,,,,,,,...,4923.0,5074.0,4149.0,4209.0,3783.0,3994.0,3910.0,4006.0,3841.0,3517.0
"""Awaken,_My_Love!""_en.wikipedia.org_all-access_spider",,,,,,,,,,,...,89.0,12.0,63.0,15.0,38.0,14.0,59.0,40.0,35.0,45.0


In [91]:
df.shape

(145063, 550)

16 min with DataFrame access <br>
6 s with NumPy arrays <br>
289 ms with numba <br>

In [98]:
%%time
import time
import numba

@numba.jit(nopython=True)
def calculate_start_end(data: np.ndarray):
    '''
    Calculates start and end of each series (one series per row of data).
    Start = column index of the first value that is not nan and is > 0
    End = column index of the last value that is not nan and is > 0
    Returns (start_idx, end_idx): two int32 arrays with one entry per row.
    An entry stays -1 when the row has no such value.
    '''
    rows, columns = data.shape[0], data.shape[1]
    
    # array where idx will be stored. if none is found, defaults to -1
    start_idx = np.full(rows, -1, dtype=np.int32)
    end_idx = np.full(rows, -1, dtype=np.int32)
    
    for page in range(rows):
        # start idx: scan left to right and stop at the first value that is
        # not nan and greater than zero
        for day in range(columns):
            if not np.isnan(data[page, day]) and data[page, day] > 0:
                start_idx[page] = day
                break
        # end idx: same test, scanning right to left
        for day in range(columns -1, -1, -1):
            if not np.isnan(data[page, day]) and data[page, day] > 0:
                end_idx[page] = day
                break
        
    return start_idx, end_idx

# Per-row column positions of the first and last value that is not nan and > 0.
start, end = calculate_start_end(df.values)



CPU times: user 291 ms, sys: 0 ns, total: 291 ms
Wall time: 289 ms


In [105]:
def clean_data(threshold, start=None, end=None):
    '''
    Loads data, setting Page as index, and columns as datetime dtypes.
    Removes series whose span (end index - start index), divided by the
    number of columns, is below threshold.

    threshold: minimum span-to-columns ratio a series needs to be kept.
    start, end: optional column labels forwarded to read_interval.

    Returns, all restricted to the series that were kept:
        normalized series (log1p, with nan filled as 0),
        boolean dataframe marking the values that were nan,
        start indexes, end indexes (as given by calculate_start_end).
    '''
    df = read_interval(start, end)

    # Separate names: the start/end parameters are column labels, whereas
    # these are per-row column positions.
    start_idx, end_idx = calculate_start_end(df.values)
    bool_mask = ~(((end_idx - start_idx) / df.shape[1]) < threshold)
    df = df[bool_mask]

    nan_values = pd.isnull(df)
    # Filter the index arrays with the same mask so that row i of the
    # returned dataframe matches entry i of the returned start/end arrays.
    return (
        np.log1p(df.fillna(0)),
        nan_values,
        start_idx[bool_mask],
        end_idx[bool_mask],
    )

In [106]:
# Drop series whose start-to-end span covers less than 30% of the columns.
normalized, nans, start, end = clean_data(0.3)

In [111]:
# Number of series removed by clean_data.
df.shape[0] - normalized.shape[0]

4929

In [114]:
end

array([549, 548, 548, ..., 549, 549, 549], dtype=int32)

# Extracting features from URL

In [None]:
# Page names have the form <title>_<lang>.<site>_<access>_<agent>, where the
# "<lang>." prefix is optional. Raw string so that "\." reaches the regex
# engine without being an invalid string escape.
pat = re.compile(
    r'(.+)_([a-z]{2}\.)?((?:wikipedia\.org)|(?:commons\.wikimedia\.org)|(?:www\.wikimedia\.org)|(?:www\.mediawiki\.org))_([a-z-]+?)_([a-z-]+)$'
)

def extract_from_url(urls):
    '''
    receives pandas dataframe column or series (any sequence of page names
    is accepted)
    returns a pandas dataframe with all the extracted features, one row per
    page name, with columns:
        page    - the page name as received
        title   - article title
        agent   - last component of the name (e.g. 'spider', 'all-agents')
        access  - second to last component (e.g. 'all-access', 'desktop')
        site    - site domain (e.g. 'wikipedia.org')
        country - two letter language prefix of the site, 'na' when absent

    raises AssertionError when a page name does not match the pattern
    '''

    if isinstance(urls, pd.Series):
        urls = urls.values

    # Plain lists keep full-length Python strings whatever the dtype of the
    # input; arrays pre-filled from a fixed-width string input would truncate.
    titles, countries, sites, accesses, agents = [], [], [], [], []

    for url in urls:
        match = pat.fullmatch(url)
        if match is None:
            # Raised explicitly (rather than with an assert statement) so the
            # check is still performed when Python runs with -O.
            raise AssertionError("regex pattern matching failed %s" % url)

        # Group order follows the page name: access comes before agent.
        title, country, site, access, agent = match.groups()

        titles.append(title)
        # the optional language group includes its trailing dot, e.g. 'en.'
        countries.append(country[:-1] if country else 'na')
        sites.append(site)
        accesses.append(access)
        agents.append(agent)

    return pd.DataFrame({
        'page': urls,
        'title': titles,
        'agent': agents,
        'access': accesses,
        'site': sites,
        'country': countries,
    })