In [46]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
# parser v1
# parse records to list of dicts
# values mostly are strings
# easy to iterate through records
#
# Line layout as read by this cell (space-separated tokens):
#   [0] timestamp, [1] displayed article, [2] click flag,
#   [3] never read here (presumably a section marker -- TODO confirm),
#   [4..] integer user features up to the first token starting with '|',
#   and from that token to the end of the line, the article pool.
records = []
with open('test.txt') as f:
    # Iterate over the file object directly so lines are streamed instead of
    # loading the whole file into memory with readlines().
    for line in f:
        components = line.strip().split(' ')
        if components == ['']:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        # Default to "no pool on this line". Without a per-line default, a line
        # lacking a '|' token would reuse the index left over from the previous
        # line (or raise NameError on the very first record).
        pool_start = len(components)
        user_features = []
        for i in range(4, len(components)):
            if components[i].startswith('|'):
                pool_start = i
                break
            user_features.append(int(components[i]))
        records.append({
            'timestamp': components[0],
            'displayed': components[1],
            'clicked': components[2],
            'user_features': user_features,
            'pool': components[pool_start:],
        })

In [19]:
# Inspect the first parsed record; indexing with 0 requires `records` to be
# the list of dicts built by parser v1 (not the dict of lists from parser v2).
records[0]

{'clicked': '0',
 'displayed': 'id-560620',
 'pool': ['|id-552077',
  '|id-555224',
  '|id-555528',
  '|id-559744',
  '|id-559855',
  '|id-560290',
  '|id-560518',
  '|id-560620',
  '|id-563115',
  '|id-563582',
  '|id-563643',
  '|id-563787',
  '|id-563846',
  '|id-563938',
  '|id-564335',
  '|id-564418',
  '|id-564604',
  '|id-565364',
  '|id-565479',
  '|id-565515',
  '|id-565533',
  '|id-565561',
  '|id-565589',
  '|id-565648',
  '|id-565747',
  '|id-565822'],
 'timestamp': '1317513291',
 'user_features': [1,
  9,
  11,
  13,
  23,
  16,
  18,
  17,
  19,
  15,
  43,
  14,
  39,
  30,
  66,
  50,
  27,
  104,
  20]}

In [40]:
# parser v2
# parse records to dict of lists
# values are mostly strings
# easy to feed in pandas
#
# Line layout as read by this cell (space-separated tokens):
#   [0] timestamp, [1] displayed article, [2] click flag,
#   [3] never read here (presumably a section marker -- TODO confirm),
#   [4..] integer user features up to the first token starting with '|',
#   and from that token to the end of the line, the article pool.
records = {'timestamp':[],
          'displayed':[],
          'clicked':[],
          'user_features':[],
          'pool':[]}
with open('test.txt') as f:
    # Iterate over the file object directly so lines are streamed instead of
    # loading the whole file into memory with readlines().
    for line in f:
        components = line.strip().split(' ')
        if components == ['']:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        # Default to "no pool on this line". Without a per-line default, a line
        # lacking a '|' token would reuse the index left over from the previous
        # line (or raise NameError on the very first record).
        pool_start = len(components)
        user_features = []
        for i in range(4, len(components)):
            if components[i].startswith('|'):
                pool_start = i
                break
            user_features.append(int(components[i]))
        # Append to every column only after the line parsed completely, so a
        # failure mid-line cannot leave the columns with different lengths.
        records['timestamp'].append(components[0])
        records['displayed'].append(components[1])
        records['clicked'].append(components[2])
        records['user_features'].append(user_features)
        records['pool'].append(components[pool_start:])

In [41]:
# Build a DataFrame from `records`; this expects the dict-of-lists form
# produced by parser v2, where each key becomes one column.
df = pd.DataFrame(records)
# Preview only the first rows rather than dumping the whole frame.
df.head()

Unnamed: 0,timestamp,displayed,clicked,user_features,pool
0,1317513291,id-560620,0,"[1, 9, 11, 13, 23, 16, 18, 17, 19, 15, 43, 14,...","[|id-552077, |id-555224, |id-555528, |id-55974..."
1,1317513291,id-565648,0,"[1, 9, 11, 13, 16, 15, 14]","[|id-552077, |id-555224, |id-555528, |id-55974..."
2,1317513291,id-563115,0,"[1, 8, 11, 36, 13, 22, 23, 16, 18, 54, 24, 26,...","[|id-552077, |id-555224, |id-555528, |id-55974..."
3,1317513292,id-552077,0,"[1, 7, 11, 37, 13, 23, 16, 18, 17, 35, 15, 14,...","[|id-552077, |id-555224, |id-555528, |id-55974..."
4,1317513292,id-564335,0,[1],"[|id-552077, |id-555224, |id-555528, |id-55974..."


In [62]:
# parser that watches article lifespans
# Builds `lives`: a dict mapping each '|id-...' token to the set of integer
# timestamps (first token of the line) of every line that token appears on.
lives = {}
with open('test.txt') as f:
    # Iterate over the file object directly so lines are streamed instead of
    # loading the whole file into memory with readlines().
    for line in f:
        components = line.strip().split(' ')
        article_ids = [c for c in components if c.startswith('|id-')]
        if not article_ids:
            # No article tokens on this line (e.g. a blank line): skip it
            # before trying to parse a timestamp.
            continue
        timestamp = int(components[0])
        for article_id in article_ids:
            # setdefault creates the set on first sight AND records this
            # timestamp; previously the first sighting only created an empty
            # set, so each article's earliest timestamp was dropped.
            lives.setdefault(article_id, set()).add(timestamp)

In [63]:
# Display the full mapping of '|id-...' article token -> set of timestamps
# built by the lifespan parser cell.
lives

{'|id-552077': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-555224': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-555528': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-559744': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-559855': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-560290': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-560518': {1317513291,
  1317513292,
  1317513293,
  1317513294,
  1317513295,
  1317513296,
  1317513297,
  1317513298,
  1317513299},
 '|id-560620'