In [2]:
import pandas as pd
import numpy as np
import re

In [32]:
file_name = "https://gist.githubusercontent.com/lexman/3548bf1f3c733fed7b1322139f67805b/raw/3dcdab5108d19f298bf163263cc4c1d93aeef649/tornik-map-20171006.10000.tsv"
file = pd.read_csv(file_name, sep="\t", header=None)
data = file.rename(columns={0:"Timestamp", 1:"log_file"})
print(data.head())

                 Timestamp                                           log_file
0  2017-10-05 00:01:09,676                /map/1.0/slab/traffic/256/9/260/177
1  2017-10-05 00:01:09,706       /map/1.0/slab/standard_hd/256/16/33495/22429
2  2017-10-05 00:01:09,714       /map/1.0/slab/standard_hd/256/16/32798/22739
3  2017-10-05 00:01:09,724  /wmts?LAYER=ORTHOIMAGERY.ORTHOPHOTOS&VERSION=1...
4  2017-10-05 00:01:09,724             /map/1.0/slab/photo/256/16/33728/23643


In [33]:
# Drom rows contains null values
data = data.dropna()

In [39]:
# Copy data from data to data1
data1 = data.copy()

In [40]:
# Get Log URLs having defined pattren 
data1 = data1[data1['log_file'].str.contains(r"\/map.*\/256(\/|$)")]
data1 = data1.reset_index().drop(columns=['index'])

  return func(self, *args, **kwargs)


In [41]:
data1

Unnamed: 0,Timestamp,log_file
0,"2017-10-05 00:01:09,676",/map/1.0/slab/traffic/256/9/260/177
1,"2017-10-05 00:01:09,706",/map/1.0/slab/standard_hd/256/16/33495/22429
2,"2017-10-05 00:01:09,714",/map/1.0/slab/standard_hd/256/16/32798/22739
3,"2017-10-05 00:01:09,724",/map/1.0/slab/photo/256/16/33728/23643
4,"2017-10-05 00:01:09,745",/map/1.0/slab/standard/256/19/263920/186677
...,...,...
7082,"2017-10-05 00:02:18,880",/map/1.0/slab/traffic/256/18/133137/88416
7083,"2017-10-05 00:02:18,884",/map/1.0/slab/traffic/256/18/133141/88415
7084,"2017-10-05 00:02:18,893",/map/1.0/slab/standard_hd/256/16/67949/46674
7085,"2017-10-05 00:02:18,900","/map/1.0/multi-descr/standard/256/14/8292,5644..."


In [42]:
# Split log URLs information
# Extract important information from log URLs and stored it in new columns named as "view_mode" and "zoom_level"
log_file_list = []
for i in data1["log_file"]:
  if len(i.split('/')) >= 6:
    d = dict()
    d["view_mode"] = i.split('/')[4]
    d["zoom_level"] = i.split('/')[6]
    log_file_list.append(d)

# New dataframe wth new added columns
new_df = pd.DataFrame().from_dict(log_file_list)
df = pd.concat([data1, new_df], axis=1, sort=False).reindex(data1.index)
df.head()

Unnamed: 0,Timestamp,log_file,view_mode,zoom_level
0,"2017-10-05 00:01:09,676",/map/1.0/slab/traffic/256/9/260/177,traffic,9
1,"2017-10-05 00:01:09,706",/map/1.0/slab/standard_hd/256/16/33495/22429,standard_hd,16
2,"2017-10-05 00:01:09,714",/map/1.0/slab/standard_hd/256/16/32798/22739,standard_hd,16
3,"2017-10-05 00:01:09,724",/map/1.0/slab/photo/256/16/33728/23643,photo,16
4,"2017-10-05 00:01:09,745",/map/1.0/slab/standard/256/19/263920/186677,standard,19


In [43]:
# List of Zoom level elements as per view mode consecutively
Vmode_Zlevel = []
preval = '<unknown>'
for curVmode, curZL, i in zip(df["view_mode"], df["zoom_level"], range(len(df["zoom_level"]))):
  
  if curVmode == preval:
    Vmode_Zlevel[-1].append(curZL)  
  else:
    Vmode_Zlevel.append([curZL])
    preval = curVmode
# Get unique list of zoom level 
unique_zoom_level = []
for i in Vmode_Zlevel:
  dup_free = list(set(i))
  unique_zoom_level.append(dup_free)

len(unique_zoom_level)

4636

In [44]:
# "groupby" method of itertools to group "view_mode" consecutively and get their count
from itertools import groupby
groups = []
uniquekeys = []

for i, j in groupby(df["view_mode"]):
  groups.append(len(list(j)))
  uniquekeys.append(i)

# Final dataframe with required information from log URLs
final_df = pd.DataFrame(columns=["view_mode", "count", "zoom_level"])
final_df["view_mode"] = uniquekeys
final_df["count"] = groups
final_df["zoom_level"] = [','.join(i) for i in unique_zoom_level] 

final_df.head(20)

Unnamed: 0,view_mode,count,zoom_level
0,traffic,1,9
1,standard_hd,2,16
2,photo,1,16
3,standard,2,1719
4,traffic_hd,1,12
5,standard,1,13
6,traffic,1,14
7,standard,1,19
8,traffic,1,14
9,photo,1,17


In [45]:
# Saved file in tabular format
parsed_logs_file = final_df.to_csv('parsed_logs_file.tsv', sep = '\t', index=False)