# Loading Web Data from July 2019 through October 2020, and Filtering to Interactions with Our LA Listings

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Lift the row limit on displayed frames/series (None = no limit),
# so cell outputs are never truncated.
pd.set_option('display.max_rows', None)

#### Import webdata from shared drive

In [None]:
# import gdown

# url = '---'
# output = 'webevents_info.zip'
# gdown.download(url, output, quiet=False) 

In [None]:
# !unzip webevents_info.zip -d web_events_full

In [None]:
import os

# Folder that holds the web-event CSV parts.
foldername = 'webevents_info'
# Build every path from `foldername` (instead of repeating the literal) so the
# directory that is listed and the directory that is joined cannot drift apart.
# os.listdir returns entries in arbitrary order, so sort to make the order in
# which the files are read and concatenated reproducible between runs.
filenames = sorted(os.path.join(foldername, f) for f in os.listdir(foldername))

In [None]:
# Number of entries found in the web-events folder
# (i.e. how many files the web data was split into).
len(filenames)

50

### Matching the incoming webevents data to the list of RexUrls that we have full metadata for in LA

In [None]:
# Load the listing file and keep just its 'rexUrl' column as a Series.
our_rexurls = pd.read_csv('list_of_rexurls.csv').loc[:, 'rexUrl']

In [None]:
# Preview the first five listing URLs.
our_rexurls.iloc[:5]

0       rex1-pacific-coast-hwy
1    4080-glencoe-ave-unit-316
2               175-sequoia-dr
3                 6560-3rd-ave
4        1258-barry-ave-unit-5
Name: rexUrl, dtype: object

In [None]:
# Read each web-event file, keep only the rows whose rexurl is one of our
# listings, and collect the filtered pieces for concatenation.
all_data_list = []
n_files = len(filenames)
for file_no, csv_path in enumerate(filenames, start=1):
    events = pd.read_csv(csv_path)
    is_ours = events['rexurl'].isin(our_rexurls)
    matched = events.loc[is_ours].copy()

    all_data_list.append(matched)
    # Progress line: shape of the filtered piece, then position in the file list.
    print(matched.shape, ':', file_no, 'out of', n_files)

In [None]:
# Stack the per-file pieces row-wise; each piece keeps its own row labels.
all_webdata = pd.concat(all_data_list, axis=0, ignore_index=False)

#### Inspect details of final dataframe

In [None]:
# (rows, columns) of the combined, filtered web data.
all_webdata.shape

(5155882, 5)

In [None]:
# Earliest event timestamp. Series.min() lets pandas do the reduction instead
# of the builtin min() iterating the whole column element by element in Python.
all_webdata['event_stamp'].min()

'2019-07-01 00:03:24.181'

In [None]:
# Latest event timestamp. Series.max() lets pandas do the reduction instead
# of the builtin max() iterating the whole column element by element in Python.
all_webdata['event_stamp'].max()

'2020-10-31 23:53:31.664'

In [None]:
# Sort by ip and time, then renumber the rows.
# The index is reset *after* sorting so it ends up as a clean 0..n-1 range in
# sorted order; resetting first would leave the labels shuffled by the sort.
# NOTE(review): event_stamp is sorted in whatever dtype it has at this point —
# if it is still text, this assumes a uniform timestamp format; confirm.
all_webdata.sort_values(['ip', 'event_stamp'], inplace=True)
all_webdata.reset_index(drop=True, inplace=True)

In [None]:
# Parse the event_stamp column into pandas datetimes (overwrites the column).
all_webdata['event_stamp'] = pd.to_datetime(all_webdata.event_stamp)

In [None]:
# Preview the first 30 rows of the sorted web data.
all_webdata.iloc[:30]

In [None]:
# Persist the combined web data to disk.
pd.to_pickle(all_webdata, 'all_webdata.pkl')

## Filtering to user-listing viewings that lasted at least 15 seconds (time between an IP's first and last event on a listing)

In [None]:
# Seconds between the first and last event of every (ip, rexurl) pair.
# Grouped max()/min() are vectorised reductions; this replaces a per-group
# Python lambda that called the builtin max()/min() on each group.
# NOTE(review): the span covers all of a pair's events in the data, so separate
# visits by the same ip to the same listing are not distinguished — confirm
# that this is the intended definition of viewing duration.
stamps_by_pair = all_webdata.groupby(['ip', 'rexurl'])['event_stamp']
duration_data = (stamps_by_pair.max() - stamps_by_pair.min()).dt.total_seconds()
# Keep the result unnamed, as the apply()-based version was, so that a plain
# reset_index() still yields a values column labelled 0.
duration_data.name = None

In [None]:
# Turn the (ip, rexurl) index levels into columns and label the values
# column 'rexUrl_duration'.
duration_data_df = duration_data.reset_index(name='rexUrl_duration')

In [None]:
# Attach each (ip, rexurl) pair's duration to every one of its event rows.
all_webdata2 = pd.merge(
    all_webdata,
    duration_data_df,
    how='left',
    on=['ip', 'rexurl'],
)

In [None]:
# Keep only the events whose (ip, rexurl) duration is 15 seconds or more.
is_long_enough = all_webdata2['rexUrl_duration'] >= 15
all_webdata_filtered = all_webdata2.loc[is_long_enough].copy()

In [None]:
# Compare (rows, columns) before and after the duration filter.
for frame in (all_webdata, all_webdata_filtered):
    print(frame.shape)

(5155882, 5)
(4617883, 6)


In [None]:
# Persist the duration-filtered web data to disk.
pd.to_pickle(all_webdata_filtered, 'all_webdata_filtered.pkl')