In [32]:
import os
import shutil
import time

import pandas as pd
import requests as r
from dotenv import load_dotenv
from geoip2 import webservice
from geoip2.errors import AddressNotFoundError

# Load variables from a .env file into the process environment and report
# the outcome either way, so missing credentials are noticed here rather than
# later inside the geocoder call.
if load_dotenv():
    print('env variables loaded')
else:
    print('WARNING: no .env variables loaded; GEOIP2_KEY / GEOIP2_AID must already be set in the environment')

env variables loaded


In [29]:
def log_to_df(
        log_path:str,
        fields:list=['date', 'time', 's-ip', 'cs-method', 'cs-uri-stem', 'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(User-Agent)', 'cs(Referer)', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes', 'time-taken']
        ):
    """
    Load a single space-delimited log file into a pandas dataframe.

    The first four lines of the file are treated as header/comment lines and
    are skipped while parsing. The file on disk is only read, never rewritten,
    so the function can be called repeatedly on the same log.

    Parameters
    ----------
    log_path : str
        Path of the log file to read.
    fields : list
        Column names, in file order, assigned to the parsed columns.

    Returns
    -------
    pandas.DataFrame
        One row per log record, plus a 1-based ``uid`` column.
    """
    # Skip the leading header lines at parse time. Rewriting the file in place
    # to drop them would remove four more (real) records on every re-run.
    df = pd.read_csv(log_path, sep=' ', header=None, names=fields, skiprows=4)

    # add a unique ID to dataframe so it can be cut into subsets
    df['uid'] = range(1, len(df) + 1)

    return df

def logs_to_df(
        folder_path:str,
        fields:list=['date', 'time', 's-ip', 'cs-method', 'cs-uri-stem', 'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(User-Agent)', 'cs(Referer)', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes', 'time-taken']
        ):
    """
    Load a folder of log files into a single pandas dataframe.

    All log files must have the same fields and follow the IIS W3C layout
    assumed here: four header lines followed by space-delimited records.

    Parameters
    ----------
    folder_path : str
        Folder to scan; only entries whose name ends in ``.log`` are read.
    fields : list
        Column names, in file order, assigned to the parsed columns.

    Returns
    -------
    pandas.DataFrame
        Records of every log, concatenated in filename order with a fresh
        0..n-1 index. Empty (but with the ``fields`` columns) when the folder
        holds no ``.log`` files.

    Prints the elapsed wall-clock time before returning.
    """
    start = time.time()

    # os.listdir gives no ordering guarantee; sort so the row order (and the
    # resulting index) is reproducible between runs and machines.
    log_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.log'))

    # one dataframe per log, skipping the 4 header lines of each file
    dfs = [
        pd.read_csv(os.path.join(folder_path, name), sep=' ', header=None, names=fields, skiprows=4)
        for name in log_names
    ]

    if dfs:
        folder_df = pd.concat(dfs, axis=0, ignore_index=True)
    else:
        # pd.concat refuses an empty list; return an empty frame with the
        # expected columns instead of failing with an opaque ValueError.
        folder_df = pd.DataFrame(columns=fields)

    print(f'took {time.time() - start} sec')
    return folder_df

# find ip metrics for a log dataframe
def get_ip_metrics(dataframe:pd.DataFrame):
    """
    Summarise the client IPs (``c-ip`` column) of a log dataframe.

    Returns a dict with:
      - ``most_common``: the most frequent IP (first of the modes when tied),
        or None when the dataframe has no rows
      - ``top_10``: up to ten IPs, ordered by descending request count
      - ``n_unique``: number of distinct IPs
    """
    client_ips = dataframe['c-ip']
    modes = client_ips.mode()
    return {
        # an empty frame has no mode; indexing [0] there would raise
        'most_common': modes.iloc[0] if not modes.empty else None,
        'top_10': list(client_ips.value_counts().head(10).index),
        'n_unique': client_ips.nunique(),
    }

In [30]:
# Load every .log file found in the October 2022 folder into one dataframe.
oct22 = logs_to_df(r'd:/metro/logs/202210/')
# Bare last expression so the notebook renders the frame.
oct22

log u_ex221001.log copied to df (16 columns, 194485 records)


log u_ex221002.log copied to df (16 columns, 153181 records)


log u_ex221003.log copied to df (16 columns, 578787 records)


log u_ex221004.log copied to df (16 columns, 549894 records)


log u_ex221005.log copied to df (16 columns, 619878 records)


log u_ex221006.log copied to df (16 columns, 522778 records)


log u_ex221007.log copied to df (16 columns, 460365 records)


log u_ex221008.log copied to df (16 columns, 188233 records)


log u_ex221009.log copied to df (16 columns, 142974 records)


log u_ex221010.log copied to df (16 columns, 628231 records)


log u_ex221011.log copied to df (16 columns, 596372 records)


log u_ex221012.log copied to df (16 columns, 542873 records)


log u_ex221013.log copied to df (16 columns, 526142 records)


log u_ex221014.log copied to df (16 columns, 424556 records)


log u_ex221015.log copied to df (16 columns, 162968 records)


log u_ex221016.log copied to df (16 columns, 178375 rec

Unnamed: 0,date,time,s-ip,cs-method,cs-uri-stem,cs-uri-query,s-port,cs-username,c-ip,cs(User-Agent),cs(Referer),sc-status,sc-substatus,sc-win32-status,sc-bytes,time-taken
0,2022-10-01,07:00:02,10.50.40.56,GET,/arcgis/rest/info,f=json,443,-,173.8.217.9,Mozilla/5.0+(Macintosh;+Intel+Mac+OS+X+10_15_7...,https://apps.intterragroup.com/,200,0,0,693,812
1,2022-10-01,07:00:23,10.50.40.56,GET,/arcgis/rest/services/photo/aerialphoto/MapServer,f=json&token=_ynehhLkr2PLl4SWzl6HFe8RQWLQJBRsT...,443,-,54.203.89.18,-,https://apps.intterragroup.com/,200,0,0,5605,20109
2,2022-10-01,07:00:26,10.50.40.56,GET,/arcgis/rest/services/basemap_metro_region/Map...,f=json&dpi=96&transparent=true&format=png8&tok...,443,-,54.203.89.18,-,https://apps.intterragroup.com/,200,0,0,4453,23906
3,2022-10-01,07:00:28,10.50.40.56,GET,/rlisapi2/js/,token=B8N3HMtBcyNXzSQtvIlJ8DKQUKwyQZRDbtPIyK71...,443,-,73.25.165.81,Mozilla/5.0+(Android+11;+Mobile;+rv:105.0)+Gec...,https://www.oregonmetro.gov/,200,0,0,23690,249
4,2022-10-01,07:00:29,10.50.40.56,GET,/arcgis/rest/services/RHIC/Metro_High_Injury_c...,f=pjson,443,-,185.191.171.42,Mozilla/5.0+(compatible;+SemrushBot/7~bl;++htt...,-,200,0,0,2069,5390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25444627,2022-11-01,06:58:37,10.50.40.56,GET,/arcgis/rest/services/taxlots/MapServer/legend,f=json&token=B98GdhPZcAQggd2ecYDSxOU6-xiC9a7Cf...,443,-,34.219.108.240,-,https://apps.intterragroup.com/,200,0,0,1301,78
25444628,2022-11-01,06:58:37,10.50.40.56,GET,/arcgis/rest/services/basemap_metro_region/Map...,f=json&token=B98GdhPZcAQggd2ecYDSxOU6-xiC9a7Cf...,443,-,34.219.108.240,-,https://apps.intterragroup.com/,200,0,0,1189,109
25444629,2022-11-01,06:58:55,10.50.40.56,GET,/,-,443,-,66.249.79.40,Mozilla/5.0+(compatible;+Googlebot/2.1;++http:...,-,304,0,0,245,250
25444630,2022-11-01,06:58:57,10.50.40.56,GET,/script.js,-,443,-,66.249.79.40,"Mozilla/5.0+AppleWebKit/537.36+(KHTML,+like+Ge...",https://gis.oregonmetro.gov/,200,0,0,122413,250


In [None]:
# `df` is not defined by any earlier cell; bind it to the frame loaded from
# the October 2022 logs so this cell runs on a fresh kernel.
df = oct22

# find the proportion of stems relevant to metromap (actual usages)
# na=False: a field that parsed as NaN counts as "no match"; without it the
# mask contains NaN and boolean indexing raises.
mm_stem = df[df['cs-uri-stem'].str.contains('metromap', na=False)]
mm_stem_proportion = len(mm_stem) / len(df)
print(mm_stem_proportion)

# identify requests referred FROM metromap
mm_rfrr = df[df['cs(Referer)'].str.contains('metromap', na=False)]
mm_rfrr_proportion = len(mm_rfrr) / len(df)
print(mm_rfrr_proportion)

# identify unsecured ('http://') metromap usages: keep only the referred
# requests whose referer URL uses the plain-http scheme
mm_rfrr_unsec = mm_rfrr[mm_rfrr['cs(Referer)'].str.startswith('http://', na=False)]

In [None]:
# find ip metrics for the metromap subsets
# get_ip_metrics is defined once, in the helper-function cell above; it is not
# redefined here. Both results are collected into one dict because a cell only
# renders its last bare expression, so two separate calls would show just the
# second result.
{
    'mm_stem': get_ip_metrics(mm_stem),
    'mm_rfrr': get_ip_metrics(mm_rfrr),
}

In [None]:
def geocode_ip(dataframe: pd.DataFrame, sample_size: int = 100):
    """
    Geocode the client IPs (``c-ip``) of a random sample of rows.

    only 1000 requests/day are permitted; consider upgrading to a paid svc

    Parameters
    ----------
    dataframe : pd.DataFrame
        Log records; must contain a ``c-ip`` column.
    sample_size : int
        Number of rows to sample at random. Capped at the number of rows
        available, so a small frame no longer makes ``sample`` raise.

    Returns
    -------
    pd.DataFrame
        The sampled rows with three added columns: ``lat`` / ``lon`` (NaN when
        the IP could not be located) and ``reserved_ip`` (True when the
        service reported the address as not found).

    Raises
    ------
    RuntimeError
        If GEOIP2_KEY or GEOIP2_AID is missing from the environment.
    """
    # geocoder api stuff
    key = os.getenv('GEOIP2_KEY') # api key
    aid = os.getenv('GEOIP2_AID') # acc id
    if not key or not aid:
        raise RuntimeError('GEOIP2_KEY and GEOIP2_AID must be set in the environment')

    # take a random sample of rows to reduce burden on geocoder
    subset = dataframe.sample(n=min(sample_size, len(dataframe)))
    # NaN placeholders keep the coordinate columns numeric
    subset['lat'] = float('nan')
    subset['lon'] = float('nan')
    subset['reserved_ip'] = False

    # ip -> [lon, lat], or None when the service does not know the address.
    # Each distinct ip is looked up once so repeats do not spend the quota.
    lookups = {}

    # for each row, geocode its ip
    with webservice.Client(aid, key, host='geolite.info') as client:
        for row_index, ip in subset['c-ip'].items():
            if ip not in lookups:
                try:
                    location = client.city(ip).location
                    lookups[ip] = [location.longitude, location.latitude]
                except AddressNotFoundError:
                    lookups[ip] = None

            coords = lookups[ip]
            if coords is None:
                subset.loc[row_index, 'reserved_ip'] = True
            else:
                subset.loc[row_index, ['lon', 'lat']] = coords

    return subset

         
# Geocode a random sample of 10 metromap-stem requests; the sample is kept
# small because the geocoding service is rate-limited per day.
mm_stem_geocoded = geocode_ip(mm_stem, sample_size=10)

In [None]:
# Bare expression so the notebook renders the geocoded sample.
mm_stem_geocoded

In [None]:
# look at User Agent info. Assumes that ios, Mozilla