# Analyzing IICW3C logs
python environment, functions, and examples
<br>
Sam Gartrell | 6/30/23
<br>

## environment
- Use conda to clone the environment stored in `environment.yml`
    ```
    conda env create -f environment.yml
    conda activate iisw3c-tool
    ```

- if using this notebook, set the kernel to 'parse-iis'

- you will also need a .env file with credentials for the `geoip2` geocoder. You can set up a free account [here](maxmind.com/en/geoip2-databases)

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import requests as r
import pprint
import time
from geoip2 import webservice, errors
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
from scipy.stats import norm
import statistics
from user_agents import parse

if load_dotenv():
    print('env variables loaded')

env variables loaded


## functions
Use these functions to access and analyze data stored in local log files. Recommend first developing analysis on a single log, then expanding it to target the month or set of months.

In [3]:
def log_to_df(
        log_path:str,
        fields:list=['date', 'time', 's-ip', 'cs-method', 'cs-uri-stem', 'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(User-Agent)', 'cs(Referer)', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes', 'time-taken']
        ):
    """
    loads a log file into a pandas dataframe
    """
    # Read the log file into a DataFrame
    df = pd.read_csv(log_path, sep=' ', header=None, names=fields, skiprows=4)

    # add a unique ID to dataframe so it can be cut into subsets
    df['uid'] = range(1, len(df) + 1)

    return df

def logs_to_df(
        folder_path:str,
        fields:list=['date', 'time', 's-ip', 'cs-method', 'cs-uri-stem', 'cs-uri-query', 's-port', 'cs-username', 'c-ip', 'cs(User-Agent)', 'cs(Referer)', 'sc-status', 'sc-substatus', 'sc-win32-status', 'sc-bytes', 'time-taken'],
        filter_mm_rlis=False):
    """
    loads a folder of log files into a single pandas dataframe. all log files must have the same fields, and follow the ISSW3C spec.
    """
    start = time.time()

    # initialize list for per-log dataframes, later flattened
    dfs = []

    # for each log file in the target folder,
    for file in os.listdir(folder_path):
        if file.endswith('.log'):
            print(f'\t\tloading {file} to dataframe')
            log = os.path.join(folder_path, file)
            
            # read it into a dataframe (ignore 1st 4 lines, in keeping with ISSW3C format)
            # add it to the list
            if filter_mm_rlis:
                df = filter_metromaps_rlis(
                    pd.read_csv(log, sep=' ', header=None, names=fields, skiprows=4)
                )
            else:
                df = pd.read_csv(log, sep=' ', header=None, names=fields, skiprows=4)

            dfs.append(df)
            # print(file, "read successfully")

    # flatten the list
    folder_df = pd.concat(dfs, axis=0, ignore_index=True)

    # add a unique ID to dataframe so it can be cut into subsets
    folder_df['uid'] = range(1, len(folder_df) + 1)


    print(f'took {time.time() - start} sec')
    return folder_df

def geocode_ip(dataframe: pd.DataFrame, sample_size: int = 100):
    """
    returns a geodataframe of points representing IP addresses from the dataframe passed.
    Works well with a df that's already grouped by IP address, to maximize the number of unique IPs sampled and geocoded.
    only 1000 requests/day are permitted; consider upgrading to a paid svc 
    """
    # geocoder api stuff
    key = os.getenv('GEOIP2_KEY') # api key
    aid = os.getenv('GEOIP2_AID') # acc id

    # get every nth record to reduce burden on geocoder
    subset = dataframe.sample(n=sample_size)
    subset['lat'] = ""
    subset['lon'] = ""
    subset['reserved_ip'] = False

    # for each row, geocode its ip
    with webservice.Client(aid, key, host='geolite.info') as client:
        for row_index, row in subset.iterrows():
            try:
                response = client.city(row['c-ip'])
                lon = response.location.longitude
                lat = response.location.latitude
                subset.loc[row_index, ['lon', 'lat']] = [lon, lat]
            except errors.AddressNotFoundError:
                subset.loc[row_index, 'reserved_ip'] = True

    spatial_subset = subset[subset['reserved_ip'] == False]
    geo = [
        Point(lon, lat) for
            lon, lat in 
                zip(
                    spatial_subset['lon'],
                    spatial_subset['lat']
                )]
    
    return gpd.GeoDataFrame(spatial_subset, geometry=geo)


def parse_user_agent(user_agent):
    ua = parse(user_agent)
    
    device = {
        'brand': ua.device.brand,
        'model': ua.device.model,
        'os': f'{ua.os.family} {ua.os.version_string}'
    }
    
    browser = {
        'family': ua.browser.family,
        'version': ua.browser.version_string
    }
    
    is_bot = any(keyword in user_agent.lower() for keyword in ['bot', 'crawler', 'spider'])
    
    return {
        'agent_str': user_agent,
        'device': device,
        'browser': browser,
        'is_bot': is_bot
    }


def batch_stats(function, log_dir=r'd:/metro/logs', just_mm_rlis=False):
    """
    iteratively applies a stats function to every month in the log folder, reporting results
    """


    for i in os.listdir(log_dir):
        print(i)
        if i.endswith('.zip') or i.endswith('.txt'):
            pass
        else:
            # make the log for the month
            month = logs_to_df(os.path.join(log_dir, i), filter_mm_rlis=just_mm_rlis)

            # get and view ip stats
            stats = function(month)
            print(
                i.split('/')[-1],
                pprint.pformat(stats)
            )
            month = None
            print('__________________________________________')

def ip_request_statistics(dataframe:pd.DataFrame, use_log=True):
    """
    computes mean, standard deviation of request counts by IP,
    and gives z-score of most prolific IP (base-2 logarithm-transformed if use_log==True),
    for the dataframe passed
    """
    # Calculate the request count per IP address (log-transformed)
    request_counts = dataframe['c-ip'].value_counts()
    if use_log:
        request_counts = np.log2(
            request_counts
        )
        

    # Calculate the mean and standard deviation of request counts
    mean_requests = request_counts.mean()
    std_requests = request_counts.std()

    # Identify the most prolific IP address
    most_prolific_ip = request_counts.idxmax()

    # Calculate the z-score for the most prolific IP address
    z_score = (request_counts[most_prolific_ip] - mean_requests) / std_requests

    return {
        # the mean number of requests coming from a single IP
        'mean_request_count': mean_requests,

        # the dispersion of the above
        'stdev_request_count': std_requests,

        # top 10 most prolific IP addresses
        'top_10': list(dataframe['c-ip'].value_counts().head(10).index),

        # the number of unique IP addresses
        'n_unique': dataframe['c-ip'].nunique(),

        # address, request count, and z-score (prominence) of most prolific IP in the dataframe
        'top': {
            'address': most_prolific_ip,
            'request_count': request_counts.max(),
            'z_score': z_score
        },

        # wether or not these statistics come from logarithm-transformed data
        'used_log': use_log
    }

def filter_services(dataframe):
    '''
    metro-specific: get insights on /services/ endpoints that aren't from ArcGIS rest and are referred froim metromaps referrers
    '''
    return dataframe[
    (dataframe['cs(Referer)'].str.contains('metromap')) &
    (dataframe['cs-uri-stem'].str.contains('/services/')) &
    (~dataframe['cs-uri-stem'].str.contains('/ArcGIS/')) &
    (~dataframe['cs-uri-stem'].str.contains('/arcgis/'))
    ]

def parse_uri_stem(row):
    '''
    used in append_uri_path to break apart URI stem into a list, given a series with appropriate stem field'''
    path = row['cs-uri-stem']
    if pd.isnull(path):
        return [None, None, None, None]
    else:
        split_values = path.split('/')
        if len(split_values) >= 5:
            return split_values[:5]
        else:
            split_values += [None] * (5 - len(split_values))
            return split_values
        
def append_uri_path(dataframe):
    '''
    appends first 5 elements of a uri stem as separate fields to the end of a dataframe, for each row
    '''

    # Apply parse_uri_stem function to 'cs-uri-stem' column and store the results
    split_values = dataframe.apply(parse_uri_stem, axis=1)

    # Create new columns for the first five values in the split_values DataFrame
    dataframe['1st_value'] = split_values.apply(lambda x: x[0])
    dataframe['2nd_value'] = split_values.apply(lambda x: x[1])
    dataframe['3rd_value'] = split_values.apply(lambda x: x[2])
    dataframe['4th_value'] = split_values.apply(lambda x: x[3])
    dataframe['5th_value'] = split_values.apply(lambda x: x[4])
    
    return dataframe

def filter_metromaps_rlis(dataframe):
    return dataframe[
    (dataframe['cs-uri-stem'].str.contains('rlisapi')) & (dataframe['cs(Referer)'].str.contains('metromap'))
    ]

def user_agent_statistics(dataframe:pd.DataFrame, use_log:bool=True):
    """
    looks at the distribution and characteristics of User Agent info,
    and gives z-score of most prolific IP (base-2 logarithm-transformed if use_log)
    """
    # Calculate the request count per IP address (log-transformed)
    user_agent_counts = dataframe['cs(User-Agent)'].value_counts()
    print('user agent counts\n', user_agent_counts)
    if use_log:
        user_agent_counts = np.log2(
            user_agent_counts
        )
        

    # calculate the average number of requests associated with a distinct user agent
    mean_requests = user_agent_counts.mean()

    # get dispersion of that statistic via stdev
    std_requests = user_agent_counts.std()

    # Identify the most common user agent (TODO: generalize by browser or ???)
    most_prolific_ua = user_agent_counts.idxmax()

    # Calculate the z-score for the most common user agent
    z_score = (user_agent_counts[most_prolific_ua] - mean_requests) / std_requests

    # check for suspish user agent strings
    top10 = []
    for agent in list(user_agent_counts.head(10).index):
        top10.append(parse_user_agent(agent))

    return {
        # average number of requests associated with a distinct user agent
        'mean_request_count': mean_requests,

        # dispersion of that statistic via stdev
        'stdev_request_count': std_requests,

        # top 10 user agents
        'top_10': top10,
        'n_unique': dataframe['c-ip'].nunique(),
        'top': {
            'user_agent': most_prolific_ua,
            'request_count': user_agent_counts.max(),
            'z_score': z_score
        },
        'used_log': use_log
    }


### Example: view and visualize request statistics by IP in a log file

In [None]:
# load log file for a given day
oct4 = log_to_df(r'd:/metro/logs/202210/u_ex221004.log')

# get and view ip stats
stats = ip_request_statistics(oct4)
print(
    pprint.pformat(stats)
)

In [None]:
oc4_mm = oct4[
    (oct4['cs-uri-stem'].str.contains('rlisapi')) & (oct4['cs(Referer)'].str.contains('metromap'))
    ]
oc4_mm['cs-uri-stem']



In [None]:
# look at log-transformed request counts per IP stats
x_axis = np.arange(
    0,
    stats['top']['request_count']
)

plt.plot(
    x_axis,
    norm.pdf(x_axis, stats['mean_request_count'],
             stats['stdev_request_count'])
)

plt.title('log-transformed distribution of requests per IP address\n(most prolific IP request count in red)')

# add most prolific IP
plt.axvline(x=stats['top']['request_count'], color='r', linestyle='--')

In [None]:
# standardize distribution to look at z-score
x_axis = np.arange(-3, 3, 0.001)
plt.plot(x_axis, norm.pdf(x_axis, 0, 1))
plt.title('z-score of highest request count (red)')

# adding z-score
plt.axvline(x=stats['top']['z_score'], color='r', linestyle='--')

In [None]:
oct4_geocoded = geocode_ip(oct4, sample_size=10)

In [None]:
# who's comin to the site from where?
fig, ax = plt.subplots(figsize=(16, 16))



world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

world.plot(ax=ax, color='#ffffff', edgecolor='#6a6a6a', linewidth=.5)
oct4_geocoded.plot(ax=ax, color='red', markersize=1)
_=ax.axis('off')

# Show the plot
plt.show()

In [None]:
def check_svc_layer(dataframe):

    svcs = filter_services(dataframe)
    svcs_parsed = append_uri_path(svcs)

    return {
        '3rd value counts': svcs_parsed['3rd_value'].value_counts(),
        '4th value counts': svcs_parsed['4th_value'].value_counts()
    }

# batch_stats(check_svc_layer)
# df = logs_to_df(r'D:\metro\logs\202211')
# check_svc_layer(df)

batch_stats(check_svc_layer)

In [4]:
batch_stats(function=user_agent_statistics, just_mm_rlis=True)

202210.zip
202211.zip
202302.zip
202303.zip
202304.zip
drive-download-20230623T201911Z-001.zip
202210
		loading u_ex221001.log to dataframe
		loading u_ex221002.log to dataframe
		loading u_ex221003.log to dataframe
		loading u_ex221004.log to dataframe
		loading u_ex221005.log to dataframe
		loading u_ex221006.log to dataframe
		loading u_ex221007.log to dataframe
		loading u_ex221008.log to dataframe
		loading u_ex221009.log to dataframe
		loading u_ex221010.log to dataframe
		loading u_ex221011.log to dataframe
		loading u_ex221012.log to dataframe
		loading u_ex221013.log to dataframe
		loading u_ex221014.log to dataframe
		loading u_ex221015.log to dataframe
		loading u_ex221016.log to dataframe
		loading u_ex221017.log to dataframe
		loading u_ex221018.log to dataframe
		loading u_ex221019.log to dataframe
		loading u_ex221020.log to dataframe
		loading u_ex221021.log to dataframe
		loading u_ex221022.log to dataframe
		loading u_ex221023.log to dataframe
		loading u_ex221024.log

In [17]:
# get metromaps/rlis specific df
df = logs_to_df(r'd:/metro/logs/202211')

# df = df.head(10)

# apply the parse user agent func and ascribe is_bot, browser, os fields to dataframe
df[['is_bot']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['is_bot']))
df[['os']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['device']['os']))
df[['browser']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['browser']['family']))

# check value counts

		loading u_ex221101.log to dataframe
		loading u_ex221102.log to dataframe
		loading u_ex221103.log to dataframe
		loading u_ex221104.log to dataframe
		loading u_ex221105.log to dataframe
		loading u_ex221106.log to dataframe
		loading u_ex221107.log to dataframe
		loading u_ex221108.log to dataframe
		loading u_ex221109.log to dataframe
		loading u_ex221110.log to dataframe
		loading u_ex221111.log to dataframe
		loading u_ex221112.log to dataframe
		loading u_ex221113.log to dataframe
		loading u_ex221114.log to dataframe
		loading u_ex221115.log to dataframe
		loading u_ex221116.log to dataframe
		loading u_ex221117.log to dataframe
		loading u_ex221118.log to dataframe
		loading u_ex221119.log to dataframe
		loading u_ex221120.log to dataframe
		loading u_ex221121.log to dataframe
		loading u_ex221122.log to dataframe
		loading u_ex221123.log to dataframe
		loading u_ex221124.log to dataframe
		loading u_ex221125.log to dataframe
		loading u_ex221126.log to dataframe
		loading u_

In [16]:
print(len(df))

print(df['is_bot'].value_counts())
print(df['browser'].value_counts())
print(df['os']).value_counts()

10
is_bot
False    7
True     3
Name: count, dtype: int64
browser
Other        6
Googlebot    3
Chrome       1
Name: count, dtype: int64
0    Windows 
1      Other 
2      Other 
3      Other 
4      Other 
5      Other 
6      Other 
7      Other 
8      Other 
9      Other 
Name: os, dtype: object


AttributeError: 'NoneType' object has no attribute 'value_counts'

In [None]:
# get metromaps/rlis specific df
df = logs_to_df(r'd:/metro/logs/202211', filter_mm_rlis=True)

# df = df.head(10)

# apply the parse user agent func and ascribe is_bot, browser, os fields to dataframe
df[['is_bot']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['is_bot']))
df[['os']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['device']['os']))
df[['browser']] = df['cs(User-Agent)'].apply(lambda x: pd.Series(parse_user_agent(x)['browser']['family']))

# check value counts

		loading u_ex221101.log to dataframe
		loading u_ex221102.log to dataframe
		loading u_ex221103.log to dataframe
		loading u_ex221104.log to dataframe
		loading u_ex221105.log to dataframe
		loading u_ex221106.log to dataframe
		loading u_ex221107.log to dataframe
		loading u_ex221108.log to dataframe
		loading u_ex221109.log to dataframe
		loading u_ex221110.log to dataframe
		loading u_ex221111.log to dataframe
		loading u_ex221112.log to dataframe
		loading u_ex221113.log to dataframe
		loading u_ex221114.log to dataframe
		loading u_ex221115.log to dataframe
		loading u_ex221116.log to dataframe
		loading u_ex221117.log to dataframe
		loading u_ex221118.log to dataframe
		loading u_ex221119.log to dataframe
		loading u_ex221120.log to dataframe
		loading u_ex221121.log to dataframe
		loading u_ex221122.log to dataframe
		loading u_ex221123.log to dataframe
		loading u_ex221124.log to dataframe
		loading u_ex221125.log to dataframe
		loading u_ex221126.log to dataframe
		loading u_