In [1]:
import pandas as pd
import numpy as np
import gzip
import urllib.request
from datetime import datetime
import re

# Downloading the Calgary HTTP dataset
url = "ftp://ita.ee.lbl.gov/traces/calgary_access_log.gz"
filename = "calgary_access_log.gz"

# Download the file only if it's not already present locally
try:
    urllib.request.urlretrieve(url, filename)
    print("Dataset downloaded successfully.")
except Exception as e:
    print(f"Error downloading the file: {e}")


Dataset downloaded successfully.


In [2]:
# Reading the compressed .gz file and storing log entries in a list
log_entries = []

try:
    with gzip.open(filename, 'rt', errors='ignore') as f:
        for line in f:
            log_entries.append(line.strip())
    print(f"Total lines loaded: {len(log_entries)}")
except Exception as e:
    print(f"Error reading the compressed file: {e}")


Total lines loaded: 726739


In [3]:
# Defining a function to parse a single log line
def parse_log_line(line):
    pattern = r'(\S+) (\S+) (\S+) \[([^\]]+)\] "(.*?)" (\d{3}) (\S+)'
    match = re.match(pattern, line)
    
    if not match:
        return None  # Malformed line

    remotehost, rfc931, authuser, date_str, request, status, bytes_sent = match.groups()

    # Convert timestamp to datetime object
    try:
        timestamp = datetime.strptime(date_str, "%d/%b/%Y:%H:%M:%S %z")
    except:
        return None

    # Parse request line into method, filename, and protocol
    try:
        method, filename, protocol = request.split()
    except:
        method, filename, protocol = None, None, None

    # Convert bytes to int or set to 0 if "-"
    bytes_sent = int(bytes_sent) if bytes_sent.isdigit() else 0

    return {
        'remotehost': remotehost,
        'rfc931': rfc931,
        'authuser': authuser,
        'timestamp': timestamp,
        'method': method,
        'filename': filename,
        'protocol': protocol,
        'status': int(status),
        'bytes': bytes_sent
    }


In [4]:
# Applying the parser to all log lines
parsed_logs = [parse_log_line(line) for line in log_entries]
parsed_logs = [entry for entry in parsed_logs if entry is not None]

# Creating a DataFrame from parsed log entries
df = pd.DataFrame(parsed_logs)
print("Parsed log entries:", len(df))
df.head()


Parsed log entries: 724910


Unnamed: 0,remotehost,rfc931,authuser,timestamp,method,filename,protocol,status,bytes
0,local,-,-,1994-10-24 13:41:41-06:00,GET,index.html,HTTP/1.0,200,150
1,local,-,-,1994-10-24 13:41:41-06:00,GET,1.gif,HTTP/1.0,200,1210
2,local,-,-,1994-10-24 13:43:13-06:00,GET,index.html,HTTP/1.0,200,3185
3,local,-,-,1994-10-24 13:43:14-06:00,GET,2.gif,HTTP/1.0,200,2555
4,local,-,-,1994-10-24 13:43:15-06:00,GET,3.gif,HTTP/1.0,200,36403


In [5]:
# Function to extract file extension
def get_extension(filename):
    if filename and '.' in filename:
        return filename.split('.')[-1].lower()
    return 'none'  # for cases like '/' or empty

# Apply to dataframe
df['file_extension'] = df['filename'].apply(get_extension)

# Show unique extensions as a sanity check
print("Sample file extensions:", df['file_extension'].unique()[:10])
df.head()


Sample file extensions: ['html' 'gif' 'xbm' 'aiff' 'rgb' 'ps' 'none' 'txt' 'tiff' 'mpeg']


Unnamed: 0,remotehost,rfc931,authuser,timestamp,method,filename,protocol,status,bytes,file_extension
0,local,-,-,1994-10-24 13:41:41-06:00,GET,index.html,HTTP/1.0,200,150,html
1,local,-,-,1994-10-24 13:41:41-06:00,GET,1.gif,HTTP/1.0,200,1210,gif
2,local,-,-,1994-10-24 13:43:13-06:00,GET,index.html,HTTP/1.0,200,3185,html
3,local,-,-,1994-10-24 13:43:14-06:00,GET,2.gif,HTTP/1.0,200,2555,gif
4,local,-,-,1994-10-24 13:43:15-06:00,GET,3.gif,HTTP/1.0,200,36403,gif


In [6]:
##Questions And Answers

In [7]:
# Q1: Total number of log records
total_log_records = len(df)
print("Total log records:", total_log_records)


Total log records: 724910


In [8]:
# Q2: Number of unique hosts (remotehost)
unique_hosts = df['remotehost'].nunique()
print("Unique hosts:", unique_hosts)


Unique hosts: 2


In [9]:
# Convert 'timestamp' column to datetime type
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Check if conversion worked
print(df['timestamp'].head())


0   1994-10-24 13:41:41-06:00
1   1994-10-24 13:41:41-06:00
2   1994-10-24 13:43:13-06:00
3   1994-10-24 13:43:14-06:00
4   1994-10-24 13:43:15-06:00
Name: timestamp, dtype: datetime64[ns, UTC-06:00]


In [10]:
#Q3 Convert timestamp to date string in 'dd-MMM-yyyy' format
df['date_str'] = df['timestamp'].dt.strftime('%d-%b-%Y')

# Group by date and count unique filenames
unique_files_per_date = df.groupby('date_str')['filename'].nunique().to_dict()

# Sample output
for date, count in list(unique_files_per_date.items())[:5]:
    print(f"{date}: {count}")


01-Aug-1995: 669
01-Jul-1995: 387
01-Jun-1995: 590
01-May-1995: 467
01-Oct-1995: 552


In [11]:

# Q4: Count how many HTTP requests resulted in a 404 error
count_404 = df[df['status'] == 404].shape[0]
print("Number of 404 responses:", count_404)


Number of 404 responses: 23586


In [12]:
#q5 Filter 404 errors and count filenames
top_404_files = (
    df[df['status'] == 404]
    .groupby('filename')
    .size()
    .sort_values(ascending=False)
    .head(15)
    .reset_index(name='count')
)

print(top_404_files)


      filename  count
0   index.html   4694
1    4115.html    902
2    1611.html    649
3     5698.xbm    585
4      710.txt    408
5    2002.html    258
6     2177.gif    193
7     10695.ps    161
8    6555.html    153
9      487.gif    152
10    151.html    149
11     40.html    148
12    3414.gif    148
13     488.gif    148
14    9678.gif    142


In [13]:
#q6 Top 15 file extensions causing 404 errors
top_404_exts = (
    df[df['status'] == 404]
    .groupby('file_extension')
    .size()
    .sort_values(ascending=False)
    .head(15)
    .reset_index(name='count')
)

print(top_404_exts)


   file_extension  count
0            html  12145
1             gif   7337
2             xbm    824
3              ps    754
4             jpg    531
5             txt    508
6            none    141
7             htm    108
8             cgi     77
9            gif"     47
10            com     45
11              z     41
12            dvi     40
13           com/     37
14             ca     36


In [14]:
#q7 Filter for July 1995 dates
july_mask = df['timestamp'].dt.strftime('%b-%Y') == 'Jul-1995'

# Remove rows where bytes is '-' or missing, convert to int
df_july = df[july_mask].copy()
df_july = df_july[df_july['bytes'].apply(lambda x: str(x).isdigit())]
df_july['bytes'] = df_july['bytes'].astype(int)

# Group by date and sum bytes
bandwidth_per_day = df_july.groupby(df_july['timestamp'].dt.strftime('%d-%b-%Y'))['bytes'].sum().to_dict()

print(bandwidth_per_day)


{'01-Jul-1995': 11349799, '02-Jul-1995': 8656918, '03-Jul-1995': 13596612, '04-Jul-1995': 26573988, '05-Jul-1995': 19541225, '06-Jul-1995': 19755015, '07-Jul-1995': 9427822, '08-Jul-1995': 5403491, '09-Jul-1995': 4660556, '10-Jul-1995': 14917754, '11-Jul-1995': 22507207, '12-Jul-1995': 17367065, '13-Jul-1995': 15989234, '14-Jul-1995': 19186430, '15-Jul-1995': 15773233, '16-Jul-1995': 9016378, '17-Jul-1995': 19601338, '18-Jul-1995': 17099761, '19-Jul-1995': 17851725, '20-Jul-1995': 20752623, '21-Jul-1995': 25491617, '22-Jul-1995': 8136259, '23-Jul-1995': 9593870, '24-Jul-1995': 22308265, '25-Jul-1995': 24561635, '26-Jul-1995': 24995540, '27-Jul-1995': 25969995, '28-Jul-1995': 36460693, '29-Jul-1995': 11700624, '30-Jul-1995': 23189598, '31-Jul-1995': 30730715}


In [15]:
#q8 Extract hour from timestamp
df['hour'] = df['timestamp'].dt.hour

# Count requests per hour
hourly_requests = df['hour'].value_counts().sort_index().to_dict()

print(hourly_requests)


{0.0: 11598, 1.0: 9913, 2.0: 9403, 3.0: 8147, 4.0: 7832, 5.0: 8283, 6.0: 9798, 7.0: 11930, 8.0: 17351, 9.0: 21683, 10.0: 25717, 11.0: 28665, 12.0: 26845, 13.0: 30089, 14.0: 29792, 15.0: 28149, 16.0: 28287, 17.0: 23332, 18.0: 17862, 19.0: 17325, 20.0: 17492, 21.0: 15969, 22.0: 14588, 23.0: 13613}


In [16]:
# q9 Top 10 most requested filenames
top_10_filenames = df['filename'].value_counts().head(10)
top_10_list = list(top_10_filenames.items())

print(top_10_list)


[('index.html', 139528), ('3.gif', 24006), ('2.gif', 23595), ('4.gif', 8018), ('244.gif', 5148), ('5.html', 5010), ('4097.gif', 4874), ('8870.jpg', 4492), ('6733.gif', 4278), ('8472.gif', 3843)]


In [17]:
#q10 HTTP response code distribution
status_code_counts = df['status'].value_counts().to_dict()

print(status_code_counts)


{200: 568348, 304: 97792, 302: 30295, 404: 23586, 403: 4743, 401: 46, 501: 43, 500: 42, 400: 15}
