In [19]:
!pip3 install rake-nltk
!pip3 install summa
!pip3 install nltk
!pip3 install gensim
!pip3 install pandas
!pip3 install Counter
!pip3 install keywords



In [20]:
from rake_nltk import Rake
from summa import keywords
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from collections import Counter
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import csv
from datetime import datetime, timedelta
import re

In [21]:
# Access-log timestamp -> Unix epoch seconds
def convert_gmt_to_epoch(gmt_datetime_str):
    """Turn an access-log timestamp into whole seconds since the Unix epoch.

    Args:
        gmt_datetime_str: Text such as ``07/Nov/2023:10:15:32 +0000``
            (day/abbreviated month/year:time followed by a UTC offset).

    Returns:
        The timestamp as an ``int``; the fractional part is dropped.

    Raises:
        ValueError: If the text does not follow the expected layout.
    """
    log_time_format = "%d/%b/%Y:%H:%M:%S %z"
    parsed_moment = datetime.strptime(gmt_datetime_str, log_time_format)
    epoch_seconds = parsed_moment.timestamp()
    return int(epoch_seconds)

In [22]:
# HTTP method name -> integer code.
# The code of a method is its position in the tuple below (GET -> 0 ... PROPFIND -> 9).
http_method_mapping = {
    method_name: method_code
    for method_code, method_name in enumerate((
        'GET',
        'HEAD',
        'POST',
        'PUT',
        'DELETE',
        'CONNECT',
        'OPTIONS',
        'TRACE',
        'PATCH',
        'PROPFIND',
    ))
}

In [23]:
# Split an HTTP request line into a method code and a resource path
def parse_http_request(http_request):
    """Break a request line such as ``GET /index.html HTTP/1.1`` into its parts.

    Args:
        http_request: The request line text.

    Returns:
        A dict with ``'http_method'`` (the integer from ``http_method_mapping``,
        or ``-1`` for a method that is not listed there) and ``'resource_path'``
        (the text between the method and the ``HTTP/x.y`` suffix), or ``None``
        when the text does not start with ``<method> <path> HTTP/<d>.<d>``.
    """
    request_line_match = re.match(
        r'(?P<http_method>\w+) (?P<resource_path>.*?) HTTP/\d\.\d',
        http_request,
    )
    if request_line_match is None:
        return None

    parts = request_line_match.groupdict()
    return {
        'http_method': http_method_mapping.get(parts['http_method'], -1),
        'resource_path': parts['resource_path'],
    }

In [24]:
# Dotted IP address -> single zero-padded integer
def parse_ip_address(ip_address):
    """Encode a dotted IP address as one integer.

    Every dot-separated segment is left-padded with zeros to three characters,
    the segments are concatenated, and the resulting digit string is converted
    to ``int`` (so ``'192.168.1.1'`` becomes ``192168001001``).

    Raises:
        ValueError: If the concatenated text is not a valid integer literal.
    """
    digits = ''
    for octet in ip_address.split('.'):
        digits += octet.zfill(3)
    return int(digits)

In [25]:
# Parse the log lines
def parse_log_line(log_line):
    """Parse one access-log line into a flat dict of fields.

    Args:
        log_line: A single line of the log file.

    Returns:
        ``None`` when the line does not match the expected layout. Otherwise a
        dict with:

        * ``ip_address``  - integer produced by ``parse_ip_address``
        * ``datetime``    - epoch seconds produced by ``convert_gmt_to_epoch``
        * ``http_method`` - integer method code (absent when the request line
          could not be parsed by ``parse_http_request``)
        * ``resource_path`` - request path with any ``?query`` part removed
          (falls back to the whole request line, query removed, when the
          request line could not be parsed)
        * ``status_code``, ``bytes_sent``, ``user_agent`` - the matched text

    Note:
        The pattern only accepts lines whose referer field is exactly ``"-"``
        and whose byte count is numeric; every other line yields ``None``.
    """
    # Use regular expression to extract information from the log line
    pattern = re.compile(r'(?P<ip_address>\d+\.\d+\.\d+\.\d+) - - \[(?P<datetime>.*?)\] "(?P<http_request>.*?)" (?P<status_code>\d+) (?P<bytes_sent>\d+) "(-)" "(?P<user_agent>[^"]*)"')

    match = pattern.match(log_line)
    if not match:
        return None

    data = match.groupdict()

    # Padding of the IP address
    data['ip_address'] = parse_ip_address(data['ip_address'])

    # Convert the log timestamp to epoch seconds
    data['datetime'] = convert_gmt_to_epoch(data['datetime'])

    # The raw request line is not part of the result, so take it out of the dict
    http_request = data.pop('http_request')

    # Parse HTTP request
    http_request_info = parse_http_request(http_request)
    if http_request_info:
        data.update(http_request_info)

    # Clean the resource path: strip the query string from the *parsed* path.
    # Cleaning the raw request line instead would leave the HTTP method (and,
    # without a query string, the " HTTP/x.y" suffix) inside resource_path.
    raw_resource_path = data.get('resource_path', http_request)
    data['resource_path'] = re.sub(r'\?.*$', '', raw_resource_path)

    return data

In [26]:
# Final function that writes to the csv file
def write_to_csv(log_data, csv_file_path):
    """Write parsed log records to ``csv_file_path`` as a CSV file with a header.

    Args:
        log_data: Iterable of dicts as returned by ``parse_log_line``. Keys
            missing from a record are written as empty cells.
        csv_file_path: Destination path; an existing file is overwritten.

    Raises:
        ValueError: If a record contains a key that is not one of the columns
            (the default behaviour of ``csv.DictWriter``).
    """
    fieldnames = ['ip_address', 'datetime', 'http_method', 'resource_path', 'status_code', 'bytes_sent', 'user_agent']
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding and matches the default that pandas.read_csv uses to read it back.
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(log_data)

In [27]:
# Input log and output CSV locations
log_file_path = './ssl_access_log-20231107'
csv_file_path = './updated_ssl_access_log-20231107.csv'

# Parse the log line by line, keeping only the lines the parser recognised
with open(log_file_path, 'r') as log_file:
    log_data = [
        parsed_entry
        for parsed_entry in map(parse_log_line, log_file)
        if parsed_entry
    ]

# Persist the parsed records as CSV
write_to_csv(log_data, csv_file_path)

In [10]:
# Fetch the NLTK data packages, one after another, in a fixed order
for nltk_resource in ('averaged_perceptron_tagger', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hareeshsenthil/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hareeshsenthil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hareeshsenthil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load the parsed access-log CSV into a DataFrame
df = pd.read_csv('./updated_ssl_access_log-20231107.csv')

# Plain-list copies of the two text columns
user_agent, resource_path = (
    df[column_name].tolist() for column_name in ('user_agent', 'resource_path')
)

In [12]:
def extract_keywords_rake_user(text):
    """Split ``text`` on every ``/`` and return the pieces as a list.

    Despite the name, no RAKE extraction happens here: the result is simply
    the ``/``-separated parts of the input, in order.
    """
    slash_separated_parts = text.split('/')
    return slash_separated_parts

def extract_keywords_rake(text):
    """Run RAKE keyword extraction over ``text`` and return its ranked phrases.

    A fresh ``Rake`` instance with default settings is created on every call.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases

In [13]:
# Add the extracted keywords as new columns.
# Each column is built in one pass with Series.map instead of iterrows() plus
# per-cell df.at writes, so the columns exist even when the frame has no rows.
# Values go through str() first, so missing values become the text 'nan'.
df['user_keywords'] = df['user_agent'].map(
    lambda agent: ', '.join(extract_keywords_rake_user(str(agent)))
)
df['resource_keywords'] = df['resource_path'].map(
    lambda path: ', '.join(extract_keywords_rake(str(path)))
)

In [None]:
# Tokenizer: the first alternative keeps "word-word" / "word,word" together as
# one token; anything else falls back to plain runs of word characters.
custom_tokenizer = RegexpTokenizer(r'\w+[-,]\w+|\w+')


def _lowercase_tokens(keyword_text):
    """Lower-case ``keyword_text`` and split it with ``custom_tokenizer``."""
    return custom_tokenizer.tokenize(keyword_text.lower())


def _known_word_vectors(tokens, model):
    """Return the vectors of those ``tokens`` that are in ``model``'s vocabulary."""
    return [model.wv[token] for token in tokens if token in model.wv]


# Tokenize both keyword columns
df['tokenized_user_keywords'] = df['user_keywords'].apply(_lowercase_tokens)
df['tokenized_resource_keywords'] = df['resource_keywords'].apply(_lowercase_tokens)

# Train one Word2Vec model per keyword column, both with the same settings
word2vec_settings = dict(vector_size=1, window=1, min_count=1, workers=1)
model_user_keywords = Word2Vec(df['tokenized_user_keywords'], **word2vec_settings)
model_resource_keywords = Word2Vec(df['tokenized_resource_keywords'], **word2vec_settings)

# Per row: the list of vectors for that row's tokens
df['user_keywords_embeddings'] = df['tokenized_user_keywords'].apply(
    lambda tokens: _known_word_vectors(tokens, model_user_keywords)
)
df['resource_keywords_embeddings'] = df['tokenized_resource_keywords'].apply(
    lambda tokens: _known_word_vectors(tokens, model_resource_keywords)
)

In [33]:
# Save the DataFrame, including the keyword, token and embedding columns added
# above, to a new CSV file. index=False leaves the row index out of the file.
# NOTE(review): the token/embedding columns hold Python lists; they are presumably
# written as their text representation and will not load back as lists — confirm.
df.to_csv('./final.csv', index=False)