## Dependencies

In [None]:
!pip install faker
!pip install openai
!pip install elasticsearch
!pip install sklearn

## Log generation functions

In [None]:
import ast
import json
import random
from datetime import timezone

import openai
from elasticsearch import Elasticsearch, helpers
from faker import Faker
from sklearn.feature_extraction.text import TfidfVectorizer

# Shared Faker instance used by the generate_* functions to produce random field values
fake = Faker()

# 1. Apache HTTP Server (Common Log Format)
def generate_apache_log():
    """Return one synthetic Apache access-log line in Common Log Format.

    The line has the shape
    ``host - - [timestamp] "METHOD uri HTTP/1.1" status size`` where every
    field except the protocol is randomly generated. The status code is one
    of 200, 404 or 500 and the response size is between 100 and 10000.

    Returns:
        str: A single log line (no trailing newline).
    """
    return '{RemoteHost} - - [{Timestamp}] "{RequestMethod} {RequestURI} {Protocol}" {StatusCode} {ResponseSize}'.format(
        RemoteHost=fake.ipv4(),
        # The datetime must be timezone-aware: on a naive datetime ``%z``
        # expands to an empty string, which would leave the timestamp
        # without its UTC offset and with a trailing space.
        Timestamp=fake.date_time_this_year(tzinfo=timezone.utc).strftime('%d/%b/%Y:%H:%M:%S %z'),
        RequestMethod=fake.http_method(),
        RequestURI=fake.uri(),
        Protocol='HTTP/1.1',
        StatusCode=random.choice([200, 404, 500]),
        ResponseSize=random.randint(100, 10000)
    )

# 2. Nginx (Combined Log Format)
def generate_nginx_log():
    """Return one synthetic Nginx access-log line in Combined Log Format.

    The line has the shape
    ``addr - user [timestamp] "METHOD uri HTTP/1.1" status size "referer" "user-agent"``.
    The remote user is always ``-``; the status code is one of 200, 404 or
    500 and the response size is between 100 and 10000.

    Returns:
        str: A single log line (no trailing newline).
    """
    return '{RemoteAddress} - {RemoteUser} [{Timestamp}] "{RequestMethod} {RequestURI} {Protocol}" {StatusCode} {ResponseSize} "{Referer}" "{UserAgent}"'.format(
        RemoteAddress=fake.ipv4(),
        RemoteUser='-',
        # The datetime must be timezone-aware: on a naive datetime ``%z``
        # expands to an empty string, which would leave the timestamp
        # without its UTC offset and with a trailing space.
        Timestamp=fake.date_time_this_year(tzinfo=timezone.utc).strftime('%d/%b/%Y:%H:%M:%S %z'),
        RequestMethod=fake.http_method(),
        RequestURI=fake.uri(),
        Protocol='HTTP/1.1',
        StatusCode=random.choice([200, 404, 500]),
        ResponseSize=random.randint(100, 10000),
        Referer=fake.uri(),
        UserAgent=fake.user_agent()
    )

# 3. Syslog (RFC 5424)
def generate_syslog():
    """Return one synthetic syslog message laid out like an RFC 5424 record.

    The line has the shape
    ``<PRI>1 timestamp hostname app-name procid msgid - message``.
    Priority is a random integer between 1 and 191, ProcID and MsgID are
    random four-digit numbers, and the structured-data field is always ``-``.

    Returns:
        str: A single syslog line (no trailing newline).
    """
    return '<{Priority}>{Version} {Timestamp} {Hostname} {AppName} {ProcID} {MsgID} {StructuredData} {Message}'.format(
        Priority=random.randint(1, 191),
        Version=1,
        # RFC 5424 timestamps must carry a time offset; a timezone-aware
        # datetime makes isoformat() append "+00:00".
        Timestamp=fake.date_time_this_year(tzinfo=timezone.utc).isoformat(),
        Hostname=fake.hostname(),
        AppName=fake.word(),
        ProcID=random.randint(1000, 9999),
        MsgID=random.randint(1000, 9999),
        StructuredData='-',
        Message=fake.sentence()
    )

# 4. AWS CloudTrail
def generate_aws_cloudtrail_log():
    """Return one synthetic AWS CloudTrail record as a JSON string.

    The record always describes an ``s3.amazonaws.com`` ``GetObject`` call in
    ``us-east-1`` made by an IAM user; the user name, event time, source IP,
    user agent, request/event IDs and account ID are randomly generated.

    Returns:
        str: A single-line JSON document.
    """
    record = {
        "eventVersion": "1.08",
        "userIdentity": {"type": "IAMUser", "userName": fake.user_name()},
        "eventTime": fake.date_time_this_year().isoformat(),
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "awsRegion": "us-east-1",
        "sourceIPAddress": fake.ipv4(),
        "userAgent": fake.user_agent(),
        "requestParameters": {"key": "value"},
        "responseElements": {"key": "value"},
        "requestID": fake.uuid4(),
        "eventID": fake.uuid4(),
        "eventType": "AwsApiCall",
        # fix_len=True guarantees exactly 12 digits; without it the number
        # may come out shorter than an AWS account ID.
        "recipientAccountId": str(fake.random_number(digits=12, fix_len=True)),
    }
    # Serialising through json.dumps (instead of string formatting) keeps the
    # output valid JSON even if a generated value contains quotes or
    # backslashes. The default separators match the original layout.
    return json.dumps(record, ensure_ascii=False)

# 5. Microsoft Windows Event Log
def generate_windows_event_log():
    """Build one fake Windows Event Log record as a single-line XML string.

    The provider name, source name, computer and message are random Faker
    values; the event ID is a random integer in 1000-9999 and the level a
    random integer in 1-5.

    Returns:
        str: The ``<Event>...</Event>`` XML fragment.
    """
    template = '<Event xmlns="http://schemas.microsoft.com/win/2004/08/events/event"><System><Provider Name="{ProviderName}"/><EventID>{EventID}</EventID><Level>{Level}</Level><TimeCreated SystemTime="{Timestamp}"/><SourceName>{SourceName}</SourceName><Computer>{Computer}</Computer></System><EventData>{Message}</EventData></Event>'
    # Values are produced in the same order as the placeholders appear.
    values = {
        'ProviderName': fake.word(),
        'EventID': random.randint(1000, 9999),
        'Level': random.randint(1, 5),
        'Timestamp': fake.date_time_this_year().isoformat(),
        'SourceName': fake.word(),
        'Computer': fake.hostname(),
        'Message': fake.sentence(),
    }
    return template.format(**values)

# 6. Linux Audit Log
def generate_linux_audit_log():
    """Build one fake Linux audit-style line.

    The line has the shape ``type=<word> msg=audit(<iso timestamp>): <sentence>``
    with all three variable parts taken from Faker.

    Returns:
        str: A single log line (no trailing newline).
    """
    template = 'type={AuditType} msg=audit({Timestamp}): {Message}'
    values = {
        'AuditType': fake.word(),
        'Timestamp': fake.date_time_this_year().isoformat(),
        'Message': fake.sentence(),
    }
    return template.format(**values)

def generate_logs(sources, total_logs, random_logs):
    """Generate synthetic log lines spread across the requested sources.

    ``total_logs`` is split as evenly as possible between the sources; when
    it does not divide evenly, the leftover lines are handed out one per
    source so that exactly ``total_logs`` lines are always produced.

    Args:
        sources: Sequence of source names. Each must be one of ``'apache'``,
            ``'nginx'``, ``'syslog'``, ``'aws_cloudtrail'``,
            ``'windows_event'`` or ``'linux_audit'``.
        total_logs: Total number of log lines to generate.
        random_logs: When truthy, the per-source counts are shuffled so the
            sources receiving an extra line are chosen at random; otherwise
            the first sources in ``sources`` receive the extra lines.

    Returns:
        list[str]: The generated lines, grouped by source in the order the
        sources were given. Empty when ``sources`` is empty.

    Raises:
        KeyError: If a name in ``sources`` is not a known source.
    """
    # Mapping source names to their corresponding log generation functions
    source_to_function = {
        'apache': generate_apache_log,
        'nginx': generate_nginx_log,
        'syslog': generate_syslog,
        'aws_cloudtrail': generate_aws_cloudtrail_log,
        'windows_event': generate_windows_event_log,
        'linux_audit': generate_linux_audit_log,
    }

    # Nothing can be generated without a source (and dividing by zero below
    # would otherwise raise).
    if not sources:
        return []

    # Resolve every generator first so an unknown name fails before any
    # log line is produced.
    log_functions = [source_to_function[source] for source in sources]

    # Split the total evenly; the remainder is always distributed so that no
    # requested log is silently dropped when random_logs is falsy.
    num_sources = len(log_functions)
    base_count, remainder = divmod(total_logs, num_sources)
    logs_per_source = [base_count + 1] * remainder + [base_count] * (num_sources - remainder)
    if random_logs:
        random.shuffle(logs_per_source)

    generated_logs = []
    for log_function, num_logs in zip(log_functions, logs_per_source):
        generated_logs.extend(log_function() for _ in range(num_logs))

    return generated_logs



## Log Expansion

In [None]:
# Example usage: generate a small batch of synthetic logs to explain
sources_to_use = ['apache']
total_logs_to_generate = 15
random_logs_per_source = True
logs = generate_logs(sources_to_use, total_logs_to_generate, random_logs_per_source)

# The whole batch is sent as one JSON array so the model can answer with a
# single array holding one explanation per log line.
stringifiedPromptsArray = json.dumps(logs)

print("Logs: ")
print(logs)

batchInstruction = {
    "role":
    "system",
    "content":
    "Explain what happened for each log line of the array. Return a python array of the explanation. Only the array, no text around it or any extra comment, nothing else than the array should be in the answer. Don't forget in your completion to give the day, date and year of the log. Interpret some of the log content if you can, for example you have to translate what an error code 500."
}

# The system instruction goes first so it frames the user message that
# carries the logs, as is conventional for chat completions.
prompts = [
    batchInstruction,
    {
        "role": "user",
        "content": stringifiedPromptsArray
    }
]
print("ChatGPT: ")


# Define the OpenAI API key (replace the placeholder with a real key)
openai_api_key = "OPENAI_API_KEY"

# Initialize the OpenAI API client
openai.api_key = openai_api_key

stringifiedBatchCompletion = openai.ChatCompletion.create(model="gpt-3.5-turbo",
                                         messages=prompts,
                                         max_tokens=1000)
completion_text = stringifiedBatchCompletion.choices[0].message.content
print(completion_text)

# The model is asked for a Python list literal; ast.literal_eval parses it
# without executing code. A truncated or chatty reply is reported with a
# clear message instead of a bare parser error.
try:
    batchCompletion = ast.literal_eval(completion_text)
except (ValueError, SyntaxError) as err:
    raise ValueError(
        "The model reply is not a valid Python list literal "
        "(it may have been truncated by max_tokens)."
    ) from err

if not isinstance(batchCompletion, (list, tuple)):
    raise ValueError("The model reply parsed to %s, expected a list." % type(batchCompletion).__name__)

# Explanations are later paired with logs by position, so a count mismatch
# means some pairs will be wrong or missing.
if len(batchCompletion) != len(logs):
    print("WARNING: received %d explanations for %d logs." % (len(batchCompletion), len(logs)))



## Log Vectorization

In [None]:

# Initialize the Elasticsearch client (replace the placeholders with real values)
# NOTE(review): verify_certs=False disables TLS certificate validation; only
# acceptable against a throwaway demo cluster.
es = Elasticsearch(
    ['ELASTIC_CLUSTER_HOSTNAME:ELASTIC_CLUSTER_PORT'],
    basic_auth=('ELASTIC_USERNAME', 'ELASTIC_PASSWORD'),
    verify_certs=False
)

# Define the index configuration
# NOTE(review): dims presumably matches the output size of the embedding
# model used by the "vectorize-log" ingest pipeline; confirm.
index_config = {
    "mappings": {
        "properties": {
            "description_vectorized": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Create the index only when it is missing, so re-running this cell does not
# fail with "resource already exists".
if not es.indices.exists(index='logs'):
    response = es.indices.create(index='logs', body=index_config)

# Build one bulk action per explanation; each explanation is stored together
# with the raw log line found at the same position in `logs`.
bulk_index_body = [
    {
        "_index": "logs",
        "pipeline": "vectorize-log",
        "_source": {
            "text_field": log, "log": logs[index]
        }
    }
    for index, log in enumerate(batchCompletion)
]

# Show the actions that are about to be sent
print("Bulk request: ")
print(bulk_index_body)

# Top-level notebook boundary: report any indexing failure instead of
# aborting the cell with a traceback.
try:
    response = helpers.bulk(es, bulk_index_body)
    print ("\nRESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)


## Semantic Search

In [None]:

# Search the Elasticsearch 'logs' index and return the best-matching explanation
def ESSearch(query_text):
    """Run a kNN search on the ``logs`` index and return the top explanation.

    The query text is embedded server-side through ``query_vector_builder``
    and matched against the ``description_vectorized`` field; a filter
    restricts results to documents that actually have that field.

    Args:
        query_text: Natural-language question to search for.

    Returns:
        The first ``text_field`` value of the best hit, or ``None`` when the
        search returns no hits.
    """
    # Only documents that carry a vector can take part in the search
    query = {
        "bool": {
            "filter": [{
                "exists": {
                    "field": "description_vectorized"
                }
            }]
        }
    }

    # kNN clause: the model named below turns query_text into the query vector
    knn = {
        "field": "description_vectorized",
        "k": 1,
        "num_candidates": 20,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": "sentence-transformers__all-distilroberta-v1",
                "model_text": query_text
            }
        },
        "boost": 24
    }

    fields = ["text_field"]
    index = 'logs'
    resp = es.search(index=index,
                     query=query,
                     knn=knn,
                     fields=fields,
                     size=1,
                     source=False)

    # An empty result set would otherwise raise IndexError on hits[0]
    hits = resp['hits']['hits']
    if not hits:
        print("No matching log found.")
        return None

    best_match = hits[0]['fields']['text_field'][0]
    print(best_match)
    return best_match


# Example: ask a natural-language question about the indexed log explanations
ESSearch("Were there any error in March?")