## Initialize the Spark session and create the DataFrame

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Build a small demo DataFrame from two hard-coded, pipe-delimited log lines.

# Initialize Spark session
spark = SparkSession.builder.appName("LogData").getOrCreate()

# Sample log entries
log_entries = [
    '2023-12-21T11:38:36.197907 | INFO | req_10351 | session_5265 | user_383 | view_page | GET | /about | Referrer: https://www.bing.com | IP: 103.131.153.85 | Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) | Response Time: 0.75s | Product ID: prod_710 | Cart Size: 9 | Checkout Status: failed | Token: token_19632 | Auth Method: JWT | Auth Level: user | Correlation ID: corr_58741 | Server IP: 89.184.23.62 | Port: 7191 | Protocol: HTTP | 204 No Content | Detail: The server successfully processed the request.',
    '2023-12-21T15:10:52.761271 | INFO | req_62600 | session_7173 | user_398 | view_page | GET | /about | Referrer: https://www.google.com | IP: 233.32.143.163 | Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) | Response Time: 1.05s | Product ID: prod_533 | Cart Size: 10 | Checkout Status: completed | Token: token_46281 | Auth Method: Basic | Auth Level: user | Correlation ID: corr_83929 | Server IP: 170.111.70.153 | Port: 8725 | Protocol: HTTP | 200 OK | Detail: Request succeeded.'
]

# Define columns. The trailing HTTP status and the "Detail: ..." text share the
# single final column, "status_and_detail".
columns = [
    "timestamp", "log_level", "request_id", "session_id", "user_id", "action",
    "http_method", "url", "referrer_url", "ip_address", "user_agent", "response_time",
    "product_id", "cart_size", "checkout_status", "token", "auth_method", "auth_level",
    "correlation_id", "server_ip", "port_number", "protocol", "status_and_detail"
]

# Split each log entry on the delimiter, but at most len(columns) - 1 times.
# Every sample entry contains 24 " | "-separated parts while only 23 columns are
# declared, so an unlimited split leaves one value without a declared column
# name and "status_and_detail" would hold only the status. Capping the split
# keeps "<status> | Detail: ..." together as the last field.
log_entry_lists = [entry.split(" | ", len(columns) - 1) for entry in log_entries]

# Fail fast on a malformed entry instead of building a misaligned DataFrame.
for entry, entry_list in zip(log_entries, log_entry_lists):
    if len(entry_list) != len(columns):
        raise ValueError(
            f"Expected {len(columns)} fields but found {len(entry_list)} in log entry: {entry!r}"
        )

# Create a list of Row objects (positional; names are supplied via `columns`)
rows = [Row(*entry_list) for entry_list in log_entry_lists]

# Create a DataFrame
log_df = spark.createDataFrame(rows, columns)


## Create database

In [0]:
# Name of the database that will hold the log tables
database_name = 'formerSalamendars'

# SQL statements: create the database only if it is missing, then select it
create_database_sql = f"CREATE DATABASE IF NOT EXISTS {database_name}"
use_database_sql = f"USE {database_name}"

# Make sure the database exists
spark.sql(create_database_sql)

# Switch the session's current database to it
spark.sql(use_database_sql)

DataFrame[]

In [0]:
# Define the Hive table name and the storage location for its data files.
# NOTE(review): the database is spelled 'Salamendars' while the path uses
# 'salamenders' — confirm which spelling is intended before renaming either.
hive_table_name = 'formerSalamendars.logs'
table_location = '/mnt/former_salamenders/logs'

# Save the DataFrame as a Parquet table, overwriting any existing table data.
# The 'path' option tells the writer to store the table's files at
# table_location; previously table_location was defined but never passed to
# the writer, so it had no effect on where the table was stored.
(
    log_df.write
    .format('parquet')
    .mode('overwrite')
    .option('path', table_location)
    .saveAsTable(hive_table_name)
)