In [6]:
combined_regex = r'^(?P<client>\S+) \S+ (?P<userid>\S+) \[(?P<datetime>[^\]]+)\] "(?P<method>[A-Z]+) (?P<path>[^ "]+)? HTTP/[0-9.]+" (?P<status>[0-9]{3}) (?P<size>[0-9]+|-) "(?P<referrer>[^"]*)" "(?P<useragent>[^"]*)'
columns = ['client', 'userid', 'datetime', 'method', 'path', 'status', 'size', 'referer', 'user_agent']

In [7]:
import psycopg2
from dotenv import load_dotenv
import os

load_dotenv()
conn = psycopg2.connect(os.getenv("CONNECTION_URL"))

In [8]:
def logdb(log_args):
    """Logs request and response in the PostgreSQL database following the Common Log Format extended
    extended with user-agent and referer."""
    with conn.cursor() as cursor:
        try:
            cursor.execute(
                """
                INSERT INTO server_log
                VALUES (%s, to_timestamp(%s, 'DD/Mon/YYYY:HH:MI:SS TZHTZM'), %s, %s, %s, %s, %s, %s)
                """,
                (
                    log_args[0],
                    log_args[2],
                    log_args[3],
                    log_args[4],
                    log_args[5],
                    log_args[6],
                    log_args[7],
                    log_args[8]
                )
            )
        except Exception as e:
            print("Exceptiion: ", e)
            conn.rollback()
        else:
            conn.commit()


In [9]:
import re
from tqdm import tqdm


def log(logfile, errors_file):
    with open(logfile) as source_file:
        linenumber = 1
        for line in tqdm(source_file, total=200_000):
            try:
                log_line = re.findall(combined_regex, line)[0]
                logdb(log_line)
                
            except Exception as e:
                print(f"Exception: {e} at line {linenumber}")
                with open(errors_file, 'at') as errfile:
                    print((line, str(e)), file=errfile)
                continue
            linenumber += 1
            if linenumber == 200_000:
                break

In [10]:
log("/home/suchitg/Desktop/access.log", "errors.txt")

  7%|▋         | 14745/200000 [00:15<03:11, 964.93it/s] 

Exception: list index out of range at line 14593


 16%|█▋        | 32726/200000 [00:33<02:53, 966.34it/s] 

Exception: list index out of range at line 32543


 21%|██        | 41365/200000 [00:41<02:21, 1119.41it/s]

Exception: list index out of range at line 41176


 68%|██████▊   | 136458/200000 [04:30<02:49, 375.08it/s]

Exception: list index out of range at line 136408


 78%|███████▊  | 156425/200000 [05:28<02:15, 321.07it/s]

Exception: list index out of range at line 156364


200003it [07:33, 440.59it/s]                            
