In [49]:
import numpy as np #A library for numerical computing (arrays, math functions, etc.).

In [50]:
import pandas as pd #Library for data analysis & manipulation (DataFrames, tables).

In [51]:
import matplotlib.pyplot as plt  #Library for plotting graphs.


In [52]:
from sklearn.model_selection import train_test_split
#Module for splitting datasets, cross-validation, etc. and Function that splits data into training and testing sets.

In [53]:
from sklearn.linear_model import LinearRegression
# Class to create a linear regression model.

In [54]:
from sklearn.metrics import mean_squared_error, r2_score #Measures how far predictions are from actual values.and Tools for measuring model accuracy.


In [55]:
from sklearn.preprocessing import StandardScaler #Tools for preparing data before training.


In [56]:
from sklearn.pipeline import Pipeline #Allows chaining multiple steps together (scaling + modeling) into one object.


In [57]:
pd.set_option("display.max_columns", None) #How many columns to show when printing a DataFrame.


In [58]:
pd.set_option("display.precision", 3) #How many decimals to show


In [59]:

DATA_PATH = "Housing_modified.csv"

In [60]:
df = pd.read_csv(DATA_PATH) #Pandas function to read CSV files into a DataFrame.


In [61]:
print("Shape (rows, columns):", df.shape) #A tuple (rows, columns) showing how many rows and columns your DataFrame has.


Shape (rows, columns): (545, 13)


In [62]:
display(df.head(545)) #Shows the 545 rows of the DataFrame to preview your data.


Unnamed: 0,guestroom,hotwaterheating,parking,area,furnishingstatus,mainroad,airconditioning,bathrooms,price,basement,stories,prefarea,bedrooms
0,no,no,1,5900,unfurnished,no,no,2,4045214,yes,2,no,4
1,no,no,0,6500,furnished,yes,yes,2,6536696,no,3,yes,3
2,no,no,0,4040,semi-furnished,yes,no,1,3693404,no,1,no,2
3,no,no,0,5000,semi-furnished,yes,yes,1,6342007,no,2,no,3
4,no,no,0,3960,furnished,yes,no,1,2765070,no,1,no,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,no,no,0,6000,unfurnished,yes,yes,2,6673593,no,4,no,4
541,no,no,0,5450,semi-furnished,yes,yes,2,6137918,yes,1,yes,4
542,no,yes,1,4500,furnished,yes,no,2,4393705,no,3,no,3
543,no,no,0,4040,unfurnished,yes,no,1,3316448,no,1,no,2


In [63]:
print("\nData types:") 
display(df.dtypes) #df.dtypes → Lists the data type (int64, float64, object, etc.) of each column.



Data types:


guestroom           object
hotwaterheating     object
parking              int64
area                 int64
furnishingstatus    object
mainroad            object
airconditioning     object
bathrooms            int64
price                int64
basement            object
stories              int64
prefarea            object
bedrooms             int64
dtype: object

Code Part	Why It’s Used
DATA_PATH	Stores the dataset location in one place for easy changes.
pd.read_csv()	Loads CSV data into pandas DataFrame.
df.shape	Checks dataset size quickly.
df.head()	Peeks at the first few rows to understand structure.
df.dtypes	Shows what type of data each column has.
df.isna().sum()	Finds missing values in each column.
display()	Makes output tables look clean in notebooks.
print()	Adds clear labels to the output.

In [64]:
print("\nMissing values per column:") #\n adds spacing in output.
display(df.isna().sum())  #Returns a DataFrame of True/False values showing where data is missing and Counts the number of True values per column (i.e., number of missing entries).
#Finds missing values in each column.


Missing values per column:


guestroom           0
hotwaterheating     0
parking             0
area                0
furnishingstatus    0
mainroad            0
airconditioning     0
bathrooms           0
price               0
basement            0
stories             0
prefarea            0
bedrooms            0
dtype: int64

In [65]:
df_proc = df.copy() #Prevents modifying the original data accidentally.


In [66]:
yn_cols = ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea"] #yn_cols → A list of column names that contain "Yes"/"No" values.

#Example: if "mainroad" = "Yes", convert to 1; "No" → 0.


In [67]:
def map_yes_no(series):  # Handle spaces/case safely: " Yes " -> "yes"
     return series.astype(str).str.strip().str.lower().map({"yes": 1, "no": 0})
# '''map_yes_no(series) → Converts "Yes"/"No" text in a Series (column) into 1/0.

# series.astype(str) → Ensures the column is treated as strings.

# .str.strip() → Removes extra spaces (" Yes " → "Yes").

# .str.lower() → Converts everything to lowercase ("YES" → "yes").

# .map({"yes": 1, "no": 0}) → Maps "yes" → 1, "no" →'''


In [70]:
for c in yn_cols:
    df_proc[c] = map_yes_no(df_proc[c]) 
#     for c in yn_cols: → Loop through each Yes/No column.

# df_proc[c] = ... → Replace that column in the dataframe.

# map_yes_no(df_proc[c]) → Apply the conversion function to each column.


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CC Timetable Generator — Log Analysis (one-file script)

What it computes:
- Total API requests served (by scanning lines containing HTTP method GET/POST)
- Endpoint popularity (path counts)
- Performance metrics per endpoint: average, max, and p95 response time
- Users: unique IDs and per-year counts (year inferred from user ID tokens)
- Timetable generation insights:
  - Total timetables generated
  - Average timetables generated per hour (if timestamps span can be computed)
  - Algorithm usage counts (Backtracking vs Iterative random sampling)

Extras:
- Global success/error rates
- Per-endpoint success/error rates
- Requests per hour (if timestamps exist)

Assumptions / Robustness:
- Handles log lines where fields may appear in any order.
- Extracts endpoint from request like "GET /api/generate?x=1 HTTP/1.1" -> "/api/generate"
- Response time picked from common patterns (response_time=123ms, time=85ms, duration=0.12s, "123ms", etc.)
- User IDs from tokens like user=2021ABC123, uid:19BCE1234, user_id=2022-xyz…, etc.
- Year inferred as the first 4-digit number in 2010–2035 found within a captured user ID token.
- Timestamps parsed from typical patterns (e.g., "2025-08-20 10:15:23", "[2025-08-20 10:15:23,456]", ISO8601).

Usage:
    python analyze_logs.py logfile.txt
    python analyze_logs.py logfile.txt --out report.md

If you have multiple files:
    python analyze_logs.py logs/*.log --out combined_report.md
"""

import argparse
import re
import sys
import statistics
from collections import defaultdict, Counter
from datetime import datetime

# -----------------------------
# Regex patterns (liberal)
# -----------------------------

# HTTP method + path + optional status
METHOD_RE = re.compile(r'\b(?P<method>GET|POST)\b', re.IGNORECASE)

# Try to extract request target like `GET /path?query ...` => group 'path'
REQUEST_LINE_RE = re.compile(
    r'\b(?:GET|POST)\s+(?P<path>/[^\s\?"]+)', re.IGNORECASE
)

# Status code: often 200, 400 etc. Allow 3-digit at word boundary.
STATUS_RE = re.compile(r'\b(?P<status>\d{3})\b')

# Response time patterns (labelled preferred):
# e.g., response_time=123ms, time: 85ms, duration=0.12s, rt=250ms
LABELLED_TIME_RE = re.compile(
    r'\b(?:response[_\- ]?time|duration|latency|time|rt)\s*[:=]\s*(?P<num>\d+(?:\.\d+)?)\s*(?P<Unit>ms|s)\b',
    re.IGNORECASE
)

# Unlabelled "123ms" fallback (avoid matching timestamps by requiring ms/s)
UNLABELLED_TIME_RE = re.compile(
    r'\b(?P<num>\d+(?:\.\d+)?)\s*(?P<Unit>ms|s)\b'
)

# User identifiers: try common keys
USER_TOKEN_RE = re.compile(
    r'\b(?:user(?:_id)?|uid|id)\s*[:=]\s*(?P<uid>[A-Za-z0-9_\-:/\.]+)',
    re.IGNORECASE
)

# A 4-digit year embedded in a UID (common campus years)
YEAR_IN_UID_RE = re.compile(r'\b(20(1\d|2\d|3[0-5]))\b')

# Timetable generation lines and algorithm names
# We count a line as "timetable generated" if it contains "timetable" + "generat"
TIMETABLE_LINE_RE = re.compile(r'\btime[\w\- ]*table\b.*\bgenerat', re.IGNORECASE)
ALGO_BACKTRACK_RE = re.compile(r'\bbacktracking\b', re.IGNORECASE)
ALGO_ITERATIVE_RE = re.compile(r'\biterative\s+random\s+sampling\b|\biterative\b', re.IGNORECASE)

# Timestamp patterns (try a few common ones)
TS_PATTERNS = [
    # [2025-08-20 10:15:23,456]
    r'\[(?P<ts>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}(?:,\d{3})?)\]',
    # 2025-08-20 10:15:23 or 2025-08-20T10:15:23
    r'(?P<ts>\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d{3,6})?)',
    # 20/08/2025 10:15:23
    r'(?P<ts>\d{2}/\d{2}/\d{4}\s+\d{2}:\d{2}:\d{2})',
]
TS_RES = [re.compile(p) for p in TS_PATTERNS]

def parse_ts(s):
    for rx in TS_RES:
        m = rx.search(s)
        if not m:
            continue
        ts = m.group('ts')
        # Try multiple datetime formats
        for fmt in (
            "%Y-%m-%d %H:%M:%S,%f",
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S.%f",
            "%Y-%m-%dT%H:%M:%S",
            "%d/%m/%Y %H:%M:%S",
        ):
            try:
                return datetime.strptime(ts, fmt)
            except ValueError:
                continue
    return None

def ms_from_match(num_str, unit_str):
    val = float(num_str)
    unit = unit_str.lower()
    if unit == 'ms':
        return val
    if unit == 's':
        return val * 1000.0
    return None

def extract_latency_ms(line):
    # Prefer labelled matches
    m = LABELLED_TIME_RE.search(line)
    if m:
        return ms_from_match(m.group('num'), m.group('Unit'))
    # Fallback to unlabelled "123ms" (but try not to pick up timestamps)
    # We'll choose the smallest value that looks like a latency, to avoid e.g., a larger unrelated seconds value.
    candidates = []
    for m in UNLABELLED_TIME_RE.finditer(line):
        ms = ms_from_match(m.group('num'), m.group('Unit'))
        if ms is not None:
            candidates.append(ms)
    if candidates:
        return min(candidates)
    return None

def extract_endpoint(line):
    m = REQUEST_LINE_RE.search(line)
    if not m:
        return None
    path = m.group('path')
    # normalize: strip trailing slashes (except root)
    if len(path) > 1 and path.endswith('/'):
        path = path[:-1]
    return path

def extract_method(line):
    m = METHOD_RE.search(line)
    return m.group('method').upper() if m else None

def extract_status(line):
    # status can appear multiple times; prefer a 3xx/4xx/5xx/2xx near end, but keep it simple
    # We'll take the last 3-digit number in the line as a heuristic.
    statuses = [int(m.group('status')) for m in STATUS_RE.finditer(line)]
    if not statuses:
        return None
    return statuses[-1]

def extract_user_id_and_year(line):
    uid = None
    year = None
    m = USER_TOKEN_RE.search(line)
    if m:
        uid = m.group('uid')
        y = YEAR_IN_UID_RE.search(uid)
        if y:
            year = int(y.group(1))
    else:
        # Fallback: if we see a token that looks like a year elsewhere in the line, but only use it if
        # there's also a word "user" nearby to avoid miscounting.
        if re.search(r'\buser\b', line, re.IGNORECASE):
            y = YEAR_IN_UID_RE.search(line)
            if y:
                year = int(y.group(1))
    return uid, year

def human_ms(ms):
    if ms is None:
        return "n/a"
    if ms >= 1000:
        return f"{ms/1000:.2f}s"
    return f"{ms:.0f}ms"

def p95(values):
    if not values:
        return None
    v = sorted(values)
    # 95th percentile (nearest-rank method)
    idx = max(0, int(round(0.95 * (len(v) - 1))))
    return v[idx]

def main():
    ap = argparse.ArgumentParser(description="Analyze CC Timetable Generator logs and produce a Markdown report.")
    ap.add_argument("files", nargs="+", help="Path(s) to log file(s)")
    ap.add_argument("--out", help="Optional path to write Markdown report")
    args = ap.parse_args()

    total_requests = 0
    global_status = Counter()
    endpoints = defaultdict(lambda: {
        "count": 0,
        "latencies": [],
        "status": Counter(),
        "methods": Counter(),
    })

    users_unique = set()
    year_counts = Counter()

    tt_total = 0
    algo_counts = Counter()

    timestamps = []

    for path in args.files:
        try:
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                for line in f:
                    # Timestamp
                    ts = parse_ts(line)
                    if ts:
                        timestamps.append(ts)

                    # Request-level parsing
                    method = extract_method(line)
                    endpoint = extract_endpoint(line)
                    status = extract_status(line)
                    latency = extract_latency_ms(line)

                    if method and endpoint:
                        total_requests += 1
                        endpoints[endpoint]["count"] += 1
                        endpoints[endpoint]["methods"][method] += 1
                        if latency is not None:
                            endpoints[endpoint]["latencies"].append(latency)
                        if status is not None:
                            endpoints[endpoint]["status"][status] += 1
                            global_status[status] += 1

                    # Users
                    uid, year = extract_user_id_and_year(line)
                    if uid:
                        users_unique.add(uid)
                    if year:
                        year_counts[year] += 1

                    # Timetable generation lines & algorithms
                    if TIMETABLE_LINE_RE.search(line):
                        tt_total += 1
                    if ALGO_BACKTRACK_RE.search(line):
                        algo_counts["Backtracking"] += 1
                    if ALGO_ITERATIVE_RE.search(line):
                        algo_counts["Iterative random sampling"] += 1

        except FileNotFoundError:
            print(f"[WARN] File not found: {path}", file=sys.stderr)
        except Exception as e:
            print(f"[WARN] Error reading {path}: {e}", file=sys.stderr)

    # Compute time span (for rates)
    rate_note = ""
    avg_tt_per_hour = None
    reqs_per_hour = None
    if timestamps:
        tmin = min(timestamps)
        tmax = max(timestamps)
        elapsed_sec = max(1, (tmax - tmin).total_seconds())
        hours = elapsed_sec / 3600.0
        if hours > 0:
            avg_tt_per_hour = tt_total / hours
            reqs_per_hour = total_requests / hours
            rate_note = f"(from {tmin} to {tmax}, ~{hours:.2f}h span)"
    # Prepare report
    lines = []
    lines.append("# CC Timetable Generator — Log Report")
    if rate_note:
        lines.append(f"_Time window detected: {rate_note}_")
    lines.append("")

    # Total API requests
    lines.append("## Total API requests served")
    lines.append(f"- **Total**: {total_requests}")
    if global_status:
        successes = sum(c for s, c in global_status.items() if 200 <= s < 300)
        client_err = sum(c for s, c in global_status.items() if 400 <= s < 500)
        server_err = sum(c for s, c in global_status.items() if 500 <= s < 600)
        lines.append(f"- **Success (2xx)**: {successes}")
        lines.append(f"- **Client errors (4xx)**: {client_err}")
        lines.append(f"- **Server errors (5xx)**: {server_err}")
        if total_requests:
            err_rate = (client_err + server_err) * 100.0 / total_requests
            lines.append(f"- **Error rate**: {err_rate:.2f}%")
    if reqs_per_hour is not None:
        lines.append(f"- **Requests/hour**: {reqs_per_hour:.2f}")
    lines.append("")

    # Endpoint popularity & performance
    lines.append("## Endpoint Popularity & Performance")
    if not endpoints:
        lines.append("_No endpoints detected (check log format or patterns)._")
    else:
        # Sort by count desc
        for ep, data in sorted(endpoints.items(), key=lambda kv: kv[1]["count"], reverse=True):
            cnt = data["count"]
            methods = ", ".join(f"{m}:{c}" for m,c in sorted(data["methods"].items()))
            # Latency
            lat = data["latencies"]
            avg_ms = statistics.mean(lat) if lat else None
            max_ms = max(lat) if lat else None
            p95_ms = p95(lat) if lat else None
            # Status mix
            s_ok = sum(c for s,c in data["status"].items() if 200 <= s < 300)
            s_4 = sum(c for s,c in data["status"].items() if 400 <= s < 500)
            s_5 = sum(c for s,c in data["status"].items() if 500 <= s < 600)
            lines.append(f"### {ep}")
            lines.append(f"- Requests: **{cnt}**")
            lines.append(f"- Methods: {methods if methods else 'n/a'}")
            lines.append(f"- Avg latency: {human_ms(avg_ms)}  |  Max: {human_ms(max_ms)}  |  P95: {human_ms(p95_ms)}")
            if (s_4 + s_5 + s_ok) > 0:
                lines.append(f"- Status mix: 2xx={s_ok}, 4xx={s_4}, 5xx={s_5}")
            lines.append("")

    # Users
    lines.append("## Users")
    lines.append(f"- **Unique users/IDs**: {len(users_unique)}")
    if year_counts:
        lines.append("- **Users by year**:")
        for y, c in sorted(year_counts.items()):
            lines.append(f"  - {y}: {c}")
    else:
        lines.append("_No year information detected in user IDs._")
    lines.append("")

    # Timetable generation insights
    lines.append("## Timetable Generation Insights")
    lines.append(f"- **Total timetables generated**: {tt_total}")
    if avg_tt_per_hour is not None:
        lines.append(f"- **Average timetables per hour**: {avg_tt_per_hour:.2f}")
    else:
        lines.append("- **Average timetables per hour**: n/a (timestamps not found)")
    if algo_counts:
        lines.append("- **Algorithm usage:**")
        for k,v in sorted(algo_counts.items(), key=lambda kv: kv[0]):
            lines.append(f"  - {k}: {v}")
    else:
        lines.append("- **Algorithm usage:** n/a")
    lines.append("")

    # Nice extras summary
    lines.append("## Additional Metrics")
    if reqs_per_hour is not None:
        lines.append(f"- Requests/hour: {reqs_per_hour:.2f}")
    # Top endpoints quick glance
    if endpoints:
        top_ep = max(endpoints.items(), key=lambda kv: kv[1]["count"])[0]
        lines.append(f"- Busiest endpoint: `{top_ep}`")
    lines.append("")

    report = "\n".join(lines)

    if args.out:
        try:
            with open(args.out, "w", encoding="utf-8") as f:
                f.write(report)
            print(f"[OK] Report written to {args.out}")
        except Exception as e:
            print(f"[WARN] Failed to write {args.out}: {e}", file=sys.stderr)
            print(report)
    else:
        print(report)

if __name__ == "__main__":
    main()


SyntaxError: invalid syntax (3713095338.py, line 2)