In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from itertools import combinations
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import networkx as nx
from docx import Document
from docx.shared import Inches

# --- Paths and setup ---
# Input CSV (semicolon-separated), the directory that receives generated
# plots, and the Word report written at the end of the script.
csv_path = 'Evaluation Excels/evaluation_result_2025-05-29-00-26-33.csv'
output_dir = 'plots'
report_path = 'evaluation_report.docx'

os.makedirs(output_dir, exist_ok=True)

# --- Load data ---
# The file is split by hand: every record is split on at most
# ``n_cols - 1`` separators, so any additional ';' stays inside the last
# column, and records with too few fields are padded with empty strings.
# 'utf-8-sig' drops a leading byte-order mark (if present) so the first
# header name is not silently prefixed with '\ufeff'; files without a BOM
# are read exactly as with plain 'utf-8'.
with open(csv_path, 'r', encoding='utf-8-sig') as f:
    lines = f.readlines()

header = lines[0].rstrip('\n').split(';')
n_cols = len(header)
data = []
for line in lines[1:]:
    record = line.rstrip('\n')
    # Skip blank lines: they would otherwise become all-empty rows and show
    # up as an empty category in every count and statistic further down.
    if not record.strip():
        continue
    fields = record.split(';', n_cols - 1)
    # Multiplying by a non-positive number yields [], so this is a no-op for
    # complete records.
    fields += [''] * (n_cols - len(fields))
    data.append(fields)

df = pd.DataFrame(data, columns=header)

# Convert numeric columns
# Every value was read as text; turn the known metric columns into numbers.
# Values that cannot be parsed become NaN (errors='coerce').
numeric_cols = [
    'IBS',
    'CFS',
    'CFS without Floor and Walls',
    'VCIS',
    'NCIS',
    'TotalElementsCount',
    'PipelineDurationSeconds',
]
for col in numeric_cols:
    if col not in df.columns:
        # Column is absent from this CSV; nothing to convert.
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Extract categories

def parse_categories(fname):
    """Split a file name on '_' and read the category fields by position.

    Only the base name of ``fname`` is used. Token 1 is the case, token 2
    the prompt, token 3 decides the validator label, and token 4 is the
    model unless it starts with a ``YYYY-MM-DD`` date. Any field whose
    token is missing is returned as an empty string.

    Returns:
        A ``pd.Series`` with the keys 'Case', 'Prompt', 'Validator' and
        'Model', in that order.
    """
    tokens = os.path.basename(fname).split('_')
    # Pad with None so positions 1..4 can be unpacked unconditionally.
    tokens = tokens + [None] * max(0, 5 - len(tokens))
    _, case_tok, prompt_tok, validator_tok, model_tok = tokens[:5]

    if validator_tok is None:
        validator = ''
    elif 'withValidator' in validator_tok:
        validator = 'with Validator'
    else:
        validator = 'without Validator'

    # A date in position 4 means the file name carries no model token.
    if model_tok is None or re.match(r'\d{4}-\d{2}-\d{2}', model_tok):
        model = ''
    else:
        model = model_tok

    return pd.Series({
        'Case': case_tok or '',
        'Prompt': prompt_tok or '',
        'Validator': validator,
        'Model': model,
    })

# Derive the four category columns from each row's file name.
category_columns = ['Case', 'Prompt', 'Validator', 'Model']
df[category_columns] = df['FileName'].apply(parse_categories)

# Parse timestamps

def parse_timestamp(fname):
    """Parse the timestamp formed by the last two '_'-separated tokens.

    Only the base name of ``fname`` is used. Its last two tokens are joined
    with '_' and parsed with the format ``%Y-%m-%d_%H-%M-%S``.

    Returns:
        The parsed ``pd.Timestamp``, or ``pd.NaT`` when the tokens do not
        form a timestamp in that format.
    """
    parts = os.path.basename(fname).split('_')
    ts_str = '_'.join(parts[-2:])
    try:
        return pd.to_datetime(ts_str, format='%Y-%m-%d_%H-%M-%S')
    except (ValueError, TypeError, OverflowError):
        # Only parsing failures mean "no timestamp". A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return pd.NaT


# Attach the timestamp parsed from each file name, plus its calendar date.
timestamps = df['FileName'].apply(parse_timestamp)
df['Timestamp'] = timestamps
df['Date'] = df['Timestamp'].dt.date

# Identify score columns
# Prefer the known score metrics that exist in the data; if none of them is
# present, fall back to every numeric column.
known_scores = ['IBS', 'CFS', 'CFS without Floor and Walls', 'VCIS', 'NCIS']
score_cols = (
    [name for name in known_scores if name in df.columns]
    or df.select_dtypes(include=[np.number]).columns.tolist()
)

def _add_describe_table(document, stats):
    """Append a transposed ``describe()`` frame to ``document`` as a table.

    The header row is 'Metric' followed by the column names of ``stats``.
    Each row of ``stats`` becomes one table row: the index label first, then
    every value formatted with three decimals.

    Returns:
        The table object that was added to ``document``.
    """
    table = document.add_table(rows=1, cols=len(stats.columns) + 1)
    header_cells = table.rows[0].cells
    header_cells[0].text = 'Metric'
    for pos, col_name in enumerate(stats.columns, start=1):
        header_cells[pos].text = str(col_name)
    for metric, values in stats.iterrows():
        row_cells = table.add_row().cells
        row_cells[0].text = str(metric)
        for pos, val in enumerate(values, start=1):
            row_cells[pos].text = f"{val:.3f}"
    return table


# --- Initialize Word report ---
doc = Document()
doc.add_heading('Evaluation Report', 0)

# Descriptive statistics
# One row per score column; the columns are the describe() statistics.
desc = df[score_cols].describe().T

doc.add_heading('Descriptive Statistics (Overall)', level=1)
table = _add_describe_table(doc, desc)


# ---- Descriptive statistics for PipelineDurationSeconds and TotalElementsCount ----
extra_cols = [
    name
    for name in ('PipelineDurationSeconds', 'TotalElementsCount')
    if name in df.columns
]

if extra_cols:
    # Same layout as above: count, mean, std, min, 25%, 50%, 75%, max
    desc_extra = df[extra_cols].describe().T

    doc.add_heading('Descriptive Statistics (Duration & Elements)', level=1)
    table2 = _add_describe_table(doc, desc_extra)


# --- Descriptive statistics by Validator ---
# parse_categories marks a missing validator with '' rather than NaN, so an
# isnull() test can never detect "no validator information". Check for at
# least one non-empty value instead.
has_validator_info = (
    'Validator' in df.columns
    and (df['Validator'].fillna('') != '').any()
)
if has_validator_info:
    doc.add_heading('Descriptive Statistics by Validator', level=1)
    for validator_value, group in df.groupby('Validator'):
        doc.add_heading(f'{validator_value}', level=2)
        desc_val = group[score_cols].describe().T
        table_val = _add_describe_table(doc, desc_val)



# --- 2. Category Distributions and plots ---
# For each category column: save a bar chart of its value counts, then add a
# heading, a two-column count table and the chart to the report.
for cat in ['Prompt', 'Case', 'Validator', 'Model']:
    if cat not in df.columns:
        continue

    counts = df[cat].value_counts()

    # Plot
    fig, ax = plt.subplots(figsize=(6, 4))
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Count by {cat}')
    ax.set_xlabel(cat)
    ax.set_ylabel('Count')
    fig.tight_layout()
    plot_path = f'{output_dir}/bar_{cat}.png'
    fig.savefig(plot_path)
    plt.close(fig)

    # Add to report
    doc.add_heading(f'Counts by {cat}', level=1)
    tbl = doc.add_table(rows=1, cols=2)
    hdr = tbl.rows[0].cells
    hdr[0].text, hdr[1].text = cat, 'Count'
    for value, n_rows in counts.items():
        cells = tbl.add_row().cells
        cells[0].text = str(value)
        cells[1].text = str(n_rows)
    doc.add_picture(plot_path, width=Inches(6))

# --- 3. Save all remaining plots in the report ---
# Every PNG in the output directory that is not one of the 'bar_' charts
# embedded above gets its own sub-heading, derived from the file name.
doc.add_heading('All Other Plots', level=1)
other_plots = sorted(
    name
    for name in os.listdir(output_dir)
    if name.endswith('.png') and not name.startswith('bar_')
)
for img in other_plots:
    title = img.replace('.png', '').replace('_', ' ').title()
    doc.add_heading(title, level=2)
    doc.add_picture(os.path.join(output_dir, img), width=Inches(6))

# Save report
doc.save(report_path)
print(f'Report saved to {report_path}')


Report saved to evaluation_report.docx
