In [22]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import os
import re

# Extract the values of the seventh column and convert them to floats
def get_seventh_column_values(df, type_val, module_val, name_val):
    """Collect the numbers stored in the seventh column of the matching rows.

    Rows are selected where the ``type``, ``module`` and ``name`` columns
    equal the given values. Each selected cell of the seventh column
    (positional index 6) is read as a whitespace-separated list of numbers,
    and the lists are concatenated in row order.

    Args:
        df: DataFrame with ``type``, ``module`` and ``name`` columns and at
            least seven columns.
        type_val: Value the ``type`` column must equal.
        module_val: Value the ``module`` column must equal.
        name_val: Value the ``name`` column must equal.

    Returns:
        A one-dimensional float NumPy array with every parsed number, or
        ``None`` when no row matches or the matching rows hold no values.

    Raises:
        ValueError: If a token in a selected cell is not a valid float.
    """
    selector = (
        (df['type'] == type_val)
        & (df['module'] == module_val)
        & (df['name'] == name_val)
    )
    matching_rows = df[selector]
    if matching_rows.empty:
        return None

    # Missing cells (NaN) carry no samples. Dropping them avoids the
    # TypeError that iterating over a float NaN would raise.
    cells = matching_rows.iloc[:, 6].dropna()

    # astype(str) lets cells that were parsed as plain numbers go through the
    # same split-and-convert path as whitespace-separated text cells.
    numbers = [
        float(token)
        for cell in cells.astype(str)
        for token in cell.split()
    ]
    if not numbers:
        return None
    return np.array(numbers, dtype=float)

# Compute the confidence interval of the mean
def confidence_interval(data, confidence=0.95):
    """Return the sample mean and its two-sided Student-t confidence bounds.

    Args:
        data: Sequence of numeric samples.
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        A tuple ``(mean, lower_bound, upper_bound)``.
    """
    center = np.mean(data)
    std_err = stats.sem(data)  # standard error of the mean
    dof = len(data) - 1
    # Two-sided interval: take the quantile that leaves (1 - confidence) / 2
    # in the upper tail.
    t_crit = stats.t.ppf((1 + confidence) / 2, dof)
    half_width = std_err * t_crit
    return center, center - half_width, center + half_width

# Remove the trailing '_vec.csv' from a file name
def remove_vec_suffix(file_name):
    """Return ``file_name`` without a trailing ``'_vec.csv'``.

    Names that do not end with ``'_vec.csv'`` are returned unchanged.
    """
    suffix = '_vec.csv'
    return file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name

# Get the trailing number from a file name
def get_file_number(file_name):
    """Return the integer formed by the digits at the end of ``file_name``.

    Returns 0 when the name does not end with a digit.
    """
    trailing_digits = re.search(r'\d+$', file_name)
    # No trailing number found: fall back to 0.
    return int(trailing_digits.group()) if trailing_digits else 0

# Analyze a single CSV file
def analyze_csv(file_path, ignored_files, results, cw=1):
    """Analyze one CSV file and append its summary statistics to ``results``.

    The file is read with ``pandas.read_csv``. Five vectors are looked up via
    ``get_seventh_column_values``: ``pDistribution`` and ``vDistribution`` of
    ``TandemQueueSystem.Server``, ``lifeTime:vector`` of
    ``TandemQueueSystem.sink`` and ``queueLength:vector`` of
    ``TandemQueueSystem.Q1`` / ``TandemQueueSystem.Q2``. If any of them is
    missing, the file is recorded in ``ignored_files`` and skipped.

    Args:
        file_path: Path of the CSV file to analyze.
        ignored_files: List that receives a ``(file label, description)``
            tuple when the file is skipped. Mutated in place.
        results: List that receives one dict of statistics per analyzed
            file. Mutated in place.
        cw: Weight applied to ``lifeTime`` in
            ``U = vDistribution - pDistribution - cw * lifeTime``.
            Defaults to 1.

    Returns:
        None.
    """
    df = pd.read_csv(file_path)
    # The same label is used for result rows and ignored entries.
    file_label = remove_vec_suffix(os.path.basename(file_path))

    descriptions = df.loc[df['attrname'] == 'description', 'attrvalue'].values
    description = descriptions[0] if len(descriptions) > 0 else "N/A"

    pDistribution = get_seventh_column_values(df, "vector", "TandemQueueSystem.Server", "pDistribution")
    vDistribution = get_seventh_column_values(df, "vector", "TandemQueueSystem.Server", "vDistribution")
    lifeTime = get_seventh_column_values(df, "vector", "TandemQueueSystem.sink", "lifeTime:vector")

    if pDistribution is None or vDistribution is None or lifeTime is None:
        ignored_files.append((file_label, description))
        return

    # Truncate to a common length so the element-wise arithmetic below lines up.
    common_length = min(len(pDistribution), len(vDistribution), len(lifeTime))
    pDistribution = pDistribution[:common_length]
    vDistribution = vDistribution[:common_length]
    lifeTime = lifeTime[:common_length]

    queueLength_Q1 = get_seventh_column_values(df, "vector", "TandemQueueSystem.Q1", "queueLength:vector")
    queueLength_Q2 = get_seventh_column_values(df, "vector", "TandemQueueSystem.Q2", "queueLength:vector")

    if queueLength_Q1 is None or queueLength_Q2 is None:
        ignored_files.append((file_label, description))
        return

    mean_W, ci_low_W, ci_high_W = confidence_interval(lifeTime)

    U_values = vDistribution - pDistribution - (cw * lifeTime)
    mean_U, ci_low_U, ci_high_U = confidence_interval(U_values)

    # NOTE(review): each queue-length vector is divided by its own sum, so the
    # mean of the result is always 1 / len(vector); a zero sum yields NaN.
    # Confirm that this is the intended definition of utilization.
    utilization_Q1 = queueLength_Q1 / np.sum(queueLength_Q1)
    utilization_Q2 = queueLength_Q2 / np.sum(queueLength_Q2)
    mean_utilization_Q1, ci_low_utilization_Q1, ci_high_utilization_Q1 = confidence_interval(utilization_Q1)
    mean_utilization_Q2, ci_low_utilization_Q2, ci_high_utilization_Q2 = confidence_interval(utilization_Q2)

    results.append({
        "File": file_label,
        "File_Number": get_file_number(file_label),
        "Description": description,
        "mean_W": mean_W,
        "ci_low_W": ci_low_W,
        "ci_high_W": ci_high_W,
        "mean_U": mean_U,
        "ci_low_U": ci_low_U,
        "ci_high_U": ci_high_U,
        # NOTE(review): the max/min interval columns have always been the
        # confidence interval of the mean lifeTime (same data, same level).
        # The values are reused here instead of being recomputed; confirm
        # whether a different interval was intended.
        "max_W": np.max(lifeTime),
        "ci_low_max_W": ci_low_W,
        "ci_high_max_W": ci_high_W,
        "min_W": np.min(lifeTime),
        "ci_low_min_W": ci_low_W,
        "ci_high_min_W": ci_high_W,
        "mean_utilization_Q1": mean_utilization_Q1,
        "ci_low_utilization_Q1": ci_low_utilization_Q1,
        "ci_high_utilization_Q1": ci_high_utilization_Q1,
        "mean_utilization_Q2": mean_utilization_Q2,
        "ci_low_utilization_Q2": ci_low_utilization_Q2,
        "ci_high_utilization_Q2": ci_high_utilization_Q2,
    })

# Analyze every CSV file in a directory
def analyze_directory(directory_path, output_dir='.'):
    """Analyze every eligible CSV file in a directory and write summary CSVs.

    Files whose name ends with ``.csv`` but not ``_sca.csv`` are passed to
    ``analyze_csv``. Analyzed files are written to ``results_summary.csv``
    and skipped files to ``ignored_files.csv``, both ordered by the trailing
    number of the file label. Neither file is written when its list is empty.

    Args:
        directory_path: Directory containing the CSV files to analyze.
        output_dir: Directory where the two summary files are written.
            Defaults to the current working directory.

    Returns:
        None.
    """
    ignored_files = []
    results = []

    # os.listdir gives no ordering guarantee; sorting the names keeps the
    # output reproducible when several files share the same file number.
    for file_name in sorted(os.listdir(directory_path)):
        if file_name.endswith('.csv') and not file_name.endswith('_sca.csv'):
            analyze_csv(os.path.join(directory_path, file_name), ignored_files, results)

    if results:
        results_df = (
            pd.DataFrame(results)
            # Stable sort: rows with equal file numbers keep their name order.
            .sort_values(by='File_Number', kind='mergesort')
            # File_Number is only a sort key and is not part of the output.
            .drop(columns=['File_Number'])
        )
        results_df.to_csv(os.path.join(output_dir, 'results_summary.csv'), index=False)

    if ignored_files:
        # Order the ignored entries by file number as well.
        ignored_sorted = sorted(ignored_files, key=lambda entry: get_file_number(entry[0]))
        ignored_df = pd.DataFrame(ignored_sorted, columns=['File', 'Description'])
        ignored_df.to_csv(os.path.join(output_dir, 'ignored_files.csv'), index=False)

# Example usage: analyze the CSV files found in ./results_CSV.
directory_path = './results_CSV'
analyze_directory(directory_path)
