In [16]:
import re
import pandas as pd
from statistics import mean, stdev

# Function to read the log file and filter lines containing "(Test)"
def read_log_file(file_path, marker="(Test)"):
    """Return the stripped lines of a log file that contain ``marker``.

    Args:
        file_path: Path of the text log file to read.
        marker: Substring a line must contain to be kept. Defaults to
            "(Test)", which is the filter this notebook originally
            hard-coded.

    Returns:
        list[str]: The matching lines in file order, with leading and
        trailing whitespace (including the newline) removed.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Iterate over the file handle directly so only the matching lines are
    # kept in memory, rather than the whole file via readlines().
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if marker in line]

# Function to capture a window of `capture_size` entries (30 by default) at every `skip_size`-th position (120 by default)
def capture_entries(data_list, capture_size=30, skip_size=120, start_index=0):
    """Collect a fixed-size window of entries at every ``skip_size``-th position.

    Beginning at ``start_index``, the slice
    ``data_list[pos:pos + capture_size]`` is taken and ``pos`` is then
    advanced by ``skip_size``; this repeats while ``pos`` is still inside
    the list. The windows are concatenated in order.

    Args:
        data_list: Sequence to sample from.
        capture_size: Number of entries taken at each position (the last
            window may be shorter if the list ends first).
        skip_size: Distance between the starts of consecutive windows.
        start_index: Position of the first window.

    Returns:
        list: The captured entries, in their original order.

    Raises:
        ValueError: If ``skip_size`` is not positive. A zero or negative
            stride can never move past the end of the list, which
            previously made this function loop forever.
    """
    if skip_size <= 0:
        raise ValueError(f"skip_size must be a positive integer, got {skip_size!r}")

    captured_entries = []
    # range() yields exactly the window starts the former while-loop visited.
    for window_start in range(start_index, len(data_list), skip_size):
        captured_entries.extend(data_list[window_start:window_start + capture_size])

    return captured_entries

# Define a pattern to extract model metrics
# Group 1 is the model name; groups 2-8 are the metric values in the order
# they appear in the log line (MAE, MSE, RMSE, MAPE, R2, Median AE,
# Explained Variance).
# R2 and Explained Variance both accept a minus sign: either score can drop
# below zero for a poor model, and a line whose value fails to match is
# skipped entirely by the extraction step.
pattern = re.compile(
    r"INFO ([\w-]+) \(Test\) - MAE: ([\d.]+), MSE: ([\d.]+), RMSE: ([\d.]+), MAPE: ([\d.]+), R2: ([\d.-]+), Median AE: ([\d.]+), Explained Variance: ([\d.-]+)"
)

# Function to extract metrics from a list of log entries
def extract_metrics_from_chunk(
    log_data_list,
    metric_pattern=None,
    metric_names=("MAE", "MSE", "RMSE", "MAPE", "R2", "Median AE", "Explained Variance"),
):
    """Parse regression metric lines and summarise each metric per model.

    Args:
        log_data_list: Iterable of log lines (strings).
        metric_pattern: Compiled regex whose first group is the model name
            and whose remaining groups are the metric values, in the same
            order as ``metric_names``. When omitted, the notebook-level
            ``pattern`` is looked up at call time.
        metric_names: Names given to the captured metric values.

    Returns:
        dict: ``{model: summary}``. Each summary holds ``"<metric>_mean"``
        and ``"<metric>_std"`` for every metric first, followed by
        ``"<metric>"`` mapped to the list of parsed float values. The
        standard deviation is ``0.0`` when a model has a single value.

    Raises:
        ValueError: If the regex does not capture exactly one value per
            metric name, or a captured value cannot be converted to float.
    """
    # Resolve the regex lazily so the notebook-level ``pattern`` keeps
    # working when no explicit pattern is supplied.
    regex = pattern if metric_pattern is None else metric_pattern
    metric_names = tuple(metric_names)

    metrics_dict = {}
    for log_entry in log_data_list:
        match = regex.search(log_entry)
        if not match:
            # Lines that do not fit the expected layout are skipped.
            continue

        model, *raw_values = match.groups()
        if len(raw_values) != len(metric_names):
            raise ValueError(
                f"pattern captured {len(raw_values)} metric values, "
                f"expected {len(metric_names)}"
            )

        if model not in metrics_dict:
            metrics_dict[model] = {name: [] for name in metric_names}

        # Convert values to float and store them under their metric name
        for name, raw_value in zip(metric_names, raw_values):
            metrics_dict[model][name].append(float(raw_value))

    # Compute mean and std, keeping the raw value lists at the end
    final_metrics_dict = {}
    for model, metrics in metrics_dict.items():
        mean_std_dict = {}
        for metric, values in metrics.items():
            mean_std_dict[f"{metric}_mean"] = mean(values)
            # stdev() needs at least two data points
            mean_std_dict[f"{metric}_std"] = stdev(values) if len(values) > 1 else 0.0

        # Appending the lists afterwards keeps them as the trailing keys
        mean_std_dict.update(metrics)
        final_metrics_dict[model] = mean_std_dict

    return final_metrics_dict


# Function to process data in chunks of 30 entries and extract metrics for each chunk
def process_in_chunks(data_list, chunk_size=30, extractor=None):
    """Summarise consecutive chunks of log entries into one DataFrame.

    ``data_list`` is cut into consecutive slices of ``chunk_size`` entries.
    Each slice is passed to ``extractor``; the returned
    ``{model: {column: value}}`` mapping becomes one row per model, tagged
    with an ``index_count`` label ("Experiment 1", "Experiment 2", ...)
    numbering the chunk it came from.

    Args:
        data_list: Sequence of log entries.
        chunk_size: Number of entries per chunk (the last chunk may be
            shorter).
        extractor: Callable that maps a chunk to a ``{model: {...}}``
            dict. When omitted, the notebook-level
            ``extract_metrics_from_chunk`` is looked up at call time, so
            the most recently executed definition is used.

    Returns:
        pandas.DataFrame: All per-chunk frames concatenated with a fresh
        0..n-1 index. Columns are ``Model``, the extractor's keys, and
        ``index_count``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``data_list`` is
            empty (there would be nothing to concatenate).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if extractor is None:
        extractor = extract_metrics_from_chunk

    df_list = []
    chunk_starts = range(0, len(data_list), chunk_size)
    for chunk_index, start in enumerate(chunk_starts, start=1):
        chunk = data_list[start:start + chunk_size]

        # Extract metrics for this chunk
        chunk_metrics = extractor(chunk)

        # Models become rows; the former index is exposed as a "Model" column.
        chunk_metrics_df = (
            pd.DataFrame(chunk_metrics).T
            .reset_index()
            .rename(columns={'index': 'Model'})
        )
        chunk_metrics_df["index_count"] = f'Experiment {chunk_index}'
        df_list.append(chunk_metrics_df)

    if not df_list:
        # pd.concat would raise a less specific ValueError on an empty list.
        raise ValueError("process_in_chunks received no entries to process")

    # Concatenate all chunk DataFrames into one
    return pd.concat(df_list, ignore_index=True)




In [17]:
# Input log for the regression (forecasting) run; read later via
# read_log_file(file_path). The commented-out line is presumably an
# alternative input file kept for manual switching.
# NOTE(review): absolute, machine-specific Windows path - the notebook will
# only run where this file exists at this location.
#file_path = "C:/Users/rkbho/Downloads/metadata_regression_disjoint_data.log"
file_path = "C:/Users/rkbho/Downloads/meta_data_regression_full.log"

In [18]:
# User identifiers. The processing cells iterate over this list: the position
# of an id selects its window in the log (start_index=index*30) and the id
# itself labels the "user_id" column and the Excel sheet name.
id_list = ["MMCS0002", "MMCS0003", "MMCS0005", "MMCS0007"]

In [19]:
# Row labels attached to every per-user result frame; both lists must have
# one entry per row produced by process_in_chunks.
features = ["baseline_features"]*24 + ["++mean_and_std_features"]*24 + ["++lag_features"]*24 + ["++lag_difference_features"]*24

time_frames = ["3hr"]*6 + ["6hr"]*6 + ["12hr"]*6 + ["24hr"]*6
final_time_frames = time_frames*4

# The two label lists are assigned to the same frame, so they must agree.
assert len(features) == len(final_time_frames), "label lists differ in length"

# NOTE: this cell uses whichever `pattern` / `extract_metrics_from_chunk`
# definitions are currently bound in the kernel. Both names are redefined for
# classification further down, so run this cell before those definitions.

# Step 1: Read the log file and filter (Test) logs
log_data_list = read_log_file(file_path)
print(len(log_data_list))

dataframes = {}

for index, user_id in enumerate(id_list):

    # Step 2: Capture this user's 30-entry window out of every 120 log entries
    captured_entries = capture_entries(log_data_list, capture_size=30, skip_size=120, start_index=index*30)

    print(len(captured_entries))
    # Step 3: Extract metrics and calculate averages
    mean_metrics_df = process_in_chunks(captured_entries)

    # Assigning a list raises if its length differs from the frame's row count.
    mean_metrics_df["features_class"] = features
    mean_metrics_df["time_frame"] = final_time_frames
    mean_metrics_df["user_id"] = user_id

    print(mean_metrics_df.shape)
    str_key = f"{user_id}_forecasting"
    dataframes[str_key] = mean_metrics_df

# Create a Pandas Excel writer using XlsxWriter as the engine; one sheet per user
with pd.ExcelWriter('forecasting_output.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, df in dataframes.items():
        print(sheet_name)
        df.to_excel(writer, sheet_name=sheet_name, index=False)

1920
480
(96, 26)
480
(96, 26)
480
(96, 26)
480
(96, 26)
MMCS0002_forecasting
MMCS0003_forecasting
MMCS0005_forecasting
MMCS0007_forecasting


In [20]:
# Rebinds file_path to the classification log; cells executed after this one
# read this file instead of the regression log.
# NOTE(review): absolute, machine-specific Windows path.
file_path = "C:/Users/rkbho/Downloads/meta_data_classification_full.log"

In [21]:
# Function to read the log file and filter lines containing "(Test) - Accuracy"
def read_log_file(file_path, marker="(Test) - Accuracy"):
    """Return the stripped lines of a log file that contain ``marker``.

    Args:
        file_path: Path of the text log file to read.
        marker: Substring a line must contain to be kept. Defaults to
            "(Test) - Accuracy", the filter this definition originally
            hard-coded.

    Returns:
        list[str]: The matching lines in file order, with leading and
        trailing whitespace (including the newline) removed.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Iterate over the file handle directly so only the matching lines are
    # kept in memory, rather than the whole file via readlines().
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if marker in line]

# Function to capture a window of `capture_size` entries (30 by default) at every `skip_size`-th position (120 by default)
def capture_entries(data_list, capture_size=30, skip_size=120, start_index=0):
    """Gather ``capture_size`` entries at each stride of ``skip_size``.

    The first window starts at ``start_index``; each following window
    starts ``skip_size`` positions after the previous one, for as long as
    the start position lies inside ``data_list``. Windows are joined in
    order into a single list.

    Args:
        data_list: Sequence to sample from.
        capture_size: Entries taken per window (the final window may be
            shorter when the list ends first).
        skip_size: Distance between the starts of consecutive windows.
        start_index: Start position of the first window.

    Returns:
        list: The captured entries, in their original order.

    Raises:
        ValueError: If ``skip_size`` is zero or negative; such a stride
            never advances past the end of the list and used to hang.
    """
    if skip_size <= 0:
        raise ValueError(f"skip_size must be a positive integer, got {skip_size!r}")

    captured_entries = []
    # Same start positions as the former while-loop, expressed as a range.
    for window_start in range(start_index, len(data_list), skip_size):
        captured_entries.extend(data_list[window_start:window_start + capture_size])

    return captured_entries

# Define a pattern to extract model metrics for classification tasks
# Group 1 is the model name; groups 2-6 are Accuracy, Precision, Recall,
# F1 Score and AUC-ROC, each captured as a run of digits and dots.
# The adjacent raw-string pieces are joined by Python into a single regex.
pattern = re.compile(
    r"INFO ([\w-]+) \(Test\) - "
    r"Accuracy: ([\d.]+), "
    r"Precision: ([\d.]+), "
    r"Recall: ([\d.]+), "
    r"F1 Score: ([\d.]+), "
    r"AUC-ROC: ([\d.]+)"
)

def extract_metrics_from_chunk(
    log_data_list,
    metric_pattern=None,
    metric_names=("Accuracy", "Precision", "Recall", "F1 Score", "AUC-ROC"),
):
    """
    Extracts classification metrics from log entries and summarises them per model.

    Args:
        log_data_list: Iterable of log lines (strings).
        metric_pattern: Compiled regex whose first group is the model name
            and whose remaining groups are the metric values, in the same
            order as ``metric_names``. When omitted, the notebook-level
            ``pattern`` is looked up at call time.
        metric_names: Names given to the captured metric values.

    Returns:
        dict: ``{model: summary}``. Each summary lists ``"<metric>_mean"``
        and ``"<metric>_std"`` for every metric first, then ``"<metric>"``
        mapped to the list of parsed float values. The standard deviation
        is ``0.0`` when a model has a single value.

    Raises:
        ValueError: If the regex does not capture exactly one value per
            metric name, or a captured value cannot be converted to float.
    """
    # Resolve the regex lazily so the notebook-level ``pattern`` keeps
    # working when no explicit pattern is supplied.
    regex = pattern if metric_pattern is None else metric_pattern
    metric_names = tuple(metric_names)

    metrics_dict = {}

    # Extract metrics from log entries
    for log_entry in log_data_list:
        match = regex.search(log_entry)
        if not match:
            # Lines that do not fit the expected layout are skipped.
            continue

        model, *raw_values = match.groups()
        if len(raw_values) != len(metric_names):
            raise ValueError(
                f"pattern captured {len(raw_values)} metric values, "
                f"expected {len(metric_names)}"
            )

        # Initialize dictionary for the model if not already present
        if model not in metrics_dict:
            metrics_dict[model] = {name: [] for name in metric_names}

        # Convert values to float and store them under their metric name
        for name, raw_value in zip(metric_names, raw_values):
            metrics_dict[model][name].append(float(raw_value))

    # Compute mean and std, keeping the raw value lists at the end
    final_metrics_dict = {}
    for model, metrics in metrics_dict.items():
        mean_std_dict = {}
        for metric, values in metrics.items():
            mean_std_dict[f"{metric}_mean"] = mean(values)
            # stdev() needs at least two data points
            mean_std_dict[f"{metric}_std"] = stdev(values) if len(values) > 1 else 0.0

        # Appending the lists afterwards keeps them as the trailing keys
        mean_std_dict.update(metrics)
        final_metrics_dict[model] = mean_std_dict

    return final_metrics_dict


In [22]:
# Row labels attached to every per-user result frame; both lists must have
# one entry per row produced by process_in_chunks.
features = ["baseline_features"]*24 + ["++mean_and_std_features"]*24 + ["++lag_features"]*24 + ["++lag_difference_features"]*24

time_frames = ["3hr"]*6 + ["6hr"]*6 + ["12hr"]*6 + ["24hr"]*6
final_time_frames = time_frames*4

# The two label lists are assigned to the same frame, so they must agree.
assert len(features) == len(final_time_frames), "label lists differ in length"

# NOTE: this cell relies on the classification definitions of `read_log_file`,
# `pattern` and `extract_metrics_from_chunk` being the ones currently bound in
# the kernel, and on `file_path` pointing at the classification log.

# Step 1: Read the log file and filter the classification (Test) logs
log_data_list = read_log_file(file_path)
print(len(log_data_list))

dataframes = {}

for index, user_id in enumerate(id_list):

    # Step 2: Capture this user's 30-entry window out of every 120 log entries
    captured_entries = capture_entries(log_data_list, capture_size=30, skip_size=120, start_index=index*30)

    print(len(captured_entries))
    # Step 3: Extract metrics and calculate averages
    mean_metrics_df = process_in_chunks(captured_entries)

    # Assigning a list raises if its length differs from the frame's row count.
    mean_metrics_df["features_class"] = features
    mean_metrics_df["time_frame"] = final_time_frames
    mean_metrics_df["user_id"] = user_id

    print(mean_metrics_df.shape)
    str_key = f"{user_id}_classification"
    dataframes[str_key] = mean_metrics_df

# Create a Pandas Excel writer using XlsxWriter as the engine; one sheet per user
with pd.ExcelWriter('classification_output.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, df in dataframes.items():
        print(sheet_name)
        df.to_excel(writer, sheet_name=sheet_name, index=False)

1920
480
(96, 20)
480
(96, 20)
480
(96, 20)
480
(96, 20)
MMCS0002_classification
MMCS0003_classification
MMCS0005_classification
MMCS0007_classification
