In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

suffix = "reas"

# Ground-truth annotations (one row per image, with its wound 'Label').
anno_df = pd.read_excel('../wound_groundtruth.xlsx')

# video_anomaly_similarities_output_reas.xlsx is obtained from
# video_anomaly_similarities_output.xlsx by keeping only the columns that
# hold reasoning similarities.
desc_df = pd.read_excel(f'video_anomaly_similarities_output_{suffix}.xlsx')

# Strip the '.jpg' extension from both key columns so the names can be joined.
for frame, name_col in ((anno_df, 'Image Name'), (desc_df, 'video_name')):
    frame[name_col] = frame[name_col].str.replace('.jpg', '', regex=False)

# Inner join: only images present in both workbooks survive.
merged_df = anno_df.merge(desc_df, left_on='Image Name', right_on='video_name')

# Split each wound type separately (80/20, fixed seed) so every label keeps
# the same train/test ratio.
train_list, test_list = [], []
for label in ('Abrasions', 'Ingrown Nails', 'Stab Wound', 'Bruises', 'Cut', 'Laceration', 'Burns'):
    subset = merged_df.loc[merged_df['Label'] == label]
    train, test = train_test_split(subset, test_size=0.2, random_state=42)
    train_list.append(train)
    test_list.append(test)
    print(len(test))

# Stack the per-label pieces and remove the ground-truth columns before saving.
drop_cols = ['Label', 'Image Name']
train_df = pd.concat(train_list).reset_index(drop=True).drop(columns=drop_cols)
test_df = pd.concat(test_list).reset_index(drop=True).drop(columns=drop_cols)

train_df.to_excel(f'train_dataset_{suffix}.xlsx', index=False)
test_df.to_excel(f'test_dataset_{suffix}.xlsx', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

suffix = "reas"

# Inputs: ground-truth labels and the training split written by the previous cell.
anno_df = pd.read_excel('../wound_groundtruth.xlsx')
desc_df = pd.read_excel('train_dataset_reas.xlsx')

# Normalise the join keys by removing the '.jpg' extension.
for frame, name_col in ((anno_df, 'Image Name'), (desc_df, 'video_name')):
    frame[name_col] = frame[name_col].str.replace('.jpg', '', regex=False)

# Re-attach the labels (the saved training split no longer carries them).
merged_df = anno_df.merge(desc_df, left_on='Image Name', right_on='video_name')

# Separate copy of the label used purely as the stratification target.
merged_df['strat_label'] = merged_df['Label']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the training part of every fold is written; the validation indices
# are not used here.
helper_cols = ['Label', 'Image Name', 'strat_label']
for fold, (train_idx, _) in enumerate(skf.split(merged_df, merged_df['strat_label'])):
    train_fold = (
        merged_df.iloc[train_idx]
        .reset_index(drop=True)
        .drop(columns=helper_cols)
    )
    train_fold.to_excel(f'train_dataset_fold{fold}_{suffix}.xlsx', index=False)

print("✅ 5-fold training splits saved.")


In [None]:
import os  # needed for os.makedirs below; previously only available via state leaked from a later cell

import pandas as pd

# Load the Excel file.
# embedded_only_video_anomalies_reas.xlsx is obtained from
# embedded_only_video_anomalies.xlsx by keeping only the columns that concern
# the reasoning embeddings.
df = pd.read_excel("embedded_only_video_anomalies_reas.xlsx")

# Identify anomaly and reasoning columns by substring match on the header.
# NOTE(review): the two lists are paired by position below, so this assumes the
# workbook lists the models in the same order in both column groups — confirm.
anomaly_cols = [col for col in df.columns if 'anomaly' in col.lower()]
reas_cols = [col for col in df.columns if 'reas' in col.lower()]

# List of anomaly categories
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

os.makedirs("intermediate data", exist_ok=True)


# Process each category: write one workbook per category in which a model's
# reasoning cell is kept only on rows where that model predicted the category.
for category in categories:
    df_category = df.copy()
    for i in range(5):  # the first five anomaly/reasoning column pairs
        anomaly_col = anomaly_cols[i]
        reas_col = reas_cols[i]

        # Keep only the rows where this model predicted the current category
        mask = df[anomaly_col] == category
        df_category.loc[~mask, [reas_col]] = None  # Remove reasoning if not predicted as this category

    # Save to Excel
    filename = f"intermediate data/video_outputs_anomaly_{category.replace(' ', '_')}.xlsx"
    df_category.to_excel(filename, index=False)


In [None]:
import pandas as pd
import numpy as np
import ast
import re
import os

# Anomaly categories processed by this cell. Each name (with spaces replaced by
# underscores) selects the per-category workbook read from "intermediate data/",
# and is compared against the values in the '<model>-anomaly' columns.
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

# Function to safely parse embedding strings with missing commas
def safe_parse_emb(emb_str):
    """Parse a stringified embedding into a NumPy array.

    The text may separate numbers with whitespace only (no commas); a comma is
    inserted into every whitespace gap that sits between two number-like
    tokens before the text is evaluated as a Python literal.

    Parameters
    ----------
    emb_str : object
        Cell value to parse. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    numpy.ndarray or float
        The parsed array, or ``np.nan`` when the input is not a string or
        cannot be parsed.
    """
    if not isinstance(emb_str, str):
        return np.nan

    # Lookbehind/lookahead leave the neighbouring characters unconsumed, so
    # consecutive one-character numbers ("1 2 3") each get their comma; a
    # consuming pattern would skip every second gap.
    emb_str_clean = re.sub(r'(?<=[0-9e.+\-])\s+(?=[\-0-9])', ',', emb_str)
    try:
        return np.array(ast.literal_eval(emb_str_clean))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of ast.literal_eval on malformed input;
        # interrupts and unrelated errors are deliberately not swallowed.
        return np.nan

# Convert numpy array to string: space-separated, square brackets, no commas
def emb_to_str(emb):
    """Render an embedding as '[v0 v1 ...]': 8 decimals, space-separated, no commas."""
    formatted = [format(value, ".8f") for value in emb]
    return "[" + " ".join(formatted) + "]"

# Models whose '<model>-anomaly' / '<model>-reas_emb' columns are processed.
models = ["claude3.5sonnet", "claude3.5", "claude3.7", "gpt4o-mini", "gpt4o"]

# For every category: fill the reasoning embedding of models that did NOT
# predict the category with the mean embedding of the models that did.
for category in categories:
    print(f"Processing category: {category}")

    file_path = f"intermediate data/video_outputs_anomaly_{category.replace(' ', '_')}.xlsx"
    df = pd.read_excel(file_path)
    imputed_rows = []

    for _, row in df.iterrows():
        video_result = row.copy()

        # Embeddings from the models that agree with this category and whose
        # embedding string could be parsed.
        reas_embs = []
        for model in models:
            if row.get(f'{model}-anomaly', '') != category:
                continue
            parsed = safe_parse_emb(row.get(f'{model}-reas_emb', ''))
            if isinstance(parsed, np.ndarray):
                reas_embs.append(parsed)

        # Element-wise mean of the agreeing embeddings; None when nobody agreed.
        if reas_embs:
            reas_mean = np.mean(reas_embs, axis=0)
        else:
            reas_mean = None

        # Overwrite the embedding of every disagreeing model with that mean.
        if reas_mean is not None:
            for model in models:
                if row.get(f'{model}-anomaly', '') != category:
                    video_result[f'{model}-reas_emb'] = emb_to_str(reas_mean)

        imputed_rows.append(video_result)

    # Final DataFrame
    imputed_df = pd.DataFrame(imputed_rows)

    # The prediction columns are no longer needed downstream.
    anomaly_present = [
        f'{model}-anomaly' for model in models if f'{model}-anomaly' in imputed_df.columns
    ]
    imputed_df = imputed_df.drop(columns=anomaly_present)

    # Save result
    output_path = f"intermediate data/video_outputs_anomaly_{category.replace(' ', '_')}_imputed_clean.xlsx"
    imputed_df.to_excel(output_path, index=False)
    print(f"Saved cleaned, imputed file to: {output_path}")


In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.metrics.pairwise import cosine_similarity

# Anomaly categories; each one (spaces replaced by underscores) names the
# imputed workbook read from, and the similarity workbook written to,
# "intermediate data/".
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

# Model names; used to build the '<model>-reas_emb' column names and to
# enumerate every unordered model pair when computing similarities.
model_names = ["claude3.5sonnet", "claude3.5", "claude3.7", "gpt4o-mini", "gpt4o"]

# Parse stringified NumPy arrays into Python lists
def parse_embedding(x):
    """Parse a whitespace-separated, bracketed embedding string into a list of floats.

    Parameters
    ----------
    x : object
        Cell value. Strings such as ``"[0.1 0.2 0.3]"`` are parsed; any other
        value (e.g. a missing-value NaN) is returned unchanged.

    Returns
    -------
    list of float, float, or object
        The parsed numbers, ``np.nan`` if any token is not a valid float, or
        ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    try:
        # Explicit float() per token: malformed text always yields NaN instead
        # of a silently truncated vector (np.fromstring's text-mode behaviour
        # on bad input depends on the NumPy version).
        return [float(token) for token in x.strip().strip("[]").split()]
    except ValueError:
        return np.nan

# Compute pairwise cosine similarities for a given DataFrame and prefix
def compute_similarities(df, prefix):
    """Compute the cosine similarity between every pair of models, row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'video_name' and, for each model in ``model_names``, a
        column '<model>-<prefix>_emb' holding one embedding (list of floats)
        per row.
    prefix : str
        Embedding kind used in the column names (e.g. "reas").

    Returns
    -------
    pandas.DataFrame
        'video_name' plus one '<model1>_vs_<model2>_<prefix>_sim' column per
        unordered model pair. A row whose similarity cannot be computed
        (missing or malformed embedding) gets NaN.
    """
    similarities = pd.DataFrame()
    similarities["video_name"] = df["video_name"]
    for model1, model2 in itertools.combinations(model_names, 2):
        col1 = f"{model1}-{prefix}_emb"
        col2 = f"{model2}-{prefix}_emb"
        sim_col = f"{model1}_vs_{model2}_{prefix}_sim"
        sim_values = []
        for v1, v2 in zip(df[col1], df[col2]):
            try:
                sim = cosine_similarity([v1], [v2])[0][0]
            except Exception:
                # Best effort: an unusable pair becomes NaN. `Exception` (not a
                # bare except) so Ctrl-C / SystemExit still stop the loop.
                sim = np.nan
            sim_values.append(sim)
        similarities[sim_col] = sim_values
    return similarities

# Build one pairwise reasoning-similarity workbook per category.
for category in categories:
    print(f"Processing category: {category}")
    cat_file = f"intermediate data/video_outputs_anomaly_{category.replace(' ', '_')}_imputed_clean.xlsx"
    df = pd.read_excel(cat_file)

    # Turn the stored embedding strings back into lists of floats, for every
    # model column that exists in this workbook.
    present_emb_cols = [
        col for col in (f"{model}-reas_emb" for model in model_names) if col in df.columns
    ]
    for col in present_emb_cols:
        df[col] = df[col].apply(parse_embedding)

    # Pairwise cosine similarities between the models' reasoning embeddings.
    reas_sim = compute_similarities(df, "reas")

    # Save result
    output_path = f"intermediate data/video_anomaly_similarities_output_reasoningUS_anomaly_{category.replace(' ', '_')}_reas.xlsx"
    reas_sim.to_excel(output_path, index=False)
    print(f"Saved: {output_path}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Categories double as (a) the per-category similarity workbooks to process
# and (b) the ground-truth labels used to stratify the split.
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

# Ground truth, with the '.jpg' extension stripped from the image names.
anno_df = pd.read_excel('../wound_groundtruth.xlsx')
anno_df['Image Name'] = anno_df['Image Name'].str.replace('.jpg', '', regex=False)

drop_cols = ['Label', 'Image Name']

for category in categories:
    suffix = f"reasoningUS_anomaly_{category.replace(' ', '_')}_reas"

    # Similarity workbook produced for this category.
    sim_file = f"intermediate data/video_anomaly_similarities_output_{suffix}.xlsx"
    desc_df = pd.read_excel(sim_file)
    desc_df['video_name'] = desc_df['video_name'].str.replace('.jpg', '', regex=False)

    # Attach the ground-truth label to every similarity row (inner join).
    merged_df = anno_df.merge(desc_df, left_on='Image Name', right_on='video_name')

    # 80/20 split inside each ground-truth label, fixed seed.
    train_list, test_list = [], []
    for label in ('Abrasions', 'Ingrown Nails', 'Stab Wound', 'Bruises', 'Cut', 'Laceration', 'Burns'):
        subset = merged_df.loc[merged_df['Label'] == label]
        train, test = train_test_split(subset, test_size=0.2, random_state=42)
        train_list.append(train)
        test_list.append(test)

    # Recombine and strip the ground-truth columns.
    train_df = pd.concat(train_list).reset_index(drop=True).drop(columns=drop_cols)
    test_df = pd.concat(test_list).reset_index(drop=True).drop(columns=drop_cols)

    # Save results
    train_df.to_excel(f'intermediate data/train_dataset_{suffix}.xlsx', index=False)
    test_df.to_excel(f'intermediate data/test_dataset_{suffix}.xlsx', index=False)
    
    


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

suffix = "reas"

# Ground truth; the name clean-up does not depend on the category, so it is
# done once here instead of on every loop iteration.
anno_df = pd.read_excel('../wound_groundtruth.xlsx')
anno_df['Image Name'] = anno_df['Image Name'].str.replace('.jpg', '', regex=False)

for cat_name in ['Abrasions', 'Ingrown_Nails', 'Stab_Wound', 'Bruises', 'Cut', 'Laceration', 'Burns']:

    # Per-category training split written by the previous cell.
    desc_df = pd.read_excel(f'intermediate data/train_dataset_reasoningUS_anomaly_{cat_name}_reas.xlsx')
    desc_df['video_name'] = desc_df['video_name'].str.replace('.jpg', '', regex=False)

    # Re-attach the ground-truth label (inner join on the cleaned names).
    merged_df = pd.merge(anno_df, desc_df, left_on='Image Name', right_on='video_name')

    # Separate copy of the label used only as the stratification target.
    merged_df['strat_label'] = merged_df['Label']

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Save the training part of each fold; validation indices are unused here.
    for fold, (train_idx, _) in enumerate(skf.split(merged_df, merged_df['strat_label'])):
        train_fold = (
            merged_df.iloc[train_idx]
            .reset_index(drop=True)
            .drop(columns=['Label', 'Image Name', 'strat_label'])
        )

        # Written under "intermediate data/" because that is where the later
        # cells look for 'train_dataset_fold{fold}_reas_{cat_name}.xlsx';
        # writing to the working directory left those reads without a file.
        train_fold.to_excel(
            f'intermediate data/train_dataset_fold{fold}_{suffix}_{cat_name}.xlsx', index=False
        )

    print("✅ 5-fold training splits saved.")


In [None]:
import numpy as np
import pandas as pd

class PMF:
    """Matrix factorisation R ≈ U Vᵀ trained by SGD with L2 regularisation.

    Attributes
    ----------
    U : numpy.ndarray, shape (num_users, num_factors)
        Row ("user") latent factors.
    V : numpy.ndarray, shape (num_items, num_factors)
        Column ("item") latent factors.
    """

    def __init__(self, num_users, num_items, num_factors, learning_rate=0.01, reg_param=0.01):
        """Store the hyper-parameters and draw random initial factors.

        U and V are drawn from a zero-mean normal with standard deviation
        ``1 / num_factors`` using NumPy's global random state, so every new
        instance (e.g. each grid-search candidate) starts from fresh weights.
        """
        self.num_users = num_users
        self.num_items = num_items
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.reg_param = reg_param
        self.U = np.random.normal(scale=1./self.num_factors, size=(self.num_users, self.num_factors))
        self.V = np.random.normal(scale=1./self.num_factors, size=(self.num_items, self.num_factors))

    def train(self, train_ratings, val_ratings, num_epochs=100, patience=5):
        """Fit U and V by SGD with early stopping on the validation RMSE.

        Parameters
        ----------
        train_ratings, val_ratings : sequence of [i, j, rating]
            Observed entries; ``i`` indexes U, ``j`` indexes V.
        num_epochs : int
            Maximum number of passes over ``train_ratings``.
        patience : int
            Stop once this many consecutive epochs fail to improve the best
            validation RMSE.

        Returns
        -------
        float
            Best validation RMSE seen (``inf`` if no epoch produced a finite,
            improving value). On return ``self.U`` / ``self.V`` always hold the
            weights that achieved it, whether or not early stopping fired.
        """
        # Private copy: the per-epoch shuffle must not reorder the caller's
        # array, which is typically reused across grid-search candidates.
        train_ratings = np.array(train_ratings, dtype=float)

        best_val_rmse = float('inf')
        # Start from the initial weights so there is always something to
        # restore, even if the validation RMSE is never finite.
        best_U, best_V = self.U.copy(), self.V.copy()
        epochs_without_improvement = 0

        for epoch in range(num_epochs):
            # Shuffle training data for each epoch
            np.random.shuffle(train_ratings)

            for i, j, r_ij in train_ratings:
                i, j = int(i), int(j)
                error = r_ij - self.predict(i, j)

                # Snapshot both rows so each update uses the pre-update value
                # of the other.
                u_i = self.U[i, :].copy()
                v_j = self.V[j, :].copy()

                self.U[i, :] += self.learning_rate * (error * v_j - self.reg_param * u_i)
                self.V[j, :] += self.learning_rate * (error * u_i - self.reg_param * v_j)

            # --- Validation phase ---
            val_rmse = self.compute_rmse(val_ratings)
            print(f'Epoch: {epoch+1}, Validation RMSE: {val_rmse:.4f}')

            # --- Early stopping ---
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                epochs_without_improvement = 0
                best_U, best_V = self.U.copy(), self.V.copy()
            else:
                # Also reached when val_rmse is NaN (any comparison is False).
                epochs_without_improvement += 1

            if epochs_without_improvement >= patience:
                print(f"Stopping early after {epoch+1} epochs.")
                break

        # Restore the best weights in every case so the model matches the
        # RMSE that is returned.
        self.U, self.V = best_U, best_V
        return best_val_rmse

    def predict(self, i, j):
        """Return the predicted rating for entry (i, j): the dot product U[i] · V[j]."""
        return np.dot(self.U[i, :], self.V[j, :])

    def compute_rmse(self, ratings_data):
        """Return the RMSE of the predictions over ``ratings_data``.

        ``ratings_data`` is a sequence of [i, j, rating]; an empty sequence
        yields 0.0.
        """
        if len(ratings_data) == 0:
            return 0.0

        data = np.asarray(ratings_data, dtype=float)
        rows = data[:, 0].astype(int)
        cols = data[:, 1].astype(int)
        # Row-wise dot products U[rows[k]] · V[cols[k]] in one vectorised call.
        predictions = np.einsum('ij,ij->i', self.U[rows], self.V[cols])
        return np.sqrt(np.mean((data[:, 2] - predictions) ** 2))

    def full_matrix(self):
        """Return the full reconstructed matrix U Vᵀ, shape (num_users, num_items)."""
        return np.dot(self.U, self.V.T)

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# --- Main pipeline: per fold, grid-search a PMF model, keep its item factors V,
# --- then project every test row onto the column space of V.
data_dir = "./"  # change if needed
os.makedirs("low list", exist_ok=True)

for fold in range(5):
    suffix = f"fold{fold}_reas"
    train_file = os.path.join(data_dir, f"train_dataset_{suffix}.xlsx")
    test_file = os.path.join(data_dir, "test_dataset_reas.xlsx")

    # Numeric part of the training fold; missing cells become 0 and are then
    # excluded, because only non-zero entries are turned into ratings.
    df = pd.read_excel(train_file, sheet_name="Sheet1")
    R = df.select_dtypes(include=[np.number]).fillna(0).values
    xs, ys = np.nonzero(R)
    ratings = np.column_stack((xs, ys, R[xs, ys]))

    # Seeded shuffle followed by an 80/20 train/validation split of the entries.
    np.random.seed(42)
    np.random.shuffle(ratings)
    split_index = int(0.8 * len(ratings))
    train_ratings, val_ratings = ratings[:split_index], ratings[split_index:]
    num_users, num_items = R.shape

    param_grid = {
        'num_factors': [5, 10, 15],
        'learning_rate': [0.005, 0.01, 0.05],
        'reg_param': [0.01, 0.1, 0.5]
    }

    # Exhaustive search; candidates are visited factors -> learning rate -> reg.
    candidates = [
        (f, lr, reg)
        for f in param_grid['num_factors']
        for lr in param_grid['learning_rate']
        for reg in param_grid['reg_param']
    ]
    best_rmse = float('inf')
    for f, lr, reg in candidates:
        pmf = PMF(num_users, num_items, f, lr, reg)
        rmse = pmf.train(train_ratings, val_ratings, num_epochs=100, patience=5)
        if rmse < best_rmse:
            best_rmse, best_model = rmse, pmf

    # Persist the item-factor matrix of the winning model.
    V = best_model.V
    V_df = pd.DataFrame(V)
    V_df.to_excel(f"best_V_matrix_{suffix}.xlsx", index=False)

    # Test rows: numeric columns only, NaNs replaced by the column mean.
    test_df = pd.read_excel(test_file)
    test_numeric = (
        test_df.select_dtypes(include=[np.number])
        .apply(lambda col: col.fillna(col.mean()), axis=0)
        .values
    )
    video_names = test_df['video_name'].astype(str).values

    # For each row s, solve min_c ||s - V c||^2 and record c and the fit MSE.
    mse_list, c_list = [], []
    for s in test_numeric:
        c = np.linalg.lstsq(V, s, rcond=None)[0]
        mse_list.append(mean_squared_error(s, V @ c))
        c_list.append(c)

    coef_names = [f'c{i}' for i in range(V.shape[1])]
    result_df = pd.DataFrame(c_list, columns=coef_names)
    result_df['MSE'] = mse_list
    result_df.insert(0, 'video_name', video_names)
    result_df.to_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}.xlsx', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error



# Project each category's test set onto the V matrix learned for every fold.
for cat_name in ['Abrasions', 'Ingrown_Nails', 'Stab_Wound', 'Bruises', 'Cut', 'Laceration', 'Burns']:

    for fold in range(5):
        suffix = f"fold{fold}_reas"

        # Inputs: the fold's factor matrix and the category's test split.
        V_df = pd.read_excel(f'best_V_matrix_{suffix}.xlsx')
        test_df = pd.read_excel(f'intermediate data/test_dataset_reasoningUS_anomaly_{cat_name}_reas.xlsx')

        V = V_df.to_numpy()

        # Numeric columns only, with NaNs replaced by the column mean.
        numeric_part = test_df.select_dtypes(include=[np.number])
        test_numeric_df = numeric_part.apply(lambda col: col.fillna(col.mean()), axis=0)
        S_numeric = test_numeric_df.to_numpy()

        # Requires a 'video_name' column in the test split.
        video_names = test_df['video_name'].values

        # Least squares per row: c = argmin ||s - V c||^2, then the fit MSE.
        c_list = [np.linalg.lstsq(V, s, rcond=None)[0] for s in S_numeric]
        mse_list = [mean_squared_error(s, V @ c) for s, c in zip(S_numeric, c_list)]

        # One output row per image: name, coefficients c0..ck, MSE.
        coef_names = [f'c{i}' for i in range(V.shape[1])]
        result_df = pd.DataFrame(c_list, columns=coef_names)
        result_df['MSE'] = mse_list
        result_df.insert(0, 'video_name', video_names)

        result_df.to_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}_{cat_name}.xlsx', index=False)
    


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Project each training fold onto the V matrix learned from that same fold.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Inputs: the fold's factor matrix and the fold's own training rows
    # (kept under the name `test_df` as in the sibling cells).
    V_df = pd.read_excel(f'best_V_matrix_{suffix}.xlsx')
    test_df = pd.read_excel(f'train_dataset_{suffix}.xlsx')

    V = V_df.to_numpy()

    # Numeric columns only, with NaNs replaced by the column mean.
    numeric_part = test_df.select_dtypes(include=[np.number])
    test_numeric_df = numeric_part.apply(lambda col: col.fillna(col.mean()), axis=0)
    S_numeric = test_numeric_df.to_numpy()

    # Requires a 'video_name' column in the workbook.
    video_names = test_df['video_name'].values

    # Least squares per row: c = argmin ||s - V c||^2, then the fit MSE.
    c_list = [np.linalg.lstsq(V, s, rcond=None)[0] for s in S_numeric]
    mse_list = [mean_squared_error(s, V @ c) for s, c in zip(S_numeric, c_list)]

    # One output row per image: name, coefficients c0..ck, MSE.
    coef_names = [f'c{i}' for i in range(V.shape[1])]
    result_df = pd.DataFrame(c_list, columns=coef_names)
    result_df['MSE'] = mse_list
    result_df.insert(0, 'video_name', video_names)

    result_df.to_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}_train.xlsx', index=False)



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error



# Project each category's training fold onto the V matrix of the same fold.
for cat_name in ['Abrasions', 'Ingrown_Nails', 'Stab_Wound', 'Bruises', 'Cut', 'Laceration', 'Burns']:

    for fold in range(5):
        suffix = f"fold{fold}_reas"

        # Inputs: the fold's factor matrix and the category's training fold
        # (kept under the name `test_df` as in the sibling cells).
        V_df = pd.read_excel(f'best_V_matrix_{suffix}.xlsx')
        test_df = pd.read_excel(f'intermediate data/train_dataset_{suffix}_{cat_name}.xlsx')

        V = V_df.to_numpy()

        # Numeric columns only, with NaNs replaced by the column mean.
        numeric_part = test_df.select_dtypes(include=[np.number])
        test_numeric_df = numeric_part.apply(lambda col: col.fillna(col.mean()), axis=0)
        S_numeric = test_numeric_df.to_numpy()

        # Requires a 'video_name' column in the workbook.
        video_names = test_df['video_name'].values

        # Least squares per row: c = argmin ||s - V c||^2, then the fit MSE.
        c_list = [np.linalg.lstsq(V, s, rcond=None)[0] for s in S_numeric]
        mse_list = [mean_squared_error(s, V @ c) for s, c in zip(S_numeric, c_list)]

        # One output row per image: name, coefficients c0..ck, MSE.
        coef_names = [f'c{i}' for i in range(V.shape[1])]
        result_df = pd.DataFrame(c_list, columns=coef_names)
        result_df['MSE'] = mse_list
        result_df.insert(0, 'video_name', video_names)

        result_df.to_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}_train_{cat_name}.xlsx', index=False)
    


In [None]:
import pandas as pd

# Inputs: one per-category test split (used only for its list of image names)
# and the workbook holding every model's predicted category.
test_df = pd.read_excel('intermediate data/test_dataset_reasoningUS_anomaly_Cut_reas.xlsx')
anomalies_df = pd.read_excel('embedded_only_video_anomalies_reas.xlsx')

# Strip the '.jpg' extension so both workbooks use the same image names.
for frame in (test_df, anomalies_df):
    frame['video_name'] = frame['video_name'].str.replace('.jpg', '', regex=False)

# One '<model>-anomaly' column per model.
anomaly_cols = [col for col in anomalies_df.columns if col.endswith('-anomaly')]

# Long format: one row per (image, model) with the predicted category.
anomaly_data = anomalies_df[['video_name'] + anomaly_cols]
melted = pd.melt(
    anomaly_data,
    id_vars='video_name',
    value_vars=anomaly_cols,
    var_name='model',
    value_name='anomaly',
)

# Keep only images that belong to the test split.
in_test = melted['video_name'].isin(test_df['video_name'])
filtered_melted = melted.loc[in_test]

# Per image: share of models that predicted each category (0 where none did).
shares = filtered_melted.groupby('video_name')['anomaly'].value_counts(normalize=True)
summary = shares.unstack(fill_value=0)

# Prefix every category column with 'percentage_'.
summary = summary.rename(
    columns=lambda col: col if col == 'video_name' else f'percentage_{col}'
).reset_index()

# Save result to Excel
summary.to_excel('anomaly_percentage_summary.xlsx', index=False)


In [None]:
import pandas as pd



# Same prediction-share summary as for the test split, once per training fold.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Inputs: the fold's 'Cut' training workbook (used only for its image
    # names; kept under the name `test_df`) and every model's predictions.
    test_df = pd.read_excel(f'intermediate data/train_dataset_{suffix}_Cut.xlsx')
    anomalies_df = pd.read_excel('embedded_only_video_anomalies_reas.xlsx')

    # Strip the '.jpg' extension so both workbooks use the same image names.
    for frame in (test_df, anomalies_df):
        frame['video_name'] = frame['video_name'].str.replace('.jpg', '', regex=False)

    # One '<model>-anomaly' column per model.
    anomaly_cols = [col for col in anomalies_df.columns if col.endswith('-anomaly')]

    # Long format: one row per (image, model) with the predicted category.
    anomaly_data = anomalies_df[['video_name'] + anomaly_cols]
    melted = pd.melt(
        anomaly_data,
        id_vars='video_name',
        value_vars=anomaly_cols,
        var_name='model',
        value_name='anomaly',
    )

    # Keep only images that belong to this training fold.
    in_fold = melted['video_name'].isin(test_df['video_name'])
    filtered_melted = melted.loc[in_fold]

    # Per image: share of models that predicted each category (0 where none did).
    shares = filtered_melted.groupby('video_name')['anomaly'].value_counts(normalize=True)
    summary = shares.unstack(fill_value=0)

    # Prefix every category column with 'percentage_'.
    summary = summary.rename(
        columns=lambda col: col if col == 'video_name' else f'percentage_{col}'
    ).reset_index()

    # Save result to Excel
    summary.to_excel(f'anomaly_percentage_summary_{suffix}_train.xlsx', index=False)


In [None]:
import pandas as pd

# The 7 categories; the spaced form matches the 'percentage_<category>' columns,
# the underscored form matches the per-category MSE file names.
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

# The test-set summary is the same for every fold, so load and clean it once.
summary_df = pd.read_excel('anomaly_percentage_summary.xlsx')
summary_df['video_name'] = summary_df['video_name'].str.replace('.jpg', '', regex=False)

for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Start from the per-image prediction shares.
    merged = summary_df.copy()

    # Attach one 'MSE_difference_<cat_key>' column per category.
    for category in categories:
        cat_key = category.replace(' ', '_')

        mse_file = f'optimal_c_and_mse_lstsq_with_name_{suffix}_{cat_key}.xlsx'

        mse_df = pd.read_excel(mse_file)
        mse_df['video_name'] = mse_df['video_name'].str.replace('.jpg', '', regex=False)
        per_category = mse_df[['video_name', 'MSE']].rename(columns={'MSE': f'MSE_difference_{cat_key}'})
        merged = merged.merge(per_category, on='video_name', how='left')

    # weighted_MSE = sum over categories of share(category) * MSE(category).
    merged['weighted_MSE'] = 0.0
    for category in categories:
        cat_key = category.replace(' ', '_')
        pct_col = f'percentage_{category}'
        mse_col = f'MSE_difference_{cat_key}'

        if pct_col not in merged.columns:
            # The summary only has a column for categories that at least one
            # model predicted for at least one image; a category nobody
            # predicted has share 0 everywhere and contributes nothing.
            print(f"[Warning] Column '{pct_col}' not found in summary; treating its share as 0.")
            continue

        merged['weighted_MSE'] += merged[pct_col] * merged[mse_col]

    # Save result
    merged[['video_name', 'weighted_MSE']].to_excel(f'weighted_mse_summary_{suffix}.xlsx', index=False)


In [None]:
import pandas as pd

# The 7 categories; the spaced form matches the 'percentage_<category>' columns,
# the underscored form matches the per-category MSE file names.
categories = ["Abrasions", "Ingrown Nails", "Stab Wound", "Bruises", "Cut", "Laceration", "Burns"]

for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Per-image prediction shares for this training fold.
    summary_df = pd.read_excel(f'anomaly_percentage_summary_{suffix}_train.xlsx')
    summary_df['video_name'] = summary_df['video_name'].str.replace('.jpg', '', regex=False)

    merged = summary_df.copy()

    # Attach one 'MSE_difference_<cat_key>' column per category.
    for category in categories:
        cat_key = category.replace(' ', '_')

        mse_file = f'optimal_c_and_mse_lstsq_with_name_{suffix}_train_{cat_key}.xlsx'

        mse_df = pd.read_excel(mse_file)
        mse_df['video_name'] = mse_df['video_name'].str.replace('.jpg', '', regex=False)
        per_category = mse_df[['video_name', 'MSE']].rename(columns={'MSE': f'MSE_difference_{cat_key}'})
        merged = merged.merge(per_category, on='video_name', how='left')

    # weighted_MSE = sum over categories of share(category) * MSE(category).
    merged['weighted_MSE'] = 0.0
    for category in categories:
        cat_key = category.replace(' ', '_')
        pct_col = f'percentage_{category}'
        mse_col = f'MSE_difference_{cat_key}'

        if pct_col not in merged.columns:
            # The summary only has a column for categories that at least one
            # model predicted for at least one image; a category nobody
            # predicted has share 0 everywhere and contributes nothing.
            print(f"[Warning] Column '{pct_col}' not found in summary; treating its share as 0.")
            continue

        merged['weighted_MSE'] += merged[pct_col] * merged[mse_col]

    # Save result
    merged[['video_name', 'weighted_MSE']].to_excel(f'weighted_mse_summary_{suffix}_train.xlsx', index=False)


In [None]:
import pandas as pd

# Per fold: uncertainty score = weighted per-category MSE minus the plain MSE.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Inputs: the weighted MSE per image and the plain least-squares fit per image.
    df_desc = pd.read_excel(f'weighted_mse_summary_{suffix}.xlsx')
    df_desc_reas = pd.read_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}.xlsx')

    # Drop a '.mp4' suffix from the names if one is present.
    for frame in (df_desc, df_desc_reas):
        frame['video_name'] = frame['video_name'].str.replace('.mp4', '', regex=False)

    # Inner join on the image name.
    merged_df = df_desc.merge(df_desc_reas, on='video_name')

    merged_df['MSE_difference'] = merged_df['weighted_MSE'] - merged_df['MSE']

    # Output result
    output_cols = ['video_name', 'MSE_difference']
    merged_df[output_cols].to_excel(f'mse_difference_reasoningUS_{suffix}.xlsx', index=False)


In [None]:
import pandas as pd

# Per fold (training rows): weighted per-category MSE minus the plain MSE.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Inputs: the weighted MSE per image and the plain least-squares fit per image.
    df_desc = pd.read_excel(f'weighted_mse_summary_{suffix}_train.xlsx')
    df_desc_reas = pd.read_excel(f'optimal_c_and_mse_lstsq_with_name_{suffix}_train.xlsx')

    # Drop a '.mp4' suffix from the names if one is present.
    for frame in (df_desc, df_desc_reas):
        frame['video_name'] = frame['video_name'].str.replace('.mp4', '', regex=False)

    # Inner join on the image name.
    merged_df = df_desc.merge(df_desc_reas, on='video_name')

    merged_df['MSE_difference'] = merged_df['weighted_MSE'] - merged_df['MSE']

    # Output result
    output_cols = ['video_name', 'MSE_difference']
    merged_df[output_cols].to_excel(f'mse_difference_reasoningUS_{suffix}_train.xlsx', index=False)


In [None]:
import pandas as pd
import numpy as np



# Per fold: write the low-uncertainty image lists for several cut-off levels.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    df = pd.read_excel(f'mse_difference_reasoningUS_{suffix}.xlsx')

    video_names = df["video_name"].astype(str)
    uncertainty_scores = df["MSE_difference"].values

    # Percent of videos to exclude as high-uncertainty
    P_set = [5, 10, 15, 20, 25, 30, 35, 40]

    for P in P_set:
        # Threshold at the (100 - P)-th percentile; everything at or below it
        # is kept as low-uncertainty.
        keep_percentile = 100 - P
        tau = np.percentile(uncertainty_scores, keep_percentile)
        mask_low = uncertainty_scores <= tau
        S_low_videos = video_names[mask_low]

        out_df = pd.DataFrame({"video_name": S_low_videos.values})
        out_df["uncertainty"] = uncertainty_scores[mask_low]

        out_df.to_excel(f"low list/S_low_videos_{P}_{suffix}.xlsx", index=False)
        print(f"Threshold τ = {tau:.4f}")
        print(f"Kept {len(S_low_videos)}/{len(video_names)} videos in S_low.")


In [None]:
import pandas as pd
import numpy as np



# Per fold (training rows): write the low-uncertainty image lists for several
# cut-off levels.
for fold in range(5):
    suffix = f"fold{fold}_reas"

    df = pd.read_excel(f'mse_difference_reasoningUS_{suffix}_train.xlsx')

    video_names = df["video_name"].astype(str)
    uncertainty_scores = df["MSE_difference"].values

    # Percent of videos to exclude as high-uncertainty
    P_set = [5, 10, 15, 20, 25, 30, 35, 40]

    for P in P_set:
        # Threshold at the (100 - P)-th percentile; everything at or below it
        # is kept as low-uncertainty.
        keep_percentile = 100 - P
        tau = np.percentile(uncertainty_scores, keep_percentile)
        mask_low = uncertainty_scores <= tau
        S_low_videos = video_names[mask_low]

        out_df = pd.DataFrame({"video_name": S_low_videos.values})
        out_df["uncertainty"] = uncertainty_scores[mask_low]

        out_df.to_excel(f"low list/S_low_videos_{P}_{suffix}_train.xlsx", index=False)
        print(f"Threshold τ = {tau:.4f}")
        print(f"Kept {len(S_low_videos)}/{len(video_names)} videos in S_low.")


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model names
model_names = [
    "claude3.5sonnet",
    "claude3.5",
    "claude3.7",
    "gpt4o-mini",
    "gpt4o"
]
# P values representing percent of videos excluded as high-uncertainty
P_set = [5, 10, 15, 20, 25, 30, 35, 40]

# Extensions stripped from every name column. The low-uncertainty lists, the
# prediction CSVs and the ground truth must all be normalised the same way,
# otherwise names that differ only by extension never match.
name_extensions = ['.jpg', '.mp4']

# Columns of the per-fold summary, fixed up front so that a fold in which no P
# could be evaluated still produces a file with the expected header.
summary_columns = ['P', 'Num Videos', 'Accuracy', 'Precision', 'Recall', 'F1-Score']

# Predictions depend neither on the fold nor on P, so each CSV is read once.
# A model whose file cannot be processed is reported and left out of the vote.
model_preds = {}
for model in model_names:
    try:
        df = pd.read_csv(f'../model_predictions/vad_results_{model}.csv')
        for ext in name_extensions:
            df['Video Name'] = df['Video Name'].str.replace(ext, '', regex=False)
        model_preds[model] = df.set_index('Video Name')['Predicted Label']
    except Exception as e:
        print(f"[Error] Failed to process {model}: {e}")

# Ground truth is the 'True Label' column of the claude3.5sonnet results file.
gt_df = pd.read_csv('../model_predictions/vad_results_claude3.5sonnet.csv')
for ext in name_extensions:
    gt_df['Video Name'] = gt_df['Video Name'].str.replace(ext, '', regex=False)
ground_truth = gt_df.set_index('Video Name')['True Label']


for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Store results for each P
    all_results = []

    for P in P_set:
        low_path = f'low list/S_low_videos_{P}_{suffix}.xlsx'
        try:
            low = pd.read_excel(low_path)
        except Exception as e:
            # Report the path that was actually attempted.
            print(f"[Error] Failed to load {low_path}: {e}")
            continue

        low_list = low['video_name']
        for ext in name_extensions:
            low_list = low_list.str.replace(ext, '', regex=False)

        # One column per successfully loaded model, one row per low-uncertainty video
        merged = pd.DataFrame(index=low_list)
        for model, preds in model_preds.items():
            merged[model] = preds.reindex(low_list)

        # Drop rows with missing predictions
        merged = merged.dropna()

        if merged.empty:
            print(f"[Warning] No valid data for P={P}, skipping.")
            continue

        # Majority voting: mode() sorts tied labels, so column 0 is the smallest
        # of the most frequent labels in each row.
        majority_vote = merged.mode(axis=1)[0]

        # Align ground truth
        y_true = ground_truth.loc[merged.index]

        # Compute metrics
        acc = accuracy_score(y_true, majority_vote)
        prec = precision_score(y_true, majority_vote, average='macro', zero_division=0)
        rec = recall_score(y_true, majority_vote, average='macro', zero_division=0)
        f1 = f1_score(y_true, majority_vote, average='macro', zero_division=0)

        all_results.append({
            'P': P,
            'Num Videos': len(merged),
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1-Score': f1
        })

        print(f"[P={P}] Processed {len(merged)} videos | Accuracy: {acc:.4f}, F1-Score: {f1:.4f}")

    # Save final results to one Excel file per fold
    summary_path = f'low list/VAD_Majority_Voting_Summary_{suffix}.xlsx'
    result_df = pd.DataFrame(all_results, columns=summary_columns)
    result_df.to_excel(summary_path, index=False)
    print(f"[Saved] {summary_path}")


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model names
model_names = [
    "claude3.5sonnet",
    "claude3.5",
    "claude3.7",
    "gpt4o-mini",
    "gpt4o"
]
# P values representing percent of videos excluded as high-uncertainty
P_set = [5, 10, 15, 20, 25, 30, 35, 40]

# Extensions stripped from every name column. The low-uncertainty lists, the
# prediction CSVs and the ground truth must all be normalised the same way,
# otherwise names that differ only by extension never match.
name_extensions = ['.jpg', '.mp4']

# Columns of the per-fold summary, fixed up front so that a fold in which no P
# could be evaluated still produces a file with the expected header.
summary_columns = ['P', 'Num Videos', 'Accuracy', 'Precision', 'Recall', 'F1-Score']

# Predictions depend neither on the fold nor on P, so each CSV is read once.
# A model whose file cannot be processed is reported and left out of the vote.
model_preds = {}
for model in model_names:
    try:
        df = pd.read_csv(f'../model_predictions/vad_results_{model}.csv')
        for ext in name_extensions:
            df['Video Name'] = df['Video Name'].str.replace(ext, '', regex=False)
        model_preds[model] = df.set_index('Video Name')['Predicted Label']
    except Exception as e:
        print(f"[Error] Failed to process {model}: {e}")

# Ground truth is the 'True Label' column of the claude3.5sonnet results file.
gt_df = pd.read_csv('../model_predictions/vad_results_claude3.5sonnet.csv')
for ext in name_extensions:
    gt_df['Video Name'] = gt_df['Video Name'].str.replace(ext, '', regex=False)
ground_truth = gt_df.set_index('Video Name')['True Label']


for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Store results for each P
    all_results = []

    for P in P_set:
        low_path = f'low list/S_low_videos_{P}_{suffix}_train.xlsx'
        try:
            low = pd.read_excel(low_path)
        except Exception as e:
            # Report the path that was actually attempted.
            print(f"[Error] Failed to load {low_path}: {e}")
            continue

        low_list = low['video_name']
        for ext in name_extensions:
            low_list = low_list.str.replace(ext, '', regex=False)

        # One column per successfully loaded model, one row per low-uncertainty video
        merged = pd.DataFrame(index=low_list)
        for model, preds in model_preds.items():
            merged[model] = preds.reindex(low_list)

        # Drop rows with missing predictions
        merged = merged.dropna()

        if merged.empty:
            print(f"[Warning] No valid data for P={P}, skipping.")
            continue

        # Majority voting: mode() sorts tied labels, so column 0 is the smallest
        # of the most frequent labels in each row.
        majority_vote = merged.mode(axis=1)[0]

        # Align ground truth
        y_true = ground_truth.loc[merged.index]

        # Compute metrics
        acc = accuracy_score(y_true, majority_vote)
        prec = precision_score(y_true, majority_vote, average='macro', zero_division=0)
        rec = recall_score(y_true, majority_vote, average='macro', zero_division=0)
        f1 = f1_score(y_true, majority_vote, average='macro', zero_division=0)

        all_results.append({
            'P': P,
            'Num Videos': len(merged),
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1-Score': f1
        })

        print(f"[P={P}] Processed {len(merged)} videos | Accuracy: {acc:.4f}, F1-Score: {f1:.4f}")

    # Save final results to one Excel file per fold
    summary_path = f'low list/VAD_Majority_Voting_Summary_{suffix}_train.xlsx'
    result_df = pd.DataFrame(all_results, columns=summary_columns)
    result_df.to_excel(summary_path, index=False)
    print(f"[Saved] {summary_path}")


In [None]:
import pandas as pd



for fold in range(5):
    suffix = f"fold{fold}_reas"

    # Load the majority-voting summary previously written for this fold
    majority_df = pd.read_excel(f'low list/VAD_Majority_Voting_Summary_{suffix}.xlsx')

    # Extract metrics
    P_values = majority_df['P']
    overall_accuracy = majority_df['Accuracy']

    # Keep every row of the summary: only P and accuracy are exported, with
    # accuracy renamed to 'Overall Accuracy'.
    out_df = pd.DataFrame({
        'P': P_values,
        'Overall Accuracy': overall_accuracy
    })

    # Save to xlsx
    out_df.to_excel(f'low list/results_P_{suffix}.xlsx', index=False)
