In [22]:
import os
import torch
# Expose only the CUDA device with index 0 to this process.
# NOTE(review): this runs after `import torch`; it presumably still takes effect because
# CUDA has not been initialised yet at this point, but setting it before the import
# would remove the doubt — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Sanity check: report whether PyTorch can see a usable CUDA device.
print(torch.cuda.is_available())

True


In [23]:
import pandas as pd
import csv
import gc
import random
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [29]:
# Directory that holds the ground-truth, imputed and non-imputed CSV files,
# and where the summary CSV is written.
STARTCSV_PATH = '/root/workspace/MAINFOLDER/IMPUTATION/DATA/THRESHOLD50/imputed_CSV_fodorszagatsS'
# Dataset name: used as the file-name prefix and as the 'dataset' value of every result row.
STARTTXT = 'fodorszagatsS'
# Output path of the per-run quality summary.
FP_OUTPUT = STARTCSV_PATH + '/' + STARTTXT + '_quality_imputation.csv'
# NOTE(review): COLUMNS is not referenced anywhere in the visible code — confirm it is still needed.
COLUMNS = 7  
# Percentages that appear in the input file names (one file pair per percentage and seed).
cumulative_percentages = [0.1, 0.2, 0.3, 0.4] 
# 30 distinct integers drawn from 1..9999; each one is embedded in the input file names.
# NOTE(review): the RNG is not seeded here, so a fresh kernel draws different values and the
# expected "<prefix>_<percentage>_<seed>_*.csv" files will only exist if these seeds match the
# ones used when the files were generated — verify how the seeds are meant to be shared.
seeds = random.sample(range(1, 10000), 30)

In [None]:
# Functions
def find_missing_value_indices(df):
    """Return the 1-based row positions that have a null in any column but the last.

    The result is positional (first row -> 1), independent of the DataFrame's
    index labels, because callers convert the values back with ``idx - 1`` and
    use them with ``iloc``. Deriving them from ``df.index + 1`` would give wrong
    rows for a non-default index and fail outright for non-numeric labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; its last column is ignored.

    Returns
    -------
    list[int]
        Ascending 1-based positions of the rows containing at least one null.
    """
    row_has_missing = df.iloc[:, :-1].isnull().any(axis=1)
    return [position + 1 for position, has_missing in enumerate(row_has_missing) if has_missing]

def count_nulls_and_imputed(df, imputed_df):
    """Summarise, per column, how many nulls existed and how many were filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame before imputation.
    imputed_df : pandas.DataFrame
        Frame after imputation.

    Returns
    -------
    tuple of pandas.Series
        ``(null_counts, imputed_counts, still_null_counts)``: the number of
        cells that are null in ``df``, the number of those that are non-null in
        ``imputed_df``, and the number that are null in both.
    """
    was_null = df.isnull()
    is_null_after = imputed_df.isnull()

    filled_mask = was_null & ~is_null_after
    unfilled_mask = was_null & is_null_after

    return was_null.sum(), filled_mask.sum(), unfilled_mask.sum()

# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# (one per evaluated file) do not reload the model from disk every time.
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def compute_semantic_similarity(ground_truth_df, imputed_df, non_imputed_df, missing_indices, model_name="all-MiniLM-L6-v2"):
    """Score imputed cells against the ground truth with sentence-embedding cosine similarity.

    For every row listed in ``missing_indices``, each cell that is null in
    ``non_imputed_df`` (last column excluded) and was actually filled in
    ``imputed_df`` is compared with the ground-truth cell at the same position.
    Both values are converted with ``str`` before embedding, so a null ground
    truth is embedded as the text ``"nan"``. Imputed values whose string form
    is ``"nan"``, ``"None"`` or ``"null"`` are treated as not imputed and skipped.

    All texts are embedded in two batched ``encode`` calls instead of one call
    per string, and the model is only loaded when there is something to score.

    Parameters
    ----------
    ground_truth_df, imputed_df, non_imputed_df : pandas.DataFrame
        Frames addressed by the same row and column positions; column names are
        taken from ``ground_truth_df``.
    missing_indices : iterable of int
        1-based row positions to evaluate.
    model_name : str
        Name passed to ``SentenceTransformer``.

    Returns
    -------
    tuple[dict, dict]
        ``similarity_scores`` maps each index to ``{column_name: similarity}``
        (empty dict when nothing was scored for that row);
        ``column_avg_scores`` maps each scored column to its mean similarity.
    """
    skipped_markers = ("nan", "None", "null")

    similarity_scores = {}
    # One entry per cell to score: (idx, column_name, ground_truth_text, imputed_text).
    pending = []

    for idx in missing_indices:
        zero_based_idx = idx - 1
        missing_columns = non_imputed_df.iloc[zero_based_idx, :-1].isnull()

        similarity_scores[idx] = {}
        for col_idx, is_missing in enumerate(missing_columns):
            if not is_missing:
                continue
            imputed_value = str(imputed_df.iloc[zero_based_idx, col_idx])
            if imputed_value in skipped_markers:
                continue
            ground_truth_value = str(ground_truth_df.iloc[zero_based_idx, col_idx])
            pending.append((idx, ground_truth_df.columns[col_idx], ground_truth_value, imputed_value))

    per_column_scores = {}
    if pending:
        model = _get_sentence_model(model_name)
        ground_truth_embeddings = model.encode([item[2] for item in pending], convert_to_tensor=True)
        imputed_embeddings = model.encode([item[3] for item in pending], convert_to_tensor=True)

        # Row-wise cosine similarity: pair i of the first batch with pair i of the second.
        similarities = torch.nn.functional.cosine_similarity(
            ground_truth_embeddings, imputed_embeddings, dim=1
        ).tolist()

        for (idx, column_name, _, _), similarity in zip(pending, similarities):
            similarity_scores[idx][column_name] = similarity
            per_column_scores.setdefault(column_name, []).append(similarity)

    column_avg_scores = {col: sum(scores) / len(scores) for col, scores in per_column_scores.items()}

    return similarity_scores, column_avg_scores
# Evaluate every (seed, percentage) imputation run and collect one summary row per run.
csv_rows = []
total_files = len(seeds) * len(cumulative_percentages)
files_processed = 0
print(f"Total files to be processed: {total_files}\n")

# The ground truth does not depend on seed or percentage, so it is read once
# instead of once per run.
ground_truth_path = f"{STARTCSV_PATH}/{STARTTXT}_gt.csv"
ground_truth_df = pd.read_csv(ground_truth_path)

for seed in seeds:
    for percentage in cumulative_percentages:
        imputed_path = f"{STARTCSV_PATH}/{STARTTXT}_{percentage}_{seed}_imputed.csv"
        non_imputed_path = f"{STARTCSV_PATH}/{STARTTXT}_{percentage}_{seed}_nonimputed.csv"
        imputed_df = pd.read_csv(imputed_path)
        non_imputed_df = pd.read_csv(non_imputed_path)

        null_counts, imputed_counts, still_null_counts = count_nulls_and_imputed(non_imputed_df, imputed_df)
        missing_indices = find_missing_value_indices(non_imputed_df)
        similarity_scores, column_avg_scores = compute_semantic_similarity(ground_truth_df, imputed_df, non_imputed_df, missing_indices)

        row = {
            'dataset': STARTTXT,
            'seed': seed,
            'percentage': percentage
        }
        total_similarity = 0
        column_count = 0

        # The last column is excluded, matching the helpers above.
        for col_name in ground_truth_df.columns[:-1]:
            row[f'{col_name}_Total_Nulls'] = null_counts[col_name]
            row[f'{col_name}_Imputed'] = imputed_counts[col_name]
            row[f'{col_name}_Still_Nulls'] = still_null_counts[col_name]
            # Columns without any scored cell are reported as 0 in the CSV.
            similarity = column_avg_scores.get(col_name, 0)
            row[f'{col_name}_similarity'] = similarity

            # Average over the columns that were actually scored. Testing membership
            # (rather than `similarity > 0`) keeps columns whose real average is zero
            # or negative from being silently dropped, which would inflate the mean.
            if col_name in column_avg_scores:
                total_similarity += similarity
                column_count += 1

        average_similarity = total_similarity / column_count if column_count > 0 else 0
        row['average_similarity'] = average_similarity
        csv_rows.append(row)

        files_processed += 1
        print(f"Processed {files_processed}/{total_files} files")

csv_df = pd.DataFrame(csv_rows)

# Save the results to CSV
csv_df.to_csv(FP_OUTPUT, index=False)

print(f"Results saved to {FP_OUTPUT}")
