In [None]:
import numpy as np
import pandas as pd
import h5py

In [None]:
def read_hdf5_file(file_path, dataset_name='similarity_matrix3'):
    """Load one dataset from an HDF5 file fully into memory.

    Parameters
    ----------
    file_path : str
        Path of the HDF5 file to open read-only.
    dataset_name : str, optional
        Name of the dataset inside the file.

    Returns
    -------
    numpy.ndarray
        The complete contents of the dataset (read with ``[:]``).
    """
    with h5py.File(file_path, 'r') as hdf:
        # Slicing with [:] materialises the data before the file is closed.
        return hdf[dataset_name][:]
# Read the stored matrix and wrap it in a DataFrame in one step;
# the trailing expression displays its (rows, columns) shape.
file_path = 'similarity_matrix3.h5'
similarity_matrix = pd.DataFrame(read_hdf5_file(file_path))
similarity_matrix.shape

In [None]:
# Preview the first five rows of the loaded matrix.
similarity_matrix.head(n=5)

In [None]:
def evaluate(row, top_n=21):
    """Keep the ``top_n`` largest values of a row and replace the rest with NaN.

    Parameters
    ----------
    row : pandas.Series
        One row of values.
    top_n : int, optional
        How many of the largest values to keep. Defaults to 21.

    Returns
    -------
    pandas.Series
        A Series with the same index as ``row``. Positions holding one of the
        ``top_n`` largest values keep that value; every other position is NaN.
        If ``row`` has fewer than ``top_n`` non-NaN values, all of them are kept.
    """
    top_labels = row.nlargest(top_n).index
    result = pd.Series(np.nan, index=row.index)
    # .loc makes the label-based lookup explicit (the labels come from row.index).
    result.loc[top_labels] = row.loc[top_labels]
    return result

In [None]:
def compute(similarity_matrix, batch_size=1000, output_file='similarity_preprocess5.h5'):
    """Apply ``evaluate`` to every row in batches and write the result to HDF5.

    The output file is created (or overwritten) with a single float32 dataset
    named ``similarity_preprocess5`` that has the same shape as
    ``similarity_matrix``. Rows are processed ``batch_size`` at a time so that
    only one batch of results is held in memory.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Input matrix; each row is passed to ``evaluate``.
    batch_size : int, optional
        Number of rows handled per iteration.
    output_file : str, optional
        Path of the HDF5 file to write.
    """
    num_samples, num_columns = similarity_matrix.shape

    with h5py.File(output_file, 'w') as f:
        dset = f.create_dataset(
            "similarity_preprocess5",
            shape=(num_samples, num_columns),
            dtype=np.float32
        )

        for start in range(0, num_samples, batch_size):
            # Clamp so the final (possibly short) batch reports its true end row.
            stop = min(start + batch_size, num_samples)
            batch = similarity_matrix.iloc[start:stop]
            batch_result = batch.apply(evaluate, axis=1)
            dset[start:stop, :] = batch_result.values
            print(f"Processed batch {start} to {stop}")

    print(f"Data successfully saved to {output_file}")
# Run the batched row filter over the loaded matrix and write the HDF5 output.
compute(
    similarity_matrix,
    batch_size=1000,
    output_file='similarity_preprocess5.h5',
)


In [None]:
def read_and_replace_and_save(file_path, output_file_path, dataset_name='similarity_preprocess5', chunk_size=1000):
    """Replace every non-NaN entry with its column index, chunk by chunk.

    Reads the 2-D dataset ``dataset_name`` from ``file_path`` in blocks of
    ``chunk_size`` rows and writes a dataset of the same name and shape to
    ``output_file_path`` (created or overwritten). NaN entries stay NaN;
    every other entry is replaced by the index of the column it sits in.

    Parameters
    ----------
    file_path : str
        HDF5 file to read.
    output_file_path : str
        HDF5 file to write.
    dataset_name : str, optional
        Dataset name used for both the input and the output file.
    chunk_size : int, optional
        Number of rows processed per iteration.
    """
    with h5py.File(file_path, 'r') as f_in, h5py.File(output_file_path, 'w') as f_out:
        dataset = f_in[dataset_name]
        n_rows, n_cols = dataset.shape

        # Loop-invariant: the replacement value for column j is simply j.
        column_ids = np.arange(n_cols)

        # np.where promotes the stored dtype with the integer index dtype;
        # use the same promoted dtype for the output dataset.
        out_dtype = np.result_type(dataset.dtype, column_ids.dtype)

        # Create the output up front so that an empty input still yields a
        # (zero-row) dataset; keep it chunked and row-resizable.
        out = f_out.create_dataset(
            dataset_name,
            shape=(n_rows, n_cols),
            maxshape=(None, n_cols),
            dtype=out_dtype,
            chunks=True,
        )

        for start in range(0, n_rows, chunk_size):
            end = min(start + chunk_size, n_rows)
            chunk = dataset[start:end, :]
            out[start:end, :] = np.where(np.isnan(chunk), chunk, column_ids)

In [None]:
# Stage 2: convert the values kept in stage 1 into column indices.
file_path, output_file_path = 'similarity_preprocess5.h5', 'similarity_preprocess6.h5'
read_and_replace_and_save(file_path=file_path, output_file_path=output_file_path)


In [None]:
def shift_nan_right_in_row(df):
    """Sort every row in ascending order with all NaN values moved to the right.

    Note that the non-NaN values are sorted, not merely shifted: each output
    row holds the row's values in increasing order followed by its NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        float64 frame with the same row index as ``df`` and columns
        renumbered ``0..n-1``.
    """
    # np.sort places NaN after every number, so a single vectorised sort along
    # the row axis replaces the per-row Python sort.
    sorted_values = np.sort(df.to_numpy(dtype=np.float64), axis=1)
    return pd.DataFrame(sorted_values, index=df.index)
input_file = 'similarity_preprocess6.h5'
output_file = 'similarity_preprocess8.h5'
chunk_size = 10000
# 'similarity_preprocess6.h5' is a plain h5py file: read_and_replace_and_save
# wrote it under its default dataset name. pd.read_hdf only understands
# pandas/PyTables-formatted stores and cannot open it, so stream it with h5py.
# The result is likewise written as a plain dataset so that the h5py-based
# drop_columns_with_nan_and_save_to_csv step can slice it.
input_dataset = 'similarity_preprocess5'
with h5py.File(input_file, 'r') as f_in, h5py.File(output_file, 'w') as f_out:
    source = f_in[input_dataset]
    n_rows = source.shape[0]
    target = f_out.create_dataset('data', shape=source.shape, dtype=np.float64)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        chunk = pd.DataFrame(source[start:stop, :])
        chunk_shifted = shift_nan_right_in_row(chunk)
        target[start:stop, :] = chunk_shifted.to_numpy()

print(f"Processing complete. The output is saved to '{output_file}'.")



In [None]:
def drop_columns_with_nan_and_save_to_csv(input_file, output_csv_file):
    """Drop every column that contains NaN and write the remainder to CSV.

    Each top-level dataset in ``input_file`` is loaded fully into memory,
    stripped of columns containing any NaN, and the pieces are stacked
    row-wise into one table that is written to ``output_csv_file`` without
    the row index.

    Parameters
    ----------
    input_file : str
        HDF5 file whose top-level entries are read with h5py.
    output_csv_file : str
        Destination CSV path.

    Raises
    ------
    ValueError
        If ``input_file`` has no top-level entries.
    """
    cleaned_data_list = []
    with h5py.File(input_file, 'r') as hf_in:
        for batch_name in hf_in.keys():
            batch_data = hf_in[batch_name][:]
            cleaned_data_list.append(
                pd.DataFrame(batch_data).dropna(axis=1, how='any')
            )

    if not cleaned_data_list:
        raise ValueError(f"No datasets found in '{input_file}'; nothing to write.")

    full_cleaned_data = pd.concat(cleaned_data_list, ignore_index=True)
    # Datasets can keep different column sets; concat pads the gaps with NaN,
    # so drop once more to guarantee the CSV has no NaN-bearing column.
    full_cleaned_data = full_cleaned_data.dropna(axis=1, how='any')

    full_cleaned_data.to_csv(output_csv_file, index=False)
# Final stage: keep only the NaN-free columns and export them as CSV.
input_file, output_csv_file = 'similarity_preprocess8.h5', 'similarity_preprocess8_cleaned.csv'
drop_columns_with_nan_and_save_to_csv(input_file=input_file, output_csv_file=output_csv_file)
print(f"Processed data saved to {output_csv_file}.")
