In [17]:
import os
import pandas as pd
import random



# Replace the paths below with your actual folder and file paths
input_folder1 = './Dataset/smartlog2018ssd'
input_folder2 = './Dataset/smartlog2019ssd'
reference_file = './Dataset/ssd_failure_label/ssd_failure_label.csv'
output_file = './Dataset/ssd_failure_label/failure_only.csv'



In [12]:
def extract_records(input_folder1, input_folder2, reference_file, output_file):
    # Read the reference file containing the records to extract
    reference_df = pd.read_csv(reference_file)
    reference_df['ds'] = pd.to_datetime(reference_df['failure_time'])
    reference_df['ds'] = reference_df['ds'].dt.strftime('%Y%m%d').astype('int64')
    # Create an empty DataFrame to store the extracted records
    extracted_records = pd.DataFrame()

    # Loop through the two input folders
    for folder in [input_folder1, input_folder2]:
        for file in os.listdir(folder):
            if file.endswith(".csv"):
                file_path = os.path.join(folder, file)
                # Read the CSV file from each folder
                df = pd.read_csv(file_path)
                # Merge the DataFrame with the reference DataFrame based on the specified attributes
                merged_df = pd.merge(df, reference_df, on=['model', 'disk_id','ds'], how='inner')
                # Append the matching records to the extracted_records DataFrame
                extracted_records = extracted_records.append(merged_df, ignore_index=True)

    # Write the extracted records to the output CSV file
    extracted_records.to_csv(output_file, index=False)

In [3]:
def extract_random_records(input_folder1, input_folder2, reference_file, output_file):
    # Read the reference file containing the records to extract
    reference_df = pd.read_csv(reference_file)
    reference_df['ds'] = pd.to_datetime(reference_df['failure_time'])
    reference_df['ds'] = reference_df['ds'].dt.strftime('%Y%m%d').astype('int64')
    # Create an empty DataFrame to store the extracted records
    extracted_records = pd.DataFrame()


    # Choose 1 random files from each folder
    files_from_folder1 = random.sample(os.listdir(input_folder1), 1)
    files_from_folder2 = random.sample(os.listdir(input_folder2), 1)

    # Loop through the chosen files from the first folder
    for file in files_from_folder1:
        if file.endswith(".csv"):
            file_path = os.path.join(input_folder1, file)
            # Read the CSV file from the first folder
            df = pd.read_csv(file_path)
            # Merge the DataFrame with the reference DataFrame based on the specified attributes
            merged_df = pd.merge(df, reference_df, on=['model', 'ds', 'disk_id'], how='left', indicator=True)
            # Append the records that are not in the reference file to the extracted_records DataFrame
            unmatched_rows = merged_df.loc[merged_df['_merge'] == 'left_only']
            extracted_records = extracted_records.append(unmatched_rows, ignore_index=True)

    # Loop through the chosen files from the second folder
    for file in files_from_folder2:
        if file.endswith(".csv"):
            file_path = os.path.join(input_folder2, file)
            # Read the CSV file from the second folder
            df = pd.read_csv(file_path)
            # Merge the DataFrame with the reference DataFrame based on the specified attributes
            merged_df = pd.merge(df, reference_df, on=['model', 'ds', 'disk_id'], how='left', indicator=True)
            # Append the records that are not in the reference file to the extracted_records DataFrame
            unmatched_rows = merged_df.loc[merged_df['_merge'] == 'left_only']
            extracted_records = extracted_records.append(unmatched_rows, ignore_index=True)

    # Drop the '_merge' column as it is no longer needed
    extracted_records.drop(columns='_merge', inplace=True)

    # Write the extracted records to the output CSV file
    extracted_records.to_csv(output_file, index=False)

In [4]:
reference_df = pd.read_csv(reference_file)

In [5]:
reference_df['ds'] = pd.to_datetime(reference_df['failure_time'])
reference_df['ds'] = reference_df['ds'].dt.strftime('%Y%m%d').astype('int64')
del reference_df['failure_time']

In [6]:
duplicates = reference_df.duplicated()

# Count the number of duplicates
num_duplicates = duplicates.sum()

In [7]:
num_duplicates

0

In [14]:
extract_records(input_folder1, input_folder2, reference_file, output_file)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)
  extracted_records = extracted_records.append(merged_df, ignore_index=True)

In [7]:
extract_random_records(input_folder1, input_folder2, reference_file, './Dataset/ssd_failure_label/working_only.csv')

  extracted_records = extracted_records.append(unmatched_rows, ignore_index=True)
  extracted_records = extracted_records.append(unmatched_rows, ignore_index=True)


In [22]:
def find_missing_records(output_file, reference_file):
    # Read the output file containing the extracted records
    extracted_records = pd.read_csv(output_file)

    # Read the reference file containing the records to extract
    reference_df = pd.read_csv(reference_file)

    # Convert the 'ds' column in reference_df to int64 to match the format in extracted_records
    reference_df['ds'] = pd.to_datetime(reference_df['ds']).dt.strftime('%Y%m%d').astype('int64')

    # Concatenate the reference_df and extracted_records DataFrames
    concatenated_df = pd.concat([reference_df, extracted_records], ignore_index=True)

    # Find duplicates based on the columns used for merging
    duplicates = concatenated_df.duplicated(subset=['model', 'disk_id', 'ds'], keep=False)

    # Filter out non-duplicates, which are the missing records
    missing_records = concatenated_df[~duplicates]

    return missing_records

In [26]:
missing_records = find_missing_records(output_file, reference_file)
print("Missing Records:")
missing_records

Missing Records:


Unnamed: 0,model,ds,disk_id,n_1,r_1,n_2,r_2,n_3,r_3,n_4,...,r_242,n_244,r_244,n_245,r_245,n_175,r_175,n_232,r_232,failure_time
0,MA2,20190322,4711,,,,,,,,...,,,,,,,,,,
2,MA2,20181217,32311,,,,,,,,...,,,,,,,,,,
3,MA2,20180519,18316,,,,,,,,...,,,,,,,,,,
4,MA2,20181025,32466,,,,,,,,...,,,,,,,,,,
5,MA2,20181220,30501,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22607,MB2,20191206,39426,,,,,,,,...,6.661730e+11,100.0,0.0,100.0,65535.0,,,,,2019-12-06 09:01:37
22608,MB1,20191206,33541,,,,,,,,...,1.330178e+12,100.0,0.0,100.0,65535.0,,,,,2019-12-06 09:00:57
22609,MB1,20191207,5438,,,,,,,,...,1.233507e+12,100.0,0.0,100.0,65535.0,,,,,2019-12-07 09:00:57
22610,MC1,20191207,51894,100.0,0.0,,,,,,...,,,,,,,,,,2019-12-07 09:07:29
