**Quantum Parallel Duplicate Detection with Total Sum**

In [49]:
import pandas as pd
import numpy as np
from qiskit import QuantumCircuit, QuantumRegister, ClassicalRegister, transpile
from qiskit_aer import AerSimulator
import time
from concurrent.futures import ThreadPoolExecutor

# Read the sample data
df = pd.read_csv('employee_time_log.csv')

#how are you checking for duplicates without any duplicate fuction
def check_column_duplicates(column_data):
    # Count duplicates in a single column
    value_counts = column_data.value_counts()
    duplicates = sum(count - 1 for count in value_counts if count > 1)
    return duplicates

def quantum_parallel_duplicate_check(df):
    start_time = time.time()
    simulator = AerSimulator()
    
    # Number of columns to process in parallel
    num_columns = len(df.columns)
    
    # Create quantum circuit for parallel processing
    qr = QuantumRegister(num_columns, 'q')
    cr = ClassicalRegister(num_columns, 'c')
    qc = QuantumCircuit(qr, cr)
    
    # Apply Hadamard gates to create superposition
    qc.h(range(num_columns))
    
    # Use ThreadPoolExecutor for parallel classical computation
    column_results = {}
    total_duplicates = 0
    
    with ThreadPoolExecutor() as executor:
        # Map the duplicate checking function to all columns
        future_to_column = {
            executor.submit(check_column_duplicates, df[column]): column
            for column in df.columns
        }
        
        # Collect results as they complete
        for future in future_to_column:
            column = future_to_column[future]
            duplicates = future.result()
            column_results[column] = duplicates
            total_duplicates += duplicates
    
    # Add quantum interference
    for i in range(num_columns):
        if column_results[df.columns[i]] > 0:
            qc.x(i)  # Apply X gate if duplicates found
    
    # Measure all qubits
    qc.measure(range(num_columns), range(num_columns))
    
    # Run on simulator
    compiled_circuit = transpile(qc, simulator)
    job = simulator.run(compiled_circuit, shots=1024)
    # counts = job.result().get_counts()
    
    end_time = time.time()
    execution_time = end_time - start_time
    
    return {
        'column_results': column_results,
        'total_duplicates': total_duplicates,
        'execution_time': execution_time
    }

# Run the parallel quantum duplicate check
results = quantum_parallel_duplicate_check(df)

# Print results
print(f"Execution Time: {results['execution_time']:.4f} seconds\n")
print("Duplicate counts by column:")
for column, count in results['column_results'].items():
    print(f"{column}: {count} duplicates")
print(f"\nTotal duplicates across all columns: {results['total_duplicates']}")
print("\nQuantum Circuit Measurements:")

Execution Time: 0.0572 seconds

Duplicate counts by column:
Employee_ID: 65 duplicates
Task_ID: 6 duplicates
Start_Date: 939 duplicates
End_Date: 939 duplicates
Login_Time: 40 duplicates
Logout_Time: 31 duplicates

Total duplicates across all columns: 2020

Quantum Circuit Measurements:


In [50]:
# Let's look at some examples of duplicates in the First Name column
print("Example of duplicates in First Name:")
first_name_counts = df['First Name'].value_counts()
print("\nTop 5 most repeated first names and their counts:")
print(first_name_counts.head())

print("\nExample of rows with the most common first name:")
most_common_name = first_name_counts.index[0]
print(f"\nShowing all people named '{most_common_name}':")
print(df[df['First Name'] == most_common_name][['First Name', 'Last Name', 'City', 'Country']])

# Let's also look at Country duplicates since it has many
print("\nTop 5 most common countries and their counts:")
country_counts = df['Country'].value_counts()
print(country_counts.head())

Example of duplicates in First Name:


KeyError: 'First Name'