In [229]:
#Sort Different columns: 
import pandas as pd 
import numpy as np
from qiskit_aer import AerSimulator
from qiskit import QuantumCircuit, transpile
from qiskit_aer.primitives import Sampler
import re
import openpyxl

df = pd.read_excel('Files/All.xlsx')
type = detectColumns(df,df.columns)
print(type)

{'STATUS': {'0': 90, '1': 9}, 'Employee_ID': {'1': 100}, 'Start_Date': {'1': 100}, 'Student_Names': {'1': 60, '0': 39}, 'Roll No.': {'0': 69, '1': 30}, 'Grade': {'0': 89, '1': 10}, 'f2': {'1': 20, '0': 79}, 'year': {'0': 80, '1': 19}}


In [228]:
def measurCir(i,j):
    i.measure(j,j)
    simulator = AerSimulator()
    # Transpile & run
    compiled = transpile(i, simulator)
    r = simulator.run(compiled, shots=100000).result()
    counts = r.get_counts()
    for k,v in counts.items():
        counts[k] = int(v/1000)
    return counts

In [231]:
from qiskit import QuantumCircuit
import pandas as pd
import math
def detectColumns(df, prioColumns):
    result = {}    
    for col in prioColumns:
        qc =  QuantumCircuit(1,1)
        col_data = df[col]
        col_str = df[col].astype(str).str.strip()
         # 3. Check for Date values (type 2)
        if check_date_format(col_str):
            print("hii")
            p = 0.70000
                
            # 4. Check for DateTime values (type 3)
        elif check_datetime_format(col_str):
            p = 0.800000
            
        elif OneOr2digitDetection(col_data):
            p = 0.100000
        # 1. Check for Roll Numbers (type 4)
        elif pd.api.types.is_numeric_dtype(col_data):
            p = 0.2000
            if check_roll_number(col_data):
                p = 0.30000
            # 2. Check for Year values (type 1)
            elif check_year_values(col_data):
                p = 0.40000

        elif pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_object_dtype(df[col]):
            p = 0.600000
            if detectIdTypeCol(col_data):
                p = 1
        
        angle = 2 * math.asin(math.sqrt(p))
        qc.ry(angle, 0)
        # Initialize result as 0 (unrecognized type)
        result[col] = measurCir(qc,0)
        
    return result

def check_roll_number(col_data):
    try:
        # Convert numbers to strings for checking patterns
        sample_start = [str(i) for i in col_data.head(10)]
        sample_middle = [str(i) for i in col_data.iloc[int(len(col_data)/2)-5:int(len(col_data)/2)+5]]
        sample_end = [str(i) for i in col_data.iloc[-10:]]
        
        # Combine samples
        samples = sample_start + sample_middle + sample_end
        
        # Check if all numbers have the same length and >= 5 digits
        if len(set(len(str(x)) for x in samples)) == 1:
            length = len(str(samples[0]))
            if length >= 5:
                # Check if all numbers start with the same digit
                first_digit = str(samples[0])[0]
                return all(str(x).startswith(first_digit) for x in samples)
    except:
        pass
    return False

def check_year_values(col_data):
    if pd.api.types.is_string_dtype(col_data):
        try:
            col_data = pd.to_numeric(col_data)
        except:
            pass
    # Handle if column is numeric and looks like a year
    if pd.api.types.is_numeric_dtype(col_data) or  pd.api.types.is_float_dtype(col_data) or  pd.api.types.is_integer_dtype(col_data):
        if col_data.dropna().empty == False:
            if pd.api.types.is_float_dtype(col_data):
            # Check if float values have only 2 decimal places
                if col_data.dropna().apply(lambda x: round(x, 2) == x).all():
                    if col_data.dropna().between(1800, 2050).all():
                        return True # Only year
            # For integer values
            elif col_data.dropna().between(1800, 2100).all():
                True # Only year
                
    return False

def check_date_format(col_str):
    # Check for common date formats (yyyy-mm-dd, dd-mm-yyyy, etc.)
    date_pattern = r'^\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}$'
    return col_str.str.match(date_pattern).all()

def check_datetime_format(col_str):
    # Check for datetime format (date + time)
    datetime_pattern = r'\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}.*\d{1,2}:\d{2}'
    return col_str.str.contains(datetime_pattern).all()

def detectIdTypeCol(col_data):
    if pd.api.types.is_string_dtype(col_data):
        pattern = r'\b[A-Z0-9]{1,4}[-_./]?[A-Z0-9]{2,6}[-_./]?[A-Z0-9]{0,5}\b'
        return all(re.fullmatch(pattern, item) for item in col_data)
    
def OneOr2digitDetection(col_data):
    try:
        if all(len(str(i))<=2 for i in col_data):
            return True
    except:
        pass
    return False

def detectSimpleDtypes(col_data):
    if pd.api.types.is_integer_dtype(col_data):
        return 'allInt'
    if pd.api.types.is_float_dtype(col_data):
        if col_data.isna().all()== False:
            return 'numaric'
    if pd.api.types.is_string_dtype(col_data) or pd.api.types.is_object_dtype(col_data):
        return 'str'
    return None

def clusterDateTimeCol(fContent, col,no,ascending=True):
    if no ==1:
        # Try to detect and sort if the column is just year values
        fContent = fContent.sort_values(by=col,ignore_index=True, ascending=ascending)
        fContent = fContent.reset_index(drop=True)
    elif no == 2 or 3:
        # Now, try to detect proper date/datetime columns
        try:
            # Avoid processing numeric-only or zero-filled columns
            sample_vals = fContent[col].astype(str).str.strip().replace('0', np.nan).dropna()
            if len(sample_vals) == 0:
                return fContent# All values are zero or empty-like
            
            # Try parsing
            try:
                parsed_col = pd.to_datetime(fContent[col], errors='raise')
            except:
                parsed_col = pd.to_datetime(fContent[col], dayfirst=True, errors='raise')
               
            if all(parsed_col.dt.time == pd.to_datetime('00:00:00').time()) and no == 2:  # Only date
                fContent[col] = parsed_col
                fContent = clean_and_sort_date_column(fContent, col, ascending)
                fContent = fContent.reset_index(drop=True)
                # Replace original column with parsed datetime values
            elif no == 3:  # Date + time
                fContent[col] = parsed_col
                fContent = handle_datetime_column(fContent, col, ascending)
                fContent = fContent.reset_index(drop=True)
        except Exception as e:
            return fContent # Not a datetime column
        
    return fContent 

def clean_and_sort_date_column(dff, column_name, ascending=True):
        try:
            
            # Step 2: Drop NaT (invalid formats)
            dff = dff.dropna(subset=[column_name])
            
            # Step 3: Sort the DataFrame by that column
            dff = dff.sort_values(by=column_name, ascending=ascending)

            # Optional: Format to clean date string (YYYY-MM-DD)
            dff[column_name] = dff[column_name].dt.strftime('%Y-%m-%d')
            print(f"{column_name} date called ")
            return dff
        
        except Exception as e:
            print(f"⚠️ Error while processing date column: {e}")
            return dff

def handle_datetime_column(df, column_name, ascending):
    print(f"{column_name} dateTime")
    # Check if most values in column are datetime with time
    values = df[column_name].dropna().astype(str).head(20)
    count_datetime = sum([pd.api.is_datetime(v) for v in values])

    if count_datetime >= len(values) // 2:  # At least half must be datetime-like
        # Convert full column to datetime
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
        # Drop rows with invalid dates
        df = df.dropna(subset=[column_name])
        # Sort by that column
        df = df.sort_values(by=column_name,ascending=ascending).reset_index(drop=True)
        print(f"[INFO] '{column_name}' successfully recognized and sorted as datetime.")
    else:
        print(f"[INFO] '{column_name}' does not contain proper datetime with time.")

    return df

In [171]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from qiskit import QuantumCircuit, QuantumRegister, ClassicalRegister
from qiskit_aer import Aer
import time

# Cache for storing quantum circuits to avoid recreating them
circuit_cache = {}

def create_oracle(values, target_idx, num_qubits):
    st = time.time()
    cache_key = f'oracle_{target_idx}_{num_qubits}'
    if cache_key in circuit_cache:
        return circuit_cache[cache_key]

    oracle = QuantumCircuit(num_qubits)
    for i in range(num_qubits):
        if (target_idx >> i) & 1:
            oracle.x(i)
    
    if num_qubits == 1:
        oracle.h(0)
        oracle.z(0)
        oracle.h(0)
    elif num_qubits > 3:
        mid = num_qubits // 2
        oracle.h(num_qubits - 1)
        oracle.mcx(list(range(mid)), mid)
        oracle.mcx(list(range(mid, num_qubits - 1)), num_qubits - 1)
        oracle.h(num_qubits - 1)
    else:
        oracle.h(num_qubits - 1)
        if num_qubits == 2:
            oracle.cx(0, 1)
        else:
            oracle.mcx(list(range(num_qubits - 1)), num_qubits - 1)
    
    for i in range(num_qubits):
        if (target_idx >> i) & 1:
            oracle.x(i)
    circuit_cache[cache_key] = oracle
    et = time.time()
    # print(f"create_oracle time = {et-st}")
    return oracle

def create_diffusion(num_qubits):
    st = time.time()
    cache_key = f'diffusion_{num_qubits}'
    if cache_key in circuit_cache:
        return circuit_cache[cache_key]

    diffusion = QuantumCircuit(num_qubits + 1)
    for qubit in range(num_qubits):
        diffusion.h(qubit)
    for qubit in range(num_qubits):
        diffusion.x(qubit)
    chunk_size = 3
    for i in range(0, num_qubits - 1, chunk_size):
        control_qubits = list(range(i, min(i + chunk_size, num_qubits - 1)))
        if len(control_qubits) > 0:
            diffusion.h(num_qubits)
            diffusion.mcx(control_qubits, num_qubits)
            diffusion.h(num_qubits)
    for qubit in range(num_qubits):
        diffusion.x(qubit)
    for qubit in range(num_qubits):
        diffusion.h(qubit)
    circuit_cache[cache_key] = diffusion
    et = time.time()
    # print(f"create_diffusion = {et-st}")
    return diffusion

def grover_find_min_index(values):
    st = time.time()
    n = len(values)
    num_bits = max(1, int(np.ceil(np.log2(n))))
    min_idx = np.argmin(values)
    
    qr = QuantumRegister(num_bits + 1, 'q')
    cr = ClassicalRegister(num_bits, 'c')
    circuit = QuantumCircuit(qr, cr)
    
    for i in range(num_bits):
        circuit.h(qr[i])
    
    iterations = int(np.pi/4 * np.sqrt(2**num_bits))
    oracle = create_oracle(values, min_idx, num_bits + 1)
    diffusion = create_diffusion(num_bits)
    
    for _ in range(iterations):
        circuit = circuit.compose(oracle)
        circuit = circuit.compose(diffusion)
    
    for i in range(num_bits):
        circuit.measure(qr[i], cr[i])
    
    backend = Aer.get_backend('aer_simulator')
    result = backend.run(circuit, shots=1000).result()
    counts = result.get_counts()
    max_count_result = max(counts.items(), key=lambda x: x[1])[0]
    et = time.time()
    # print(f" grover find min index time = {et -st}")

    return int(max_count_result, 2) % n

def quantum_sort_cluster(cluster_df, sort_column):
    st=time.time()
    if len(cluster_df) == 0:
        return df
    
    df = cluster_df.copy()
    sorted_indices = []
    values = df[sort_column].tolist()
    remaining_indices = list(range(len(values)))
    
    while remaining_indices:
        remaining_values = [values[i] for i in remaining_indices]
        min_idx = grover_find_min_index(remaining_values)
        actual_idx = remaining_indices[min_idx]
        sorted_indices.append(actual_idx)
        remaining_indices.remove(actual_idx)
    
    et=time.time()
    # print(f"quantum_sort_cluster time = {et -st}")
    return df.iloc[sorted_indices].reset_index(drop=True)

def cluster_based_quantum_sort(df, Pcols, n_clusters=None, i=0,j=0):
        if i >= len(Pcols):
            return df
        start_time = time.time()
        # print(f'{i} times')
        sort_column = Pcols[i]
        # Read and preprocess data
        
        if sort_column not in df.columns:
            print(f"Column '{sort_column}' not found.")
            return
        
        # print("Original Data:\n", df)
        
        # Perform clustering
        clustering_data = df[[sort_column]]
        if n_clusters == None:
            kmeans = KMeans(n_clusters = int(np.ceil(len(df[sort_column])/60)), random_state=42)
        else: 
            kmeans = KMeans(n_clusters=n_clusters,random_state=42)
            
        df['cluster'] = kmeans.fit_predict(clustering_data)
        
        # Get unique clusters and sort them to ensure consistent processing order
        unique_clusters = sorted(df['cluster'].unique())#it is storing uniq cluster ids 
        print(len(unique_clusters))
        all_sorted = []
        
        # Process each cluster exactly once with synchronized messages
        for cluster_id in unique_clusters:
            cluster_df = df[df['cluster'] == cluster_id].drop(columns=['cluster'])
            cluster_size = len(cluster_df)
            print(f"\nProcessing Cluster {cluster_id} (size {cluster_size})")
            
            # Process the cluster
            sorted_cluster = quantum_sort_cluster(cluster_df, sort_column)
            cluster_based_quantum_sort(pd.DataFrame(sorted_cluster),Pcols,i=i+1,j=j+1)
            all_sorted.append(sorted_cluster)
            
            # Print completion message for the current cluster only
            print(f"Completed Cluster {cluster_id}")
        
        
        # Combine all sorted clusters
        merged_df = pd.concat(all_sorted, ignore_index=True)
        final_sorted_df = merged_df.sort_values(by=sort_column).reset_index(drop=True)
        
        end_time = time.time()
        # print(f"Total execution time: {end_time - start_time} seconds")
        
        return final_sorted_df
    
if __name__ == "__main__":
    circuit_cache.clear()
    df = pd.read_csv('Files/Iris - all-numbers.csv')
    df = cluster_based_quantum_sort(df, ['5.1','3.5'], n_clusters=20)
    print(df)
    

20

Processing Cluster 0 (size 16)
1

Processing Cluster 0 (size 16)
Completed Cluster 0
Completed Cluster 0

Processing Cluster 1 (size 12)
1

Processing Cluster 0 (size 12)
Completed Cluster 0
Completed Cluster 1

Processing Cluster 2 (size 6)
1

Processing Cluster 0 (size 6)
Completed Cluster 0
Completed Cluster 2

Processing Cluster 3 (size 7)
1

Processing Cluster 0 (size 7)
Completed Cluster 0
Completed Cluster 3

Processing Cluster 4 (size 7)
1

Processing Cluster 0 (size 7)
Completed Cluster 0
Completed Cluster 4

Processing Cluster 5 (size 2)
1

Processing Cluster 0 (size 2)
Completed Cluster 0
Completed Cluster 5

Processing Cluster 6 (size 7)
1

Processing Cluster 0 (size 7)
Completed Cluster 0
Completed Cluster 6

Processing Cluster 7 (size 7)
1

Processing Cluster 0 (size 7)
Completed Cluster 0
Completed Cluster 7

Processing Cluster 8 (size 9)
1

Processing Cluster 0 (size 9)
Completed Cluster 0
Completed Cluster 8

Processing Cluster 9 (size 10)
1

Processing Cluster 0 (

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.cluster import KMeans

# Assuming quantum_sort_cluster is already defined
def cluster_based_quantum_sort(df, Pcols, n_clusters=None, i=0, j=0):
    if i >= len(Pcols):
        return df

    sort_column = Pcols[i]
    if sort_column not in df.columns:
        print(f"Column '{sort_column}' not found.")
        return df

    print(f'\nLevel {i}: Sorting by column "{sort_column}"')
    start_time = time.time()

    # Perform clustering on the current sort_column
    clustering_data = df[[sort_column]]
    if n_clusters is None:
        n_clusters = int(np.ceil(len(df[sort_column]) / 60))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(clustering_data)

    unique_clusters = sorted(df['cluster'].unique())
    all_sorted = []

    for cluster_id in unique_clusters:
        cluster_df = df[df['cluster'] == cluster_id].drop(columns=['cluster'])
        print(f"  → Cluster {cluster_id} (size {len(cluster_df)})")

        # Sort this cluster
        sorted_cluster = quantum_sort_cluster(cluster_df, sort_column)

        # Recursively sort the next column (if available)
        next_sorted_cluster = cluster_based_quantum_sort(pd.DataFrame(sorted_cluster), Pcols, i=i+1, j=j+1)
        all_sorted.append(next_sorted_cluster)

        print(f"  ✓ Completed Cluster {cluster_id}")

    # Merge and sort by current column
    merged_df = pd.concat(all_sorted, ignore_index=True)
    final_sorted_df = merged_df.sort_values(by=sort_column).reset_index(drop=True)

    end_time = time.time()
    # print(f"✔ Level {i} sorting by '{sort_column}' completed in {end_time - start_time:.2f} seconds")

    return final_sorted_df

# Entry point
if __name__ == "__main__":
    circuit_cache.clear()
    df = pd.read_csv('Files/phpB0xrNjAllNum.csv')
    df = cluster_based_quantum_sort(df, ['f1', 'f2', 'f3'])
    print(df)



Level 0: Sorting by column "f1"
  → Cluster 0 (size 83)

Level 1: Sorting by column "f2"
  → Cluster 0 (size 60)

Level 2: Sorting by column "f3"
  → Cluster 0 (size 60)
  ✓ Completed Cluster 0
  ✓ Completed Cluster 0
  → Cluster 1 (size 23)

Level 2: Sorting by column "f3"
  → Cluster 0 (size 23)
  ✓ Completed Cluster 0
  ✓ Completed Cluster 1
  ✓ Completed Cluster 0
  → Cluster 1 (size 73)

Level 1: Sorting by column "f2"
  → Cluster 0 (size 37)

Level 2: Sorting by column "f3"
  → Cluster 0 (size 37)
  ✓ Completed Cluster 0
  ✓ Completed Cluster 0
  → Cluster 1 (size 36)

Level 2: Sorting by column "f3"
  → Cluster 0 (size 36)
  ✓ Completed Cluster 0
  ✓ Completed Cluster 1
  ✓ Completed Cluster 1
  → Cluster 2 (size 31)

Level 1: Sorting by column "f2"
  → Cluster 0 (size 31)

Level 2: Sorting by column "f3"
  → Cluster 0 (size 31)
  ✓ Completed Cluster 0
  ✓ Completed Cluster 0
  ✓ Completed Cluster 2
  → Cluster 3 (size 31)

Level 1: Sorting by column "f2"
  → Cluster 0 (size 31

In [None]:

def detectRollNoCol(df):  
    result = {}
    for column in df.columns:
        col_data = df[column]
        if (pd.api.types.is_integer_dtype(col_data)) : # main thing for being roll no
                stringRoll = {}
                #trying to convert the int type into string type
                try:
                    # they are array of type string
                    stringRoll['strS'] =[str(i) for i in (col_data.head(10))]#first 10 nums
                    stringRoll['strM'] = [str(i) for i in col_data.iloc[int(len(col_data)/2)-5: int(len(col_data)/2)+5]]#mid 10 nums in string
                    stringRoll['strE'] =  [str(i) for i in col_data.iloc[len(col_data)-10:len(col_data)]]
                    arr = np.array(list(stringRoll.values()))
                    arr = (arr.flatten())
                    lenOfEachEleInKeys= {}
                    for i in stringRoll.keys():#this is iterating for keys 
                        for j in stringRoll[i]:# this is iterating for 10 values in each keys 
                            if (len(j) == len(stringRoll[i][1])) and len(j) >= 5: #checking for each roll if they are of same length
                                lenOfEachEleInKeys[i] = len(j)
                                # print("almost")
                            else:
                                result [column] =0
                                continue 
                            
                    if len(set(lenOfEachEleInKeys.values())) == 1:
                                isSame = all((x.startswith(arr[1][0])) for x in arr)
                                print(isSame)
                                if isSame:
                                    result[column] = 4
                                    print("done")
                                    continue
                except:
                    print("except")
                    result[column] = 0
                    continue
        else:
            print("Not int")
            result[column] = 0
            continue
            
    return result

df = pd.read_csv('student_dataset.csv')
r = detectRollNoCol(df)
print(r)

Not int
False
Not int
Not int
True
done
Not int
Not int
{'Student_Names': 0, 'Phone_No.': 0, 'Math': 0, 'Physics': 0, 'Chemistry': 0, 'Grade': 0, 'Comment': 0, 'Roll No.': 4, 'School Name': 0, 'Student Address': 0}


In [59]:
import pandas as pd
import re
def detectIdTypeCol(col_data,column):
    if pd.api.types.is_string_dtype(col_data):
        pattern = r'\b[A-Z0-9]{1,4}[-_./]?[A-Z0-9]{2,6}[-_./]?[A-Z0-9]{0,5}\b'
        if all(re.fullmatch(pattern, item) for item in col_data):
            return True
    return False
    #     print(column)
    #     try:
    #         # Convert numbers to strings for checking patterns
    #         sample_start = [str(i) for i in col_data.head(10)]
    #         sample_middle = [str(i) for i in col_data.iloc[int(len(col_data)/2)-5:int(len(col_data)/2)+5]]
    #         sample_end = [str(i) for i in col_data.iloc[-10:]]
            
    #         # Combine samples
    #         samples = sample_start + sample_middle + sample_end
    #         a =0
            
    #         if len(set(len(str(x)) for x in samples)) == 1: 
    #             for j in samples[0]:
    #                 for i in samples:
    #                     if j in i:
    #                         a = a+1
    #         print(a)
    #     except:
    #         pass
    # return False
        
dff = pd.read_csv('Files/student_dataset.csv')
r = {}
for col in dff.columns:
    r[col] = detectIdTypeCol(dff[col],col)
    
print(r)


{'Student_Names': False, 'Phone_No.': False, 'Math': False, 'Physics': False, 'Chemistry': False, 'Grade': False, 'Comment': False, 'Roll No.': False, 'School Name': False, 'Student Address': False}


In [None]:
#singlle or double digit detection
import pandas as pd
def OneOr2digitDetection(col_data):
    try:
        if all(len(str(i))<=2 for i in col_data):
            return True
    except:
        pass
    return False

df = pd.read_csv('Files/student_dataset.csv')
r = {}
for col in df.columns:
    r[col] =  OneOr2digitDetection(df[col])
    
print(r)

{'Student_Names': False, 'Phone_No.': False, 'Math': False, 'Physics': False, 'Chemistry': False, 'Grade': True, 'Comment': False, 'Roll No.': False, 'School Name': False, 'Student Address': False}


In [None]:

df = pd.read_csv('Files/annual-enterprise-survey-2023-financial-year-provisional-size-bands.csv')
r = {}
for col in df.columns:
    r[col] =  detectSimpleDtypes(df[col])
    
print(r)

float64
object
object
object
object
object
object
{'year': 'numaric', 'industry_code_ANZSIC': 'str', 'industry_name_ANZSIC': 'str', 'rme_size_grp': 'str', 'variable': 'str', 'value': 'str', 'unit': 'str'}


In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import time

def classical_sort_cluster(cluster_df, sort_column):
    return cluster_df.sort_values(by=sort_column).reset_index(drop=True)

def classical_cluster_based_sort(input_csv, sort_column, n_clusters=4):
    start_time = time.time()
    
    df = pd.read_csv(input_csv)
    df = df.dropna()
    
    if sort_column not in df.columns:
        print(f"Column '{sort_column}' not found.")
        return
    
    print("Original Data:\n", df)
    
    clustering_data = df[[sort_column]]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(clustering_data)
    
    all_sorted = []
    for cluster_id in range(n_clusters):
        cluster_df = df[df['cluster'] == cluster_id].drop(columns=['cluster'])
        print(f"\nSorting Cluster {cluster_id} (size {len(cluster_df)}):")
        sorted_cluster = classical_sort_cluster(cluster_df, sort_column)
        all_sorted.append(sorted_cluster)
    
    merged_df = pd.concat(all_sorted, ignore_index=True)
    final_sorted_df = merged_df.sort_values(by=sort_column).reset_index(drop=True)
    
    print("\nFinal Sorted Data:")
    print(final_sorted_df)
    
    
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total time for classical implementation = {total_time} seconds")

if __name__ == "__main__":
    classical_cluster_based_sort('phpB0xrNj.csv',sort_column="f3", n_clusters=20)

Original Data:
           f1      f2      f3      f4      f5      f6      f7    f596    f597  \
0    -0.4394 -0.0930  0.1718  0.4620  0.6226  0.4704  0.3578  0.6410  0.6154   
1    -0.4348 -0.1198  0.2474  0.4036  0.5026  0.6328  0.4948  1.0000  0.7272   
2    -0.2330  0.2124  0.5014  0.5222 -0.3422 -0.5840 -0.7168  0.2380  0.1904   
3    -0.3808 -0.0096  0.2602  0.2554 -0.4290 -0.6746 -0.6868  0.5252  0.3670   
4    -0.3412  0.0946  0.6082  0.6216 -0.1622 -0.3784 -0.4324  0.4688  0.5626   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
1487 -0.2232  0.1542  0.3394  0.3720  0.5100  0.5970  0.3104  0.5068  0.3698   
1488 -0.2552  0.0776  0.1948  0.5122  0.6522  0.6258  0.4934  0.1818  0.3454   
1489 -0.3188 -0.0318  0.1354  0.2988  0.7132  0.6374  0.5140 -0.1276  0.4042   
1490 -0.3636 -0.1448  0.3064  0.4074  0.5320  0.6262  0.3670 -0.0176  0.2280   
1491 -0.3236  0.0522  0.5156  0.9832  1.0000  0.4488  0.8038  0.1070  0.1572   

        f598  ...    f6

In [85]:

def isDfTotallyIntFloat(df):
    return all(pd.api.types.is_numeric_dtype(dtype)  for dtype in df.dtypes)
    
df = pd.read_csv('student_dataset.csv')
type = isDfTotallyIntFloat(df)
print(type)

False


In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from qiskit_aer import Aer
from qiskit_algorithms import Grover, AmplificationProblem
from qiskit.circuit.library import PhaseOracle
from qiskit.utils import QuantumInstance

# ----- Quantum Minimum Finding -----

def index_to_bin(index, num_bits):
    return format(index, f'0{num_bits}b')

def create_oracle_expression(min_index, num_bits):
    bin_index = index_to_bin(min_index, num_bits)
    expr = ' & '.join([f"{'' if bit == '1' else '~'}x{i}" for i, bit in enumerate(bin_index)])
    return expr

def grover_find_min_index(values):
    n = len(values)
    num_bits = int(np.ceil(np.log2(n)))
    padded_length = 2 ** num_bits

    padded_values = values + [float('inf')] * (padded_length - n)
    min_index = np.argmin(padded_values)

    oracle_expr = create_oracle_expression(min_index, num_bits)
    oracle = PhaseOracle(oracle_expr)
    problem = AmplificationProblem(oracle)

    backend = Aer.get_backend("aer_simulator")
    grover = Grover()
    result = grover.amplify(problem, quantum_instance=QuantumInstance(backend))

    measured_index = max(result.circuit_results.items(), key=lambda x: x[1])[0]
    return int(measured_index, 2)

def quantum_sort_cluster(cluster_df, sort_column):
    df = cluster_df.copy().reset_index(drop=True)
    sorted_rows = []

    while not df.empty:
        values = df[sort_column].tolist()
        min_idx = grover_find_min_index(values)
        sorted_rows.append(df.loc[min_idx])
        df = df.drop(min_idx).reset_index(drop=True)

    return pd.DataFrame(sorted_rows)

# ----- Main Cluster-Based Hybrid Sort -----

def cluster_based_quantum_sort(input_csv, sort_column, n_clusters=2, output_csv='cluster_sorted.csv'):
    df = pd.read_csv(input_csv)
    
    if sort_column not in df.columns:
        print(f"Column '{sort_column}' not found.")
        return

    print("Original Data:\n", df)

    # Clustering
    clustering_data = df[[sort_column]]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(clustering_data)

    all_sorted = []

    for cluster_id in range(n_clusters):
        cluster_df = df[df['cluster'] == cluster_id].drop(columns=['cluster'])
        print(f"\nSorting Cluster {cluster_id} (size {len(cluster_df)}):")
        sorted_cluster = quantum_sort_cluster(cluster_df, sort_column)
        all_sorted.append(sorted_cluster)

    # Combine clusters and final classical sort
    merged_df = pd.concat(all_sorted, ignore_index=True)
    final_sorted_df = merged_df.sort_values(by=sort_column).reset_index(drop=True)

    print("\nFinal Sorted Data:")
    print(final_sorted_df)

    final_sorted_df.to_csv(output_csv, index=False)
    print(f"\nSorted data saved to '{output_csv}'.")

# Example usage
if __name__ == "__main__":
    cluster_based_quantum_sort("data.csv", sort_column="score", n_clusters=2)



ImportError: cannot import name 'BaseSampler' from 'qiskit.primitives' (c:\Users\tikes\OneDrive\Documents\OneDrive\Desktop\ClonedProject\Tikesh01.github.io\.venv\Lib\site-packages\qiskit\primitives\__init__.py)