In [1]:
# Using environment Qiskit 1.2.0
!pip install scikit-learn



In [2]:
# Importing the required libraries (install in your environment first)
import numpy as np
import pandas as pd
from math import log2, sqrt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from scipy.stats import chi2, binom
from scipy.fft import fft
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from sklearn import svm
import ottoeplitz



In [3]:
def int_to_binary_string(n, length):
    n = int(n)
    return f"{n:0{length}b}"

def data_to_bitstring(data, N):
    strings = [int_to_binary_string(num, N) for num in data]
    return ''.join(strings)

def bitstring_to_int_array(bitstring, N):
    return np.array([int(bitstring[i:i+N], 2) for i in range(0, len(bitstring), N)])

def von_neumann_extractor(bitstring):
    extracted_bits = []
    
    # Iterate over the bitstring in pairs
    for i in range(0, len(bitstring) - 1, 2):
        pair = bitstring[i:i+2]
        
        # Only keep the result when the pair has different bits
        if pair == '01':
            extracted_bits.append('0')
        elif pair == '10':
            extracted_bits.append('1')
    
    # Return the extracted bitstring
    return ''.join(extracted_bits)

def von_neumann_int_array(int_array, N):
    bitstring = data_to_bitstring(int_array.flatten(), N)
    bitstring = von_neumann_extractor(bitstring)
    return bitstring_to_int_array(bitstring, N)

def entropize(inputdata, n):

    t = ottoeplitz.Toeplitz(inputdata, n)
    
    
    processed_data = t.hash()
    processed_data = von_neumann_int_array(processed_data, n)
    
    return processed_data

In [4]:
# Read in data from your datafile or the provided datafile in this folder to classify QRNG data. 
# Suggested classification strategies include QRNG vs PRNG, QPU vs Simulator, or by individual QPU

#Sample datafile of QRNG (IBM QPUs) vs PRNG data, 12k lines each. Label 1 is QRNG, label 2 is PRNG

# Read data file, make dataframe, process labels, and combine/concatenate individual input lines into
# larger input lines to create training and testing datasets to input into gradient boosting model.
# Hint: use train_test_split method from sklearn

def read_file_in_chunks(filename, N):
    with open(filename, 'r') as file:
        data = file.read().strip()  # Read the entire file into a single string and remove any trailing spaces or newlines
    return [data[i:i+N] for i in range(0, len(data), N)]  # Split the string into chunks of size N

# Read from RNG1 and RNG2
def get_df_from_files(filename1, filename2, N):
    rng1_bitstrings = read_file_in_chunks(filename1, N)
    small_bitstrings = bitstring_to_int_array(data_to_bitstring(rng1_bitstrings, N), 8)
    small_bitstrings = entropize(small_bitstrings, 8)
    rng1_bitstrings =  data_to_bitstring(small_bitstrings, 8)
    rng1_bitstrings = [rng1_bitstrings[i:i+N] for i in range(0, len(rng1_bitstrings), N)]
    rng2_bitstrings = read_file_in_chunks(filename2, N)
    
    # Create DataFrames for RNG1 and RNG2, and assign labels
    df_rng1 = pd.DataFrame({
        'bitstrings': rng1_bitstrings,
        'label': 1  # Label all RNG1 strings as '1'
    })
    
    df_rng2 = pd.DataFrame({
        'bitstrings': rng2_bitstrings,
        'label': 0  # Label all RNG2 strings as '2'
    })
    
    # Combine both DataFrames
    return pd.concat([df_rng1, df_rng2], ignore_index=True)

In [5]:
# Example DataFrame (assuming you already have it)
# df['bitstrings'] contains the bitstrings of length 100
# df = pd.DataFrame({'bitstrings': ['1100100100001111110110...', '1010101010101010101010...', ...]})

# Function to calculate Shannon entropy
def shannon_entropy(bitstring):
    # Convert bitstring to a numpy array of integers
    bits = np.array([int(bit) for bit in bitstring])
    
    # Calculate the frequency of 0's and 1's
    counts = np.bincount(bits)
    probabilities = counts / len(bits)
    
    # Filter out zero probabilities to avoid log2(0)
    probabilities = probabilities[probabilities > 0]
    
    # Calculate entropy
    entropy = -np.sum(probabilities * np.log2(probabilities))
    
    return entropy





In [6]:
def chi_squared_test(bits):
        counts = np.bincount(list(bits), minlength=2)
        observed = counts
        expected = np.array([len(bits)/2, len(bits)/2])
        chi_sq = np.sum((observed - expected) ** 2 / expected)
        # Degrees of freedom = number of categories - 1 = 1
        p_value = 1 - chi2.cdf(chi_sq, df=1)
        return chi_sq, p_value



In [7]:

# Function to get the top 5 dominant magnitudes of frequencies
def dominant_frequencies(bitstring, top_n=5):
    # Convert bitstring to list of integers (0s and 1s)
    bits = np.array([int(bit) for bit in bitstring])
    n = len(bits)  # Length of the bitstring

    # Convert bits to -1 and 1 for FFT
    signal = 2 * bits - 1
    fft_result = fft(signal)
    
    # Only take the positive frequencies
    freqs = np.fft.fftfreq(n)
    magnitudes = np.abs(fft_result)
    positive_freqs = freqs[:n//2]
    positive_magnitudes = magnitudes[:n//2]
    
    # Find the top_n dominant frequencies
    indices = np.argsort(positive_magnitudes)[-top_n:]
    dominant_mags = positive_magnitudes[indices]

    # Sort the dominant magnitudes in descending order
    sorted_mags = np.sort(dominant_mags)[::-1]
    
    # If fewer than top_n magnitudes, pad with NaNs
    if len(sorted_mags) < top_n:
        sorted_mags = np.pad(sorted_mags, (0, top_n - len(sorted_mags)), constant_values=np.nan)
    
    return sorted_mags




In [8]:
def autocorrelation(bitstring, lag=1):
    # Convert the bitstring to a numpy array of integers (0 and 1)
    bits = np.array([int(bit) for bit in bitstring])
    
    if lag >= len(bits):
        raise ValueError("Lag is too large for the bitstream length.")
    
    # Shift the bits by the given lag
    shifted = np.roll(bits, -lag)
    
    # Calculate correlation excluding the wrapped-around elements
    valid_length = len(bits) - lag
    correlation = np.corrcoef(bits[:valid_length], shifted[:valid_length])[0, 1]
    
    return correlation



In [9]:
def frequency_test(bitstring):
    """
    Perform the Frequency Test on a bitstring.
    
    :param bitstring: A 1D numpy array or list of bits (0s and 1s).
    :return: A tuple containing the number of 1s, number of 0s, and the test statistic.
    """
    count_ones = list(bitstring).count('1')
    count_zeros = len(bitstring) - count_ones
    
    return count_ones, count_zeros

def block_frequency_test(bitstring, M):
    """
    Perform the Block Frequency Test on a bitstring.
    
    :param bitstring: A 1D numpy array or list of bits (0s and 1s).
    :param M: Block size for the block frequency test.
    :return: The frequencies of 1s in each block and the test statistic.
    """
    bitstring = [int(c) for c in bitstring]
    n = len(bitstring)
    num_blocks = n // M
    block_frequencies = np.zeros(num_blocks)

    # Compute block frequencies
    for i in range(num_blocks):
        block = bitstring[i * M:(i + 1) * M]
        block_frequencies[i] = np.sum(block)

    # Calculate the mean and variance of the block frequencies
    mean_frequency = np.mean(block_frequencies)
    variance_frequency = np.var(block_frequencies)

    # The expected mean and variance for a random sequence
    expected_mean = M / 2
    expected_variance = M / 4

    # Chi-squared statistic
    chi_squared = ((mean_frequency - expected_mean) ** 2 / expected_variance) + \
                  (variance_frequency / expected_variance)

    return block_frequencies.max(), block_frequencies.mean(), chi_squared



In [10]:

def runs_test(bits):
    """
    Performs the Runs Test on a binary sequence.
    
    Parameters:
    bits (array-like): A binary sequence (1s and 0s).
    
    Returns:
    (int, float): The number of runs and the p-value of the test.
    """
    # Convert bits to a numpy array
    bits = np.array([int(c) for c in bits])
    
    # Count the number of runs
    runs = 1  # Start with the first run
    for i in range(1, len(bits)):
        if bits[i] != bits[i-1]:
            runs += 1
            
    n1 = np.sum(bits)  # Number of 1s
    n0 = len(bits) - n1  # Number of 0s
    
    # Calculate the expected number of runs and variance
    expected_runs = (2 * n1 * n0) / (n1 + n0) + 1
    variance_runs = (2 * n1 * n0 * (2 * n1 * n0 - n1 - n0)) / ((n1 + n0) ** 2 * (n1 + n0 - 1))
    
    # Z-score
    z = (runs - expected_runs) / np.sqrt(variance_runs)
    
    # Calculate p-value from z-score
    p_value = 1 - binom.cdf(runs, n=n1 + n0, p=0.5)  # Note: binom.cdf is not directly usable for z-scores
    
    return runs, p_value



In [11]:
def xor(a, b):
    """
    Returns the XOR of two binary values (0 or 1).
    
    Parameters:
    a (int): First binary value (0 or 1).
    b (int): Second binary value (0 or 1).
    
    Returns:
    int: Result of a XOR b (0 or 1).
    """
    return (a or b) and not (a and b)

def linear_complexity(bits):
    """
    Computes the Linear Complexity of a binary sequence using the Berlekamp-Massey algorithm.
    
    Parameters:
    bits (array-like): A binary sequence (1s and 0s).
    
    Returns:
    int: The linear complexity of the sequence.
    """
    bits = np.array([int(c) for c in bits])
    n = len(bits)
    l = 0  # Linear complexity
    m = -1  # Previous index where the error occurred
    C = np.zeros(n)
    B = np.zeros(n)
    C[0] = 1  # The polynomial is initialized with a leading coefficient of 1
    B[0] = 1
    
    for n in range(n):
        # Calculate the discrepancy
        discrepancy = bits[n]
        for i in range(1, l + 1):
            discrepancy = xor(discrepancy, (C[i] * bits[n - i]))
        
        if discrepancy == 1:  # An error occurred
            T = C.copy()
            for i in range(n - m):
                if m + i < n:
                    C[m + i] = xor(C[m + i], B[i])
            if l <= n // 2:
                l = n + 1 - l
                m = n
                B = T.copy()
    
    return l
    
# Normalize a specific column in the DataFrame (let's assume 'column_name' is the one you want to normalize)
def zscore_col(df, colname):
    df[colname] = (df[colname] - df[colname].mean()) / df[colname].std()



In [12]:
N = 100

QRNFiles = ["datasets/mod2sherbrooke.txt", "datasets/RNG1-2.txt"]
PRNFiles = [f"datasets/{f}" for f in ["python_random_bitstring.txt", "RNG2-2.txt", 
                                      "simulator_concatenation_100_10_1000bits.txt", "simulator_concatenation_100_10_1000bits.txt", "simulator_mod2_100_10_1000bits.txt"]]
for file1 in QRNFiles:
    for file2 in PRNFiles:
        df = get_df_from_files(file1, file2, N)
        print("Reading from: ", file1, file2)
        # Apply the entropy function to each bitstring in the DataFrame
        df['entropy'] = df['bitstrings'].apply(shannon_entropy)
        # Apply the function and store results in two new columns
        df[['chi_sq', 'chi_sq_p_value']] = pd.DataFrame(df['bitstrings'].apply(chi_squared_test).tolist(), index=df.index)

        # Apply the function and expand it into separate columns
        df[['freq_mag_1', 'freq_mag_2', 'freq_mag_3', 'freq_mag_4', 'freq_mag_5']] = pd.DataFrame(
            df['bitstrings'].apply(dominant_frequencies).tolist(), index=df.index
        )
                
        df['autocorrelation_lag1'] = df['bitstrings'].apply(autocorrelation)
        df['freq_ones'], df['freq_zeros'] = zip(*df['bitstrings'].apply(frequency_test))
        
        # Set block size for Block Frequency Test
        M = 10  # Example block size
        df['block_freqs_max'], df['block_freqs_mean'], df['block_freq_chi_squared'] = zip(*df['bitstrings'].apply(lambda x: block_frequency_test(x, M)))
        
        df['runs'], df['runs_p_value'] = zip(*df['bitstrings'].apply(runs_test))
        df['linear_complexity'] = df['bitstrings'].apply(linear_complexity)
         #metrics = ['linear_complexity', 'runs_p_value', 'runs', 'chi_sq','chi_sq_p_value','entropy', 'freq_ones', 'freq_zeros', 'block_freqs_max', 'block_freqs_mean', 'block_freq_chi_squared']
        metrics = ['freq_mag_1', 'freq_mag_2', 'freq_mag_3', 'freq_mag_4', 'freq_mag_5', 'autocorrelation_lag1', 'linear_complexity', 'runs_p_value', 'runs', 'chi_sq','chi_sq_p_value','entropy', 'freq_ones', 'freq_zeros', 'block_freqs_max', 'block_freqs_mean', 'block_freq_chi_squared']
        
        
        
        for colname in metrics:
            zscore_col(df, colname)
        
        # Calculate correlation
        correlation_matrix = df[metrics + ['label']].corr()
        correlation_with_target = correlation_matrix['label'].sort_values(ascending=False, key=abs)
        print(correlation_with_target)
        # Split the 'strings' column into separate columns for each character
        df_split = df['bitstrings'].str.split('', expand=True)
        
        # Fill NaN values with an empty string (optional)
        df_split.fillna('', inplace=True)
        
        # Concatenate the split DataFrame with the original DataFrame
        df_combined = pd.concat([df, df_split], axis=1)
        # Prepare input features (X) and target labels (y)
        X = pd.DataFrame(df[metrics])
        
        X = np.array(X)
        y = np.array(df['label'])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
        
        clf = svm.SVC(kernel='rbf', C=0.7)
        clf.fit(X_train, y_train)
        
        y1 = clf.predict(X_train)
        accuracy = accuracy_score(y_train, y1)
        #print("Training accuracy: ", accuracy)
        
        # Make predictions on the test set
        y_pred_gb = clf.predict(X_test)
        
        # Calculate the accuracy of the Gradient Boosting model
        accuracy_gb = accuracy_score(y_test, y_pred_gb)
        print(f"Testing Accuracy For SVM: ", accuracy_gb)
        print('-' * 50)

Reading from:  datasets/mod2sherbrooke.txt datasets/python_random_bitstring.txt
label                     1.000000
linear_complexity        -0.375550
runs                     -0.171039
freq_ones                -0.138774
freq_mag_5               -0.132183
block_freqs_max           0.102282
autocorrelation_lag1      0.098389
block_freq_chi_squared    0.096782
freq_mag_3               -0.087552
freq_mag_2               -0.083381
freq_zeros               -0.081326
runs_p_value              0.053026
freq_mag_4               -0.048429
chi_sq                   -0.040180
entropy                   0.029591
block_freqs_mean          0.025591
freq_mag_1               -0.009625
chi_sq_p_value            0.002453
Name: label, dtype: float64
Testing Accuracy For SVM:  1.0
--------------------------------------------------
Reading from:  datasets/mod2sherbrooke.txt datasets/RNG2-2.txt
label                     1.000000
linear_complexity        -0.052204
freq_zeros               -0.022088
block_freqs_