In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Importing the required libraries (install in your environment first)
import numpy as np
import pandas as pd
from math import log2, sqrt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

In [4]:
# Read in data from your datafile or the provided datafile in this folder to classify QRNG data. 
# Suggested classification strategies include QRNG vs PRNG, QPU vs Simulator, or by individual QPU

#Sample datafile of QRNG (IBM QPUs) vs PRNG data, 12k lines each. Label 1 is QRNG, label 2 is PRNG
data_filePath = 'QRNGvsPRNG_TrainingData.txt'

# Read data file, make dataframe, process labels, and combine/concatenate individual input lines into
# larger input lines to create training and testing datasets to input into gradient boosting model.
# Hint: use train_test_split method from sklearn



df = pd.read_csv(data_filePath, sep=" ", names=['bitstrings', 'Quantum or Classical'])

# Convert 'Quantum or Classical' labels to numeric values (e.g., 1 for QRNG, 2 for PRNG)
# df['Quantum or Classical'] = df['Quantum or Classical'].apply(lambda x: 1 if x == 'QRNG' else 2)

In [5]:
# Example DataFrame (assuming you already have it)
# df['bitstrings'] contains the bitstrings of length 100
# df = pd.DataFrame({'bitstrings': ['1100100100001111110110...', '1010101010101010101010...', ...]})

# Function to calculate Shannon entropy
def shannon_entropy(bitstring):
    # Convert bitstring to a numpy array of integers
    bits = np.array([int(bit) for bit in bitstring])
    
    # Calculate the frequency of 0's and 1's
    counts = np.bincount(bits)
    probabilities = counts / len(bits)
    
    # Filter out zero probabilities to avoid log2(0)
    probabilities = probabilities[probabilities > 0]
    
    # Calculate entropy
    entropy = -np.sum(probabilities * np.log2(probabilities))
    
    return entropy

# Apply the entropy function to each bitstring in the DataFrame
df['entropy'] = df['bitstrings'].apply(shannon_entropy)

# Now the DataFrame 'df' contains an additional 'entropy' column
#print(df[['bitstrings', 'entropy']])


In [20]:
from scipy.stats import chi2
def chi_squared_test(bits):
        counts = np.bincount(list(bits), minlength=2)
        observed = counts
        expected = np.array([len(bits)/2, len(bits)/2])
        chi_sq = np.sum((observed - expected) ** 2 / expected)
        # Degrees of freedom = number of categories - 1 = 1
        p_value = 1 - chi2.cdf(chi_sq, df=1)
        return chi_sq, p_value

# Apply the function and store results in two new columns
df[['chi_sq', 'chi_sq_p_value']] = pd.DataFrame(df['bitstrings'].apply(chi_squared_test).tolist(), index=df.index)

# View the updated DataFrame
print(df[['bitstrings', 'chi_sq', 'chi_sq_p_value']])

                                              bitstrings  chi_sq  \
0      0000101000100111111110011011110111101101010111...    7.84   
1      0100101111010000110010000101001110101001001010...    0.04   
2      1000101010100100011100101111011111001110011101...    1.96   
3      0111101100010110010000011111111001110001100110...    0.36   
4      1111100000011110111111111111101001100100011010...    2.56   
...                                                  ...     ...   
23995  0000100111011010101101011101001111011001011001...    0.36   
23996  1100111110000000100110000101100100111101010011...    0.04   
23997  1001100110110111101011011000101010100111111011...    7.84   
23998  0000010010101011000101010110101000100101001001...    0.64   
23999  1010101001100100100101100111100101010011001010...    0.64   

       chi_sq_p_value  
0            0.005110  
1            0.841481  
2            0.161513  
3            0.548506  
4            0.109599  
...               ...  
23995        0.

In [24]:
from scipy.fft import fft

# Function to get the top 5 dominant magnitudes of frequencies
def dominant_frequencies(bitstring, top_n=5):
    # Convert bitstring to list of integers (0s and 1s)
    bits = np.array([int(bit) for bit in bitstring])
    n = len(bits)  # Length of the bitstring

    # Convert bits to -1 and 1 for FFT
    signal = 2 * bits - 1
    fft_result = fft(signal)
    
    # Only take the positive frequencies
    freqs = np.fft.fftfreq(n)
    magnitudes = np.abs(fft_result)
    positive_freqs = freqs[:n//2]
    positive_magnitudes = magnitudes[:n//2]
    
    # Find the top_n dominant frequencies
    indices = np.argsort(positive_magnitudes)[-top_n:]
    dominant_mags = positive_magnitudes[indices]
    
    # Sort the dominant magnitudes in descending order
    sorted_mags = np.sort(dominant_mags)[::-1]
    
    # If fewer than top_n magnitudes, pad with NaNs
    if len(sorted_mags) < top_n:
        sorted_mags = np.pad(sorted_mags, (0, top_n - len(sorted_mags)), constant_values=np.nan)
    
    return sorted_mags

# Assuming df['bitstrings'] contains the bitstrings of length 100
# Apply the function and expand it into separate columns
df[['freq_mag_1', 'freq_mag_2', 'freq_mag_3', 'freq_mag_4', 'freq_mag_5']] = pd.DataFrame(
    df['bitstrings'].apply(dominant_frequencies).tolist(), index=df.index
)

# View the updated DataFrame with the new columns
print(df[['bitstrings', 'freq_mag_1', 'freq_mag_2', 'freq_mag_3', 'freq_mag_4', 'freq_mag_5']])


                                              bitstrings  freq_mag_1  \
0      0000101000100111111110011011110111101101010111...   28.000000   
1      0100101111010000110010000101001110101001001010...   17.189026   
2      1000101010100100011100101111011111001110011101...   21.908773   
3      0111101100010110010000011111111001110001100110...   22.221626   
4      1111100000011110111111111111101001100100011010...   21.651404   
...                                                  ...         ...   
23995  0000100111011010101101011101001111011001011001...   21.404946   
23996  1100111110000000100110000101100100111101010011...   18.405510   
23997  1001100110110111101011011000101010100111111011...   28.000000   
23998  0000010010101011000101010110101000100101001001...   22.435155   
23999  1010101001100100100101100111100101010011001010...   23.954548   

       freq_mag_2  freq_mag_3  freq_mag_4  freq_mag_5  
0       19.921323   16.173097   15.578477   15.174516  
1       16.956262   16.

In [27]:
def autocorrelation(bitstring, lag=1):
    # Convert the bitstring to a numpy array of integers (0 and 1)
    bits = np.array([int(bit) for bit in bitstring])
    
    if lag >= len(bits):
        raise ValueError("Lag is too large for the bitstream length.")
    
    # Shift the bits by the given lag
    shifted = np.roll(bits, -lag)
    
    # Calculate correlation excluding the wrapped-around elements
    valid_length = len(bits) - lag
    correlation = np.corrcoef(bits[:valid_length], shifted[:valid_length])[0, 1]
    
    return correlation

df['autocorrelation_lag1'] = df['bitstrings'].apply(autocorrelation)

In [52]:
!pip install sp80022suite

Collecting sp80022suite
  Using cached sp80022suite-0.0.8.tar.gz (1.0 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sp80022suite
  Building wheel for sp80022suite (setup.py) ... [?25ldone
[?25h  Created wheel for sp80022suite: filename=sp80022suite-0.0.8-cp311-cp311-linux_x86_64.whl size=1052852 sha256=a0c8708e21851811a6505cc8da6bd352d10a3f7c82d798badff55ded75ee425f
  Stored in directory: /home/jovyan/.cache/pip/wheels/28/5d/0b/fe2a1c641ced7cd3726ced2673263f0f3541f2f23ba6dc45ae
Successfully built sp80022suite
Installing collected packages: sp80022suite
Successfully installed sp80022suite-0.0.8


In [58]:
def frequency_test(bitstring):
    """
    Perform the Frequency Test on a bitstring.
    
    :param bitstring: A 1D numpy array or list of bits (0s and 1s).
    :return: A tuple containing the number of 1s, number of 0s, and the test statistic.
    """
    count_ones = list(bitstring).count('1')
    count_zeros = len(bitstring) - count_ones
    
    return count_ones, count_zeros

def block_frequency_test(bitstring, M):
    """
    Perform the Block Frequency Test on a bitstring.
    
    :param bitstring: A 1D numpy array or list of bits (0s and 1s).
    :param M: Block size for the block frequency test.
    :return: The frequencies of 1s in each block and the test statistic.
    """
    bitstring = [int(c) for c in bitstring]
    n = len(bitstring)
    num_blocks = n // M
    block_frequencies = np.zeros(num_blocks)

    # Compute block frequencies
    for i in range(num_blocks):
        block = bitstring[i * M:(i + 1) * M]
        block_frequencies[i] = np.sum(block)

    # Calculate the mean and variance of the block frequencies
    mean_frequency = np.mean(block_frequencies)
    variance_frequency = np.var(block_frequencies)

    # The expected mean and variance for a random sequence
    expected_mean = M / 2
    expected_variance = M / 4

    # Chi-squared statistic
    chi_squared = ((mean_frequency - expected_mean) ** 2 / expected_variance) + \
                  (variance_frequency / expected_variance)

    return block_frequencies.max(), block_frequencies.mean(), chi_squared

df['freq_ones'], df['freq_zeros'] = zip(*df['bitstrings'].apply(frequency_test))

# Set block size for Block Frequency Test
M = 10  # Example block size
df['block_freqs_max'], df['block_freqs_mean'], df['block_freq_chi_squared'] = zip(*df['bitstrings'].apply(lambda x: block_frequency_test(x, M)))

# Display the updated DataFrame
print(df[['bitstrings', 'freq_ones', 'freq_zeros', 'block_freqs_max', 'block_freqs_mean', 'block_freq_chi_squared']])


                                              bitstrings  freq_ones  \
0      0000101000100111111110011011110111101101010111...         64   
1      0100101111010000110010000101001110101001001010...         49   
2      1000101010100100011100101111011111001110011101...         57   
3      0111101100010110010000011111111001110001100110...         53   
4      1111100000011110111111111111101001100100011010...         58   
...                                                  ...        ...   
23995  0000100111011010101101011101001111011001011001...         47   
23996  1100111110000000100110000101100100111101010011...         49   
23997  1001100110110111101011011000101010100111111011...         64   
23998  0000010010101011000101010110101000100101001001...         46   
23999  1010101001100100100101100111100101010011001010...         46   

       freq_zeros  block_freqs_max  block_freqs_mean  block_freq_chi_squared  
0              36              9.0               6.4                

In [66]:
from scipy.stats import binom

def runs_test(bits):
    """
    Performs the Runs Test on a binary sequence.
    
    Parameters:
    bits (array-like): A binary sequence (1s and 0s).
    
    Returns:
    (int, float): The number of runs and the p-value of the test.
    """
    # Convert bits to a numpy array
    bits = np.array([int(c) for c in bits])
    
    # Count the number of runs
    runs = 1  # Start with the first run
    for i in range(1, len(bits)):
        if bits[i] != bits[i-1]:
            runs += 1
            
    n1 = np.sum(bits)  # Number of 1s
    n0 = len(bits) - n1  # Number of 0s
    
    # Calculate the expected number of runs and variance
    expected_runs = (2 * n1 * n0) / (n1 + n0) + 1
    variance_runs = (2 * n1 * n0 * (2 * n1 * n0 - n1 - n0)) / ((n1 + n0) ** 2 * (n1 + n0 - 1))
    
    # Z-score
    z = (runs - expected_runs) / np.sqrt(variance_runs)
    
    # Calculate p-value from z-score
    p_value = 1 - binom.cdf(runs, n=n1 + n0, p=0.5)  # Note: binom.cdf is not directly usable for z-scores
    
    return runs, p_value

df['runs'], df['runs_p_value'] = zip(*df['bitstrings'].apply(runs_test))
print(df[['runs', 'runs_p_value']])


       runs  runs_p_value
0        46      0.757941
1        52      0.308650
2        51      0.382177
3        49      0.539795
4        42      0.933395
...     ...           ...
23995    55      0.135627
23996    49      0.539795
23997    49      0.539795
23998    59      0.028444
23999    63      0.003319

[24000 rows x 2 columns]


In [72]:
def xor(a, b):
    """
    Returns the XOR of two binary values (0 or 1).
    
    Parameters:
    a (int): First binary value (0 or 1).
    b (int): Second binary value (0 or 1).
    
    Returns:
    int: Result of a XOR b (0 or 1).
    """
    return (a or b) and not (a and b)

def linear_complexity(bits):
    """
    Computes the Linear Complexity of a binary sequence using the Berlekamp-Massey algorithm.
    
    Parameters:
    bits (array-like): A binary sequence (1s and 0s).
    
    Returns:
    int: The linear complexity of the sequence.
    """
    bits = np.array([int(c) for c in bits])
    n = len(bits)
    l = 0  # Linear complexity
    m = -1  # Previous index where the error occurred
    C = np.zeros(n)
    B = np.zeros(n)
    C[0] = 1  # The polynomial is initialized with a leading coefficient of 1
    B[0] = 1
    
    for n in range(n):
        # Calculate the discrepancy
        discrepancy = bits[n]
        for i in range(1, l + 1):
            discrepancy = xor(discrepancy, (C[i] * bits[n - i]))
        
        if discrepancy == 1:  # An error occurred
            T = C.copy()
            for i in range(n - m):
                if m + i < n:
                    C[m + i] = xor(C[m + i], B[i])
            if l <= n // 2:
                l = n + 1 - l
                m = n
                B = T.copy()
    
    return l

df['linear_complexity'] = df['bitstrings'].apply(linear_complexity)
print(df['linear_complexity'])

0        50
1        50
2        51
3        51
4        51
         ..
23995    50
23996    49
23997    50
23998    52
23999    50
Name: linear_complexity, Length: 24000, dtype: int64


In [77]:
# Normalize a specific column in the DataFrame (let's assume 'column_name' is the one you want to normalize)
def zscore_col(df, colname):
    df[colname] = (df[colname] - df[colname].mean()) / df[colname].std()

metrics = ['linear_complexity', 'runs_p_value', 'runs', 'chi_sq','chi_sq_p_value','entropy', 'freq_ones', 'freq_zeros', 'block_freqs_max', 'block_freqs_mean', 'block_freq_chi_squared']



for colname in metrics:
    zscore_col(df, colname)

# Calculate correlation
correlation_matrix = df[metrics + ['Quantum or Classical']].corr()
correlation_with_target = correlation_matrix['Quantum or Classical'].sort_values(ascending=False, key=abs)
print(correlation_with_target)


Quantum or Classical      1.000000
entropy                   0.043439
chi_sq                   -0.043405
freq_zeros                0.038105
freq_ones                -0.038105
block_freqs_mean         -0.038105
chi_sq_p_value            0.032646
block_freqs_max          -0.022674
block_freq_chi_squared   -0.021062
runs_p_value             -0.005076
runs                      0.004439
linear_complexity         0.002260
Name: Quantum or Classical, dtype: float64


In [83]:
# Split the 'strings' column into separate columns for each character
df_split = df['bitstrings'].str.split('', expand=True)

# Fill NaN values with an empty string (optional)
df_split.fillna('', inplace=True)

# Concatenate the split DataFrame with the original DataFrame
df_combined = pd.concat([df, df_split], axis=1)

In [87]:
# Prepare input features (X) and target labels (y)
columns_to_exclude = ['bitstrings', 'Quantum or Classical']
df_data = df.drop(columns=columns_to_exclude)
X = pd.DataFrame(df[metrics])
X = np.array(X)
y = np.array(df['Quantum or Classical'])
y = y - 1
# print(X)
# print(sorted(y))
# print(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train)
print(y_train)

[[ 0.7613944  -0.9206176   0.71592708 ...  0.65513071 -0.62348465
   0.15281114]
 [-0.20699275 -0.23061921  0.11635803 ... -0.45314709 -0.23250347
  -1.34266465]
 [ 0.7613944  -0.23061921  0.11635803 ... -0.45314709  0.1584777
  -1.15573018]
 ...
 [-2.14376706  1.68955127 -1.6823491  ... -0.45314709 -0.42799406
  -0.87532847]
 [ 2.69816871  0.31844351 -0.28335467 ...  0.65513071  0.54945888
   0.15281114]
 [-1.17537991  0.84638844 -0.68306736 ... -0.45314709 -0.23250347
   0.52668009]]
[0 1 1 ... 0 1 1]


In [88]:
# A skeleton for running your training dataframe through a SKLearn gradient boosting model. 
# You can also use other ML frameworks such as Pytorch, XGBoost, etc

from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Create the Gradient Boosting classifier
xgb_model = xgb.XGBClassifier(random_state=93429834)

# Train the model. Define X_train and y_train from your training dataframe using 
# sklearns train_test_split method
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

y1 = xgb_model.predict(X_train)
accuracy = accuracy_score(y_train, y1)
print("Training accuracy: ", accuracy)

# Make predictions on the test set
y_pred_gb = xgb_model.predict(X_test)

# Calculate the accuracy of the Gradient Boosting model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy: ", accuracy_gb)

[0]	validation_0-logloss:0.69239
[1]	validation_0-logloss:0.69274
[2]	validation_0-logloss:0.69296
[3]	validation_0-logloss:0.69305
[4]	validation_0-logloss:0.69373
[5]	validation_0-logloss:0.69354
[6]	validation_0-logloss:0.69383
[7]	validation_0-logloss:0.69423
[8]	validation_0-logloss:0.69480
[9]	validation_0-logloss:0.69484
[10]	validation_0-logloss:0.69494
[11]	validation_0-logloss:0.69504
[12]	validation_0-logloss:0.69548
[13]	validation_0-logloss:0.69594
[14]	validation_0-logloss:0.69618
[15]	validation_0-logloss:0.69633
[16]	validation_0-logloss:0.69698
[17]	validation_0-logloss:0.69712
[18]	validation_0-logloss:0.69734
[19]	validation_0-logloss:0.69772
[20]	validation_0-logloss:0.69740
[21]	validation_0-logloss:0.69772
[22]	validation_0-logloss:0.69795
[23]	validation_0-logloss:0.69845
[24]	validation_0-logloss:0.69869
[25]	validation_0-logloss:0.69876
[26]	validation_0-logloss:0.69847
[27]	validation_0-logloss:0.69871
[28]	validation_0-logloss:0.69906
[29]	validation_0-loglos

In [126]:
# A skeleton for running your training dataframe through a SKLearn gradient boosting model. 
# You can also use other ML frameworks such as Pytorch, XGBoost, etc

# Create the Gradient Boosting classifier
#gb_model = GradientBoostingClassifier(random_state=42, subsample=0.8, n_estimators=100, max_depth=11, loss='exponential', min_samples_leaf=2, learning_rate=0.001)
gb_model = GradientBoostingClassifier(random_state=42, subsample=0.8, n_estimators=100, max_depth=11, loss='log_loss', min_samples_leaf=2, learning_rate=0.001)

# Train the model. Define X_train and y_train from your training dataframe using 
# sklearns train_test_split method
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test)

y1 = gb_model.predict(X_train)
accuracy = accuracy_score(y_train, y1)
print("Training accuracy: ", accuracy)

# Calculate the accuracy of the Gradient Boosting model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy: ", accuracy_gb)

Training accuracy:  0.6563888888888889
Gradient Boosting Accuracy:  0.5218333333333334
