In [30]:
import pickle
import pyarrow.parquet as pq
import pandas as pd

# Load the trained SVC classifier from the pickle file
with open('svm_classifier.pkl', 'rb') as file:
    loaded_svm_classifier = pickle.load(file)

# Specify the file path for the new dataset
new_file_path = "data_collected_errorstat_wo_noise_1_run=250.parquet"

# Use pyarrow to read the new dataset
new_table = pq.read_table(new_file_path)

# Convert the new table to a pandas DataFrame if needed
new_df = new_table.to_pandas()
new_df['ID'] = range(len(new_df))

# Perform any necessary data preprocessing for the new dataset
# ...

# Extract features from the new dataset (similar to previous code)
X_new = new_df[['point_x_shifted_real', 'point_x_shifted_imag']]

# Use the loaded SVM classifier to make predictions on the new dataset
y_pred_new = loaded_svm_classifier.predict(X_new)



In [31]:
# Predict values using the SVM classifier for the entire dataset
new_df['Decoded'] = y_pred_new


In [32]:
from sklearn.metrics import accuracy_score

accuracy_decoded = accuracy_score(new_df['point_label'], new_df['Decoded'])

print("Accuracy for 'Decoded' column:", accuracy_decoded)

Accuracy for 'Decoded' column: 0.9661102294921875


In [33]:
import matplotlib.pyplot as plt
import pandas as pd

X = new_df[['point_x_shifted_real', 'point_x_shifted_imag']]
y = new_df['point_label']
all_predictions = loaded_svm_classifier.predict(X)
# Create a DataFrame with predicted and true labels
dfc = pd.DataFrame({'point_x_shifted_real': X['point_x_shifted_real'],
                   'point_x_shifted_imag': X['point_x_shifted_imag'],
                   'predicted_point_label': all_predictions,
                   'true_point_label': y})

# Get a list of unique class labels
unique_labels = dfc['true_point_label'].unique()


# Create a scatter plot for each class
for label in unique_labels:
    # Filter the data for the current class
    data_class = dfc[dfc['true_point_label'] == label]
    
# THE LOF    

from sklearn.neighbors import LocalOutlierFactor

# Define LOF parameters
n_neighbors = 20  # Number of neighbors to consider
contamination = 0.05  # Expected proportion of outliers

# Create a dictionary to store the LOF results for each class
lof_results = {}

# Get a list of unique class labels
unique_labels = dfc['true_point_label'].unique()

# Perform LOF detection for each class
for label in unique_labels:
    # Filter the data for the current class
    data_class = dfc[dfc['true_point_label'] == label]
    
    # Extract the features for LOF detection
    features = data_class[['point_x_shifted_real', 'point_x_shifted_imag']]
    
    # Initialize the LOF model
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    
    # Fit the LOF model to the data
    lof.fit(features)
    
    # Predict the LOF scores for the data
    lof_scores = lof.negative_outlier_factor_
    
    # Find the IDs of the LOF points
    lof_ids = data_class.index[lof_scores.argsort()[:int(len(lof_scores) * contamination)]]
    
    # Store the results in the dictionary
    lof_results[label] = lof_ids


total_lof_ids = sum(len(ids) for ids in lof_results.values())
print("Total number of LOF IDs:", total_lof_ids)



Total number of LOF IDs: 3269


In [34]:
with open('decision_tree_classifier.pkl', 'rb') as file:
    loaded_dt_classifier = pickle.load(file)

selected_data = pd.DataFrame()
  

for label, lof_ids in lof_results.items():
    # Select the rows from 'df' for the current class
    class_data = new_df[new_df['ID'].isin(lof_ids)]
    selected_data = selected_data.append(class_data, ignore_index=True)

# Define X and y for Decision Tree classification
X_selected = selected_data[['point_x_shifted_real', 'point_x_shifted_imag']]
y_selected = selected_data['point_label']

y_pred_selected = loaded_dt_classifier.predict(X_selected)

# Calculate accuracy
accuracy_selected = accuracy_score(y_selected, y_pred_selected)
print("Accuracy on selected data:", accuracy_selected)

Accuracy on selected data: 0.6099724686448456


  selected_data = selected_data.append(class_data, ignore_index=True)


In [35]:
# Iterate through the unique class labels and their LOF IDs
for label, lof_ids in lof_results.items():
    # Select rows with matching 'ID' values
    mask = new_df['ID'].isin(lof_ids)
    
    # Get the corresponding data
    selected_data = new_df[mask]
    
    # Predict using the Decision Tree classifier
    predictions = loaded_dt_classifier.predict(selected_data[['point_x_shifted_real', 'point_x_shifted_imag']])
    
    # Update the 'Decoded' column for these rows with the Decision Tree predictions
    new_df.loc[mask, 'Decoded'] = predictions


In [36]:


# Calculate accuracy for 'Decoded' and 'point_label' columns
accuracy_decoded = accuracy_score(new_df['point_label'], new_df['Decoded'])

print("Accuracy for 'Decoded' column:", accuracy_decoded)


Accuracy for 'Decoded' column: 0.9707794189453125
