In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pyarrow.parquet as pq
import warnings
import numpy as np
import warnings
warnings.filterwarnings('ignore')
    

# Location of the recorded dataset (Parquet format)
file_path = "data_collected_errorstat_wo_noise_1_run=25.parquet"

# Read the file with pyarrow, then hand the table over to pandas
table = pq.read_table(file_path)
df = table.to_pandas()

# Give every row a running integer identifier (0 .. len(df) - 1)
df['ID'] = range(len(df))

# Features are the two 'point_x_shifted_*' columns; the target is 'point_label'
feature_columns = ['point_x_shifted_real', 'point_x_shifted_imag']
X = df[feature_columns]
y = df['point_label']

# Hold out 20 % of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear-kernel SVM fitted on the training portion (fit() returns the estimator)
classifier = SVC(C=1.0, kernel='linear').fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9668141592920354


In [2]:
import matplotlib.pyplot as plt
import pandas as pd

# SVM predictions for every row of X (training and test rows alike)
all_predictions = classifier.predict(X)

# Per-sample comparison table: the two features, the SVM label and the true label
dfc = X[['point_x_shifted_real', 'point_x_shifted_imag']].copy()
dfc['predicted_point_label'] = all_predictions
dfc['true_point_label'] = y

# Distinct ground-truth labels, in order of first appearance
unique_labels = dfc['true_point_label'].unique()

# Walk over the classes one at a time.
# NOTE(review): the loop body only selects the rows of each class; no plotting
# call is present in this cell, so the loop has no visible effect — confirm
# whether the scatter-plot code was dropped.
for label in unique_labels:
    data_class = dfc.loc[dfc['true_point_label'] == label]
    

In [3]:
from sklearn.neighbors import LocalOutlierFactor

# LOF settings
n_neighbors = 20       # neighbourhood size handed to LocalOutlierFactor
contamination = 0.05   # handed to LOF and also used below as the share of
                       # points per class that is kept as "LOF points"

feature_columns = ['point_x_shifted_real', 'point_x_shifted_imag']

# Distinct ground-truth labels, in order of first appearance
unique_labels = dfc['true_point_label'].unique()

# label -> index labels of the rows with the lowest negative_outlier_factor_
# within that class. Note that classes are formed from the TRUE labels.
lof_results = {}

for label in unique_labels:
    # Rows belonging to the current class
    data_class = dfc.loc[dfc['true_point_label'] == label]

    # Fit LOF on the two feature columns of this class only
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    lof.fit(data_class[feature_columns])
    lof_scores = lof.negative_outlier_factor_

    # Keep the floor(contamination * class size) lowest-scoring rows
    n_flagged = int(len(lof_scores) * contamination)
    ranked_positions = lof_scores.argsort()
    lof_results[label] = data_class.index[ranked_positions[:n_flagged]]

# lof_results now maps every class label to the index labels of its flagged rows



In [4]:
# Add up the number of flagged rows over all classes
total_lof_ids = sum(map(len, lof_results.values()))
print("Total number of LOF IDs:", total_lof_ids)


Total number of LOF IDs: 3267


In [5]:
# Display the loaded frame, including the 'ID' column added right after loading
df

Unnamed: 0,minus_m_1_orig_x_real,point_orig_x_real,plus_m_1_orig_x_real,minus_m_1_orig_x_imag,point_orig_x_imag,plus_m_1_orig_x_imag,point_label,point_x_shifted_real,point_x_shifted_imag,ID
0,1.0,-1.0,1.0,1.0,1.0,1.0,5.0,-1.202012,0.879754,0
1,-1.0,1.0,3.0,1.0,1.0,3.0,7.0,0.172561,0.766695,1
2,1.0,3.0,1.0,1.0,3.0,-1.0,2.0,3.430654,3.386711,2
3,3.0,1.0,1.0,3.0,-1.0,3.0,15.0,0.898628,-1.132466,3
4,1.0,1.0,-1.0,-1.0,3.0,3.0,3.0,1.140634,2.445444,4
...,...,...,...,...,...,...,...,...,...,...
65531,3.0,-1.0,-1.0,1.0,3.0,-3.0,1.0,-0.520789,2.869858,65531
65532,-1.0,-1.0,3.0,3.0,-3.0,-1.0,9.0,-0.622440,-3.027317,65532
65533,-1.0,3.0,3.0,-3.0,-1.0,1.0,14.0,3.210507,-1.299906,65533
65534,3.0,3.0,1.0,-1.0,1.0,1.0,6.0,3.362015,1.604537,65534


In [6]:
# Display the comparison frame: features, SVM-predicted label and true label
dfc

Unnamed: 0,point_x_shifted_real,point_x_shifted_imag,predicted_point_label,true_point_label
0,-1.202012,0.879754,5.0,5.0
1,0.172561,0.766695,7.0,7.0
2,3.430654,3.386711,2.0,2.0
3,0.898628,-1.132466,15.0,15.0
4,1.140634,2.445444,3.0,3.0
...,...,...,...,...
65531,-0.520789,2.869858,1.0,1.0
65532,-0.622440,-3.027317,9.0,9.0
65533,3.210507,-1.299906,14.0,14.0
65534,3.362015,1.604537,6.0,6.0


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Gather the LOF-flagged rows of `df`, one frame per class, and stack them once.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the per-class frames are collected in a list and combined
# with a single pd.concat. Row order is unchanged: classes in dict order,
# rows in `df` order within each class.
flagged_frames = [
    df[df['ID'].isin(lof_ids)]
    for lof_ids in lof_results.values()
]
selected_data = pd.concat(flagged_frames, ignore_index=True)

# Features and target for the Decision Tree
X_selected = selected_data[['point_x_shifted_real', 'point_x_shifted_imag']]
y_selected = selected_data['point_label']

# Hold out 20 % of the flagged rows for evaluation (fixed seed)
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(
    X_selected, y_selected, test_size=0.2, random_state=42)

# Decision Tree with a fixed seed so that re-running the cell rebuilds the same
# tree (scikit-learn permutes the features at every split, so an unseeded tree
# may differ between runs). 42 matches the seed used for the splits.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit on the training portion of the flagged rows
dt_classifier.fit(X_train_selected, y_train_selected)

# Predict the held-out flagged rows
y_pred_selected = dt_classifier.predict(X_test_selected)

# Accuracy on the held-out flagged rows
accuracy_selected = accuracy_score(y_test_selected, y_pred_selected)
print("Accuracy on selected data:", accuracy_selected)


Accuracy on selected data: 0.6605504587155964


In [8]:
from sklearn.metrics import accuracy_score

# Baseline decoding: let the SVM label every row of the dataset
df['Decoded'] = classifier.predict(X)

# Fraction of rows whose decoded label equals the ground-truth label
accuracy_decoded = accuracy_score(y_true=df['point_label'], y_pred=df['Decoded'])

print("Accuracy for 'Decoded' column:", accuracy_decoded)


Accuracy for 'Decoded' column: 0.967315673828125


In [9]:
# Replace the SVM decision with the Decision Tree decision on every row that
# LOF flagged, for all classes in one pass.
#
# - The IDs of all classes are merged so the tree is called once instead of
#   once per class; tree predictions are made row by row, so the result is the
#   same as predicting class by class.
# - A class with no flagged rows no longer breaks the cell: scikit-learn
#   estimators reject zero-sample input, which the per-class call would hit.
# - Separate names are used so `selected_data` (the tree's training frame
#   built in an earlier cell) is not overwritten here.
#
# NOTE(review): the rows updated here include the rows the Decision Tree was
# fitted on, so an accuracy computed over 'Decoded' afterwards is not a
# held-out estimate for those rows.
lof_feature_columns = ['point_x_shifted_real', 'point_x_shifted_imag']

# Union of the flagged IDs over all classes
flagged_ids = [row_id for lof_ids in lof_results.values() for row_id in lof_ids]
lof_mask = df['ID'].isin(flagged_ids)

if lof_mask.any():
    # Boolean-mask selection and assignment both follow `df` row order
    df.loc[lof_mask, 'Decoded'] = dt_classifier.predict(
        df.loc[lof_mask, lof_feature_columns]
    )

# Display the frame with the updated 'Decoded' column
df

Unnamed: 0,minus_m_1_orig_x_real,point_orig_x_real,plus_m_1_orig_x_real,minus_m_1_orig_x_imag,point_orig_x_imag,plus_m_1_orig_x_imag,point_label,point_x_shifted_real,point_x_shifted_imag,ID,Decoded
0,1.0,-1.0,1.0,1.0,1.0,1.0,5.0,-1.202012,0.879754,0,5.0
1,-1.0,1.0,3.0,1.0,1.0,3.0,7.0,0.172561,0.766695,1,7.0
2,1.0,3.0,1.0,1.0,3.0,-1.0,2.0,3.430654,3.386711,2,2.0
3,3.0,1.0,1.0,3.0,-1.0,3.0,15.0,0.898628,-1.132466,3,15.0
4,1.0,1.0,-1.0,-1.0,3.0,3.0,3.0,1.140634,2.445444,4,3.0
...,...,...,...,...,...,...,...,...,...,...,...
65531,3.0,-1.0,-1.0,1.0,3.0,-3.0,1.0,-0.520789,2.869858,65531,1.0
65532,-1.0,-1.0,3.0,3.0,-3.0,-1.0,9.0,-0.622440,-3.027317,65532,9.0
65533,-1.0,3.0,3.0,-3.0,-1.0,1.0,14.0,3.210507,-1.299906,65533,14.0
65534,3.0,3.0,1.0,-1.0,1.0,1.0,6.0,3.362015,1.604537,65534,6.0


In [10]:
# Score the 'Decoded' column as it currently stands in df against 'point_label'
accuracy_decoded = accuracy_score(y_true=df['point_label'], y_pred=df['Decoded'])

print("Accuracy for 'Decoded' column:", accuracy_decoded)


Accuracy for 'Decoded' column: 0.985870361328125


In [11]:
import pickle

# Persist both fitted models with pickle, SVM first and Decision Tree second.
# Each file is opened in binary write mode and closed as soon as its dump is done.
for pkl_path, fitted_model in (
    ('svm_classifier.pkl', classifier),
    ('decision_tree_classifier.pkl', dt_classifier),
):
    with open(pkl_path, 'wb') as file:
        pickle.dump(fitted_model, file)


In [12]:
# Display df again; 'Decoded' holds the SVM labels, with the LOF-flagged rows
# overwritten by the Decision Tree predictions in an earlier cell
df

Unnamed: 0,minus_m_1_orig_x_real,point_orig_x_real,plus_m_1_orig_x_real,minus_m_1_orig_x_imag,point_orig_x_imag,plus_m_1_orig_x_imag,point_label,point_x_shifted_real,point_x_shifted_imag,ID,Decoded
0,1.0,-1.0,1.0,1.0,1.0,1.0,5.0,-1.202012,0.879754,0,5.0
1,-1.0,1.0,3.0,1.0,1.0,3.0,7.0,0.172561,0.766695,1,7.0
2,1.0,3.0,1.0,1.0,3.0,-1.0,2.0,3.430654,3.386711,2,2.0
3,3.0,1.0,1.0,3.0,-1.0,3.0,15.0,0.898628,-1.132466,3,15.0
4,1.0,1.0,-1.0,-1.0,3.0,3.0,3.0,1.140634,2.445444,4,3.0
...,...,...,...,...,...,...,...,...,...,...,...
65531,3.0,-1.0,-1.0,1.0,3.0,-3.0,1.0,-0.520789,2.869858,65531,1.0
65532,-1.0,-1.0,3.0,3.0,-3.0,-1.0,9.0,-0.622440,-3.027317,65532,9.0
65533,-1.0,3.0,3.0,-3.0,-1.0,1.0,14.0,3.210507,-1.299906,65533,14.0
65534,3.0,3.0,1.0,-1.0,1.0,1.0,6.0,3.362015,1.604537,65534,6.0
