#README
To run this code you need to use Google Colab. The project specifies that undergraduate students use datasets 1-5 for classification and datasets 1 and 2 for missing value estimation. Additionally, the project mentions that dataset 3 for missing value estimation is a bonus problem for undergraduate students.

Once that is understood then you can move on to the next step.

The next step is as simple as dragging and dropping the following files into Google Colab's Files section, located on the left side of the page and shown as a file icon:
- TestData1.txt
- TestData2.txt
- TestData3.txt
- TestData4.txt
- TestData5.txt
- TrainData1.txt
- TrainData2.txt
- TrainData3.txt
- TrainData4.txt
- TrainData5.txt
- TrainLabel1.txt
- TrainLabel2.txt
- TrainLabel3.txt
- TrainLabel4.txt
- TrainLabel5.txt
- MissingData1.txt
- MissingData2.txt
- MissingData3.txt

Once that is completed go to the Runtime tab at the top of the page and select "Change runtime type", under "Hardware Accelerator" select "TPU" and then "Save".

Finally, go to the Runtime tab once again and select "Run all" to run the code.

Once the code has finished running, you'll see the result files in the same Files section, named after their method and dataset number. The file names will look like this:

Imputed_KNN_MissingDataX.txt
Imputed_Mean_MissingDataX.txt
TestResult_RF_DatasetX.txt
TestResult_SVM_DatasetX.txt

If you want to download these files you will need to select them individually. Start by hovering over the file you want to download and select the three vertical dots that appear on the right of the file name. The download option will appear, select it and save it onto your desktop.

### These Are All the Imports Used Throughout the Code

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold


#The Following is Part 1: Classification

##Classification Using Random Forest

###The Following is the Imputation, Training, and Prediction Results of TrainData1.txt, TrainLabel1.txt, and TestData1.txt
1 uses mean imputation

In [None]:
# Dataset 1 with a Random Forest: load, mean-impute, cross-validate, predict, export.
dataset_name = "Dataset1"
output_file_path = f"TestResult_RF_{dataset_name}.txt"

# Read the training features, training labels, and test features.
# None of the files has a header row; the two feature files are tab-separated.
training_data1 = pd.read_csv("TrainData1.txt", sep='\t', header=None)
training_label1 = pd.read_csv("TrainLabel1.txt", header=None)
testing_data1 = pd.read_csv("TestData1.txt", sep='\t', header=None)

# 1.00000000000000e+99 is treated as the missing-value marker; swap it for NaN
# so the imputer can recognise it.
training_data1 = training_data1.replace(1.00000000000000e+99, np.nan)
testing_data1 = testing_data1.replace(1.00000000000000e+99, np.nan)

# Fill NaNs with per-column means learned from the training data; the same
# fitted means are then applied to the test data.
imputer = SimpleImputer(strategy='mean')
imputer.fit(training_data1)
training_data1_imputed = pd.DataFrame(
    imputer.transform(training_data1), columns=training_data1.columns
)
testing_data1_imputed = pd.DataFrame(
    imputer.transform(testing_data1), columns=training_data1.columns
)

# 3-fold stratified cross-validation with a fixed shuffle seed
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Classification model (Random Forest, default hyper-parameters)
clf = RandomForestClassifier()

# Flatten the single-column label frame into a 1-D array once and reuse it
train_targets1 = training_label1.values.ravel()

# Report per-fold and mean accuracy
scores1 = cross_val_score(
    clf, training_data1_imputed, train_targets1, cv=kfold, scoring='accuracy'
)
print(f"Cross-Validation Scores: {scores1}")
print(f"Mean accuracy: {np.mean(scores1)}")

# Refit on the full training set, then label the test set
clf.fit(training_data1_imputed, train_targets1)
y_testing_prediction = clf.predict(testing_data1_imputed)

# Write one predicted label per line (no index column, no header row)
pd.DataFrame(y_testing_prediction).to_csv(output_file_path, index=False, header=None)
print(f"Test results for {dataset_name} exported to {output_file_path}")


Cross-Validation Scores: [0.92 0.88 0.94]
Mean accuracy: 0.9133333333333334
Test results for Dataset1 exported to TestResult_RF_Dataset1.txt


###The following is the Imputation, Training, and Prediction Results of TrainData2, TrainLabel2, and TestData2
2 uses KNN imputation

In [None]:
def process_dataset2(data, imputer=None):
    """Coerce a raw table to numeric values and fill its gaps with KNN imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Table as loaded from disk. Cells that cannot be parsed as numbers
        become NaN and are then imputed.
    imputer : KNNImputer, optional
        An already-fitted imputer. When given, it is only applied
        (``transform``), so test data can be filled from the neighbours learned
        on the training data. When omitted, a new 3-neighbour imputer is
        fitted on ``data`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The imputed values under the same column labels as ``data``.
    """
    # Column-wise conversion; replaces the deprecated element-wise applymap.
    data = data.apply(pd.to_numeric, errors='coerce')

    if imputer is None:
        imputer = KNNImputer(n_neighbors=3)  # You can adjust the number of neighbors as needed
        imputed_values = imputer.fit_transform(data)
    else:
        imputed_values = imputer.transform(data)

    return pd.DataFrame(imputed_values, columns=data.columns)


def train_and_predict2(dataset_name, train_data_file, train_label_file, test_data_file):
    """Cross-validate a Random Forest on one dataset, then predict and export test labels.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed messages and in the output file name
        ``TestResult_RF_<dataset_name>.txt``.
    train_data_file, train_label_file, test_data_file : str
        Paths to the header-less training features (whitespace-separated),
        training labels, and test features (whitespace-separated).

    Returns
    -------
    None
        Prints the 4-fold cross-validation accuracies and writes the predicted
        test labels, one integer per line, to the output file.
    """
    output_file_path = f"TestResult_RF_{dataset_name}.txt"

    # Load the training data (raw string: '\s' is not a valid str escape)
    training_data = pd.read_csv(train_data_file, delimiter=r'\s+', header=None, engine='python')
    training_labels = pd.read_csv(train_label_file, header=None, engine='python').values.ravel()

    # Load the testing data
    testing_data = pd.read_csv(test_data_file, delimiter=r'\s+', header=None)

    # Fit the imputer on the training data only and reuse it for the test data,
    # so both sets are filled from the same reference rows (this matches how
    # the Dataset1 and Dataset3 cells treat their test sets).
    imputer = KNNImputer(n_neighbors=3)
    imputer.fit(training_data.apply(pd.to_numeric, errors='coerce'))
    training_data_imputed = process_dataset2(training_data, imputer=imputer)
    testing_data_imputed = process_dataset2(testing_data, imputer=imputer)

    # Classification Model (Random Forest)
    clf = RandomForestClassifier()

    # Use StratifiedKFold for cross-validation
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # Perform cross-validation on the entire training set
    scores245 = cross_val_score(clf, training_data_imputed, training_labels, cv=kfold, scoring='accuracy')

    # Print the accuracy for each fold
    print(f"Cross-Validation Scores for {dataset_name}: {scores245}")
    print(f"Mean accuracy: {np.mean(scores245)}")

    # Fit the model on the entire training set
    clf.fit(training_data_imputed, training_labels)

    # Test Label Prediction
    y_testing_prediction = clf.predict(testing_data_imputed)

    # Output Results to a Text File
    np.savetxt(output_file_path, y_testing_prediction, fmt='%d')
    print(f"Test results for {dataset_name} exported to {output_file_path}")
    print()

# Call the function for each dataset
train_and_predict2("Dataset2", "TrainData2.txt", "TrainLabel2.txt", "TestData2.txt")


Cross-Validation Scores for Dataset2: [0.96 0.8  0.84 0.84]
Mean accuracy: 0.86
Test results for Dataset2 exported to TestResult_RF_Dataset2.txt



###The Following is the Imputation, Training, and Prediction Results of TrainData3.txt, TrainLabel3.txt, and TestData3.txt
3 uses KNN imputation

In [None]:
def train_and_predict3(dataset_name, train_data_path, train_label_path, test_data_path):
    """Cross-validate a Random Forest on one dataset, then predict and export test labels.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed messages and in the output file name
        ``TestResult_RF_<dataset_name>.txt``.
    train_data_path : str
        Tab-separated, header-less training features.
    train_label_path : str
        Header-less training labels.
    test_data_path : str
        Comma-separated, header-less test features.

    Returns
    -------
    None
        Prints the 5-fold cross-validation accuracies and writes the predicted
        test labels to the output file (no index column, no header row).
    """
    output_file_path = f"TestResult_RF_{dataset_name}.txt"

    # Read the three inputs; labels are flattened to a 1-D array right away
    features = pd.read_csv(train_data_path, sep='\t', header=None)
    targets = pd.read_csv(train_label_path, header=None).values.ravel()
    test_features = pd.read_csv(test_data_path, sep=',', header=None)

    # The two files use different placeholder values for missing entries
    # NOTE(review): 1e99 for train vs. 1000000000 for test — confirm against the data files
    features = features.replace(1.00000000000000e+99, np.nan)
    test_features = test_features.replace(1000000000, np.nan)

    # KNN imputation: neighbours are learned on the training set and reused for the test set
    knn = KNNImputer(n_neighbors=3)  # You can adjust the number of neighbors as needed
    features_filled = pd.DataFrame(knn.fit_transform(features), columns=features.columns)
    test_filled = pd.DataFrame(knn.transform(test_features), columns=test_features.columns)

    # 5-fold stratified cross-validation with a fixed shuffle seed
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    forest = RandomForestClassifier()

    fold_scores = cross_val_score(forest, features_filled, targets, cv=folds, scoring='accuracy')
    print(f"Cross-Validation Scores for {dataset_name}: {fold_scores}")
    print(f"Mean accuracy: {np.mean(fold_scores)}")

    # Refit on the full training set, then label the test set
    forest.fit(features_filled, targets)
    predictions = forest.predict(test_filled)

    # Export the predictions
    pd.DataFrame(predictions).to_csv(output_file_path, index=False, header=None)

    print(f"Test results for {dataset_name} exported to {output_file_path}")

# Run the Random Forest pipeline on Dataset3
train_and_predict3("Dataset3", "TrainData3.txt", "TrainLabel3.txt", "TestData3.txt")


Cross-Validation Scores for Dataset3: [0.30793651 0.31269841 0.31984127 0.32301587 0.32460317]
Mean accuracy: 0.3176190476190476
Test results for Dataset3 exported to TestResult_RF_Dataset3.txt


###The following is the Imputation, Training, and Prediction Results of TrainData, TrainLabel, and Testdata of 4 and 5.
4 and 5 use mean imputation

In [None]:
def process_dataset45(data, imputer=None):
    """Coerce a raw table to numeric values and fill its gaps with column means.

    Parameters
    ----------
    data : pandas.DataFrame
        Table as loaded from disk. Cells that cannot be parsed as numbers
        become NaN and are then imputed.
    imputer : SimpleImputer, optional
        An already-fitted imputer. When given, it is only applied
        (``transform``), so test data can be filled with the means learned on
        the training data. When omitted, a new mean imputer is fitted on
        ``data`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The imputed values under the same column labels as ``data``.
    """
    # Column-wise conversion; replaces the deprecated element-wise applymap.
    data = data.apply(pd.to_numeric, errors='coerce')

    if imputer is None:
        imputer = SimpleImputer(strategy='mean')
        imputed_values = imputer.fit_transform(data)
    else:
        imputed_values = imputer.transform(data)

    return pd.DataFrame(imputed_values, columns=data.columns)


def train_and_predict45(dataset_name, train_data_file, train_label_file, test_data_file):
    """Cross-validate a Random Forest on one dataset, then predict and export test labels.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed messages and in the output file name
        ``TestResult_RF_<dataset_name>.txt``.
    train_data_file, train_label_file, test_data_file : str
        Paths to the header-less training features (whitespace-separated),
        training labels, and test features (whitespace-separated).

    Returns
    -------
    None
        Prints the 4-fold cross-validation accuracies and writes the predicted
        test labels, one integer per line, to the output file.
    """
    output_file_path = f"TestResult_RF_{dataset_name}.txt"

    # Load the training data (raw string: '\s' is not a valid str escape)
    training_data = pd.read_csv(train_data_file, delimiter=r'\s+', header=None, engine='python')
    training_labels = pd.read_csv(train_label_file, header=None, engine='python').values.ravel()

    # Load the testing data
    testing_data = pd.read_csv(test_data_file, delimiter=r'\s+', header=None)

    # Learn the column means on the training data only and reuse them for the
    # test data, so both sets are filled with the same values (this matches
    # how the Dataset1 cell treats its test set).
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(training_data.apply(pd.to_numeric, errors='coerce'))
    training_data_imputed = process_dataset45(training_data, imputer=imputer)
    testing_data_imputed = process_dataset45(testing_data, imputer=imputer)

    # Classification Model (Random Forest)
    clf = RandomForestClassifier()

    # Use StratifiedKFold for cross-validation
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # Perform cross-validation on the entire training set
    scores245 = cross_val_score(clf, training_data_imputed, training_labels, cv=kfold, scoring='accuracy')

    # Print the accuracy for each fold
    print(f"Cross-Validation Scores for {dataset_name}: {scores245}")
    print(f"Mean accuracy: {np.mean(scores245)}")

    # Fit the model on the entire training set
    clf.fit(training_data_imputed, training_labels)

    # Test Label Prediction
    y_testing_prediction = clf.predict(testing_data_imputed)

    # Output Results to a Text File
    np.savetxt(output_file_path, y_testing_prediction, fmt='%d')
    print(f"Test results for {dataset_name} exported to {output_file_path}")
    print()

train_and_predict45("Dataset4", "TrainData4.txt", "TrainLabel4.txt", "TestData4.txt")
train_and_predict45("Dataset5", "TrainData5.txt", "TrainLabel5.txt", "TestData5.txt")


Cross-Validation Scores for Dataset4: [0.96546311 0.94976452 0.95761381 0.97484277]
Mean accuracy: 0.9619210528914033
Test results for Dataset4 exported to TestResult_RF_Dataset4.txt

Cross-Validation Scores for Dataset5: [0.58928571 0.69642857 0.65357143 0.65232975]
Mean accuracy: 0.6479038658474142
Test results for Dataset5 exported to TestResult_RF_Dataset5.txt



##Classification Using SVM (Support Vector Machine)

###The Following is the Imputation, Training, and Prediction Results of TrainData1.txt, TrainLabel1.txt, and TestData1.txt

In [None]:
# Dataset 1 with an SVM: load, standardise, mean-impute, cross-validate, predict, export.
dataset_name = "Dataset1"
output_file_path = f"TestResult_SVM_{dataset_name}.txt"

# Read the training features, training labels, and test features.
# None of the files has a header row; the two feature files are tab-separated.
training_data1 = pd.read_csv("TrainData1.txt", sep='\t', header=None)
training_label1 = pd.read_csv("TrainLabel1.txt", header=None)
testing_data1 = pd.read_csv("TestData1.txt", sep='\t', header=None)

# 1.00000000000000e+99 is treated as the missing-value marker; swap it for NaN
training_data1 = training_data1.replace(1.00000000000000e+99, np.nan)
testing_data1 = testing_data1.replace(1.00000000000000e+99, np.nan)

# Standardise the features using statistics learned on the training set.
# Scaling happens before imputation here, so the frames still contain NaN.
scaler = StandardScaler()
scaler.fit(training_data1)
training_data1_scaled = pd.DataFrame(
    scaler.transform(training_data1), columns=training_data1.columns
)
testing_data1_scaled = pd.DataFrame(
    scaler.transform(testing_data1), columns=training_data1.columns
)

# Fill the remaining NaNs with the per-column means of the scaled training data;
# the same fitted means are applied to the scaled test data.
imputer = SimpleImputer(strategy='mean')
imputer.fit(training_data1_scaled)
training_data1_imputed = pd.DataFrame(
    imputer.transform(training_data1_scaled), columns=training_data1_scaled.columns
)
testing_data1_imputed = pd.DataFrame(
    imputer.transform(testing_data1_scaled), columns=training_data1_scaled.columns
)

# 3-fold stratified cross-validation with a fixed shuffle seed
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Classification model (Support Vector Machine, default hyper-parameters)
clf = SVC()

# Flatten the single-column label frame into a 1-D array once and reuse it
train_targets1 = training_label1.values.ravel()

# Report per-fold and mean accuracy
scores1 = cross_val_score(
    clf, training_data1_imputed, train_targets1, cv=kfold, scoring='accuracy'
)
print(f"Cross-Validation Scores: {scores1}")
print(f"Mean accuracy: {np.mean(scores1)}")

# Refit on the full training set, then label the test set
clf.fit(training_data1_imputed, train_targets1)
y_testing_prediction = clf.predict(testing_data1_imputed)

# Write one predicted label per line (no index column, no header row)
pd.DataFrame(y_testing_prediction).to_csv(output_file_path, index=False, header=None)
print(f"Test results for {dataset_name} exported to {output_file_path}")


Cross-Validation Scores: [0.92 0.86 0.9 ]
Mean accuracy: 0.8933333333333334
Test results for Dataset1 exported to TestResult_SVM_Dataset1.txt


###The following is the Imputation, Training, and Prediction Results of TrainData, TrainLabel, and Testdata of 2, 4, and 5.

In [None]:
def process_dataset245(data, imputer=None):
    """Coerce a raw table to numeric values and fill its gaps with column means.

    Parameters
    ----------
    data : pandas.DataFrame
        Table as loaded from disk. Cells that cannot be parsed as numbers
        become NaN and are then imputed.
    imputer : SimpleImputer, optional
        An already-fitted imputer. When given, it is only applied
        (``transform``), so test data can be filled with the means learned on
        the training data. When omitted, a new mean imputer is fitted on
        ``data`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The imputed values under the same column labels as ``data``.
    """
    # Column-wise conversion; replaces the deprecated element-wise applymap.
    data = data.apply(pd.to_numeric, errors='coerce')

    if imputer is None:
        imputer = SimpleImputer(strategy='mean')
        imputed_values = imputer.fit_transform(data)
    else:
        imputed_values = imputer.transform(data)

    return pd.DataFrame(imputed_values, columns=data.columns)


def train_and_predict245(dataset_name, train_data_file, train_label_file, test_data_file):
    """Cross-validate an SVM on one dataset, then predict and export test labels.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed messages and in the output file name
        ``TestResult_SVM_<dataset_name>.txt``.
    train_data_file, train_label_file, test_data_file : str
        Paths to the header-less training features (whitespace-separated),
        training labels, and test features (whitespace-separated).

    Returns
    -------
    None
        Prints the 4-fold cross-validation accuracies and writes the predicted
        test labels, one integer per line, to the output file.
    """
    output_file_path = f"TestResult_SVM_{dataset_name}.txt"

    # Load the training data (raw string: '\s' is not a valid str escape)
    training_data = pd.read_csv(train_data_file, delimiter=r'\s+', header=None, engine='python')
    training_labels = pd.read_csv(train_label_file, header=None, engine='python').values.ravel()

    # Load the testing data
    testing_data = pd.read_csv(test_data_file, delimiter=r'\s+', header=None)

    # Learn the column means on the training data only and reuse them for the
    # test data, mirroring how the scaler below is fitted on train and applied
    # to test.
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(training_data.apply(pd.to_numeric, errors='coerce'))
    training_data_imputed = process_dataset245(training_data, imputer=imputer)
    testing_data_imputed = process_dataset245(testing_data, imputer=imputer)

    # The SVM classifier is sensitive to feature scales, so it's a good idea to standardize the data
    scaler = StandardScaler()
    training_data_scaled = pd.DataFrame(scaler.fit_transform(training_data_imputed), columns=training_data_imputed.columns)
    testing_data_scaled = pd.DataFrame(scaler.transform(testing_data_imputed), columns=training_data_imputed.columns)

    # Classification Model (Support Vector Machine)
    clf = SVC()

    # Use StratifiedKFold for cross-validation
    kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    # Perform cross-validation on the entire training set
    scores245 = cross_val_score(clf, training_data_scaled, training_labels, cv=kfold, scoring='accuracy')

    # Print the accuracy for each fold
    print(f"Cross-Validation Scores for {dataset_name}: {scores245}")
    print(f"Mean accuracy: {np.mean(scores245)}")

    # Fit the model on the entire training set
    clf.fit(training_data_scaled, training_labels)

    # Test Label Prediction
    y_testing_prediction = clf.predict(testing_data_scaled)

    # Output Results to a Text File
    np.savetxt(output_file_path, y_testing_prediction, fmt='%d')
    print(f"Test results for {dataset_name} exported to {output_file_path}")
    print()

# Call the function for each dataset
train_and_predict245("Dataset2", "TrainData2.txt", "TrainLabel2.txt", "TestData2.txt")
train_and_predict245("Dataset4", "TrainData4.txt", "TrainLabel4.txt", "TestData4.txt")
train_and_predict245("Dataset5", "TrainData5.txt", "TrainLabel5.txt", "TestData5.txt")


Cross-Validation Scores for Dataset2: [0.88 0.84 0.76 0.84]
Mean accuracy: 0.83
Test results for Dataset2 exported to TestResult_SVM_Dataset2.txt

Cross-Validation Scores for Dataset4: [0.89638932 0.89638932 0.90423862 0.91823899]
Mean accuracy: 0.9038140655391329
Test results for Dataset4 exported to TestResult_SVM_Dataset4.txt

Cross-Validation Scores for Dataset5: [0.575      0.62857143 0.60714286 0.59498208]
Mean accuracy: 0.601424091141833
Test results for Dataset5 exported to TestResult_SVM_Dataset5.txt



###The Following is the Imputation, Training, and Prediction Results of TrainData3.txt, TrainLabel3.txt, and TestData3.txt

In [None]:
def train_and_predict3(dataset_name, train_data_path, train_label_path, test_data_path):
    """Cross-validate an SVM on one dataset, then predict and export test labels.

    Parameters
    ----------
    dataset_name : str
        Name used in the printed messages and in the output file name
        ``TestResult_SVM_<dataset_name>.txt``.
    train_data_path : str
        Tab-separated, header-less training features.
    train_label_path : str
        Header-less training labels.
    test_data_path : str
        Comma-separated, header-less test features.

    Returns
    -------
    None
        Prints the 5-fold cross-validation accuracies and writes the predicted
        test labels to the output file (no index column, no header row).
    """
    output_file_path = f"TestResult_SVM_{dataset_name}.txt"

    # Read the three inputs; labels are flattened to a 1-D array right away
    features = pd.read_csv(train_data_path, sep='\t', header=None)
    targets = pd.read_csv(train_label_path, header=None).values.ravel()
    test_features = pd.read_csv(test_data_path, sep=',', header=None)

    # The two files use different placeholder values for missing entries
    # NOTE(review): 1e99 for train vs. 1000000000 for test — confirm against the data files
    features = features.replace(1.00000000000000e+99, np.nan)
    test_features = test_features.replace(1000000000, np.nan)

    # KNN imputation: neighbours are learned on the training set and reused for the test set
    knn = KNNImputer(n_neighbors=3)  # You can adjust the number of neighbors as needed
    features_filled = pd.DataFrame(knn.fit_transform(features), columns=features.columns)
    test_filled = pd.DataFrame(knn.transform(test_features), columns=test_features.columns)

    # Standardise with statistics learned on the imputed training set
    standardizer = StandardScaler()
    features_scaled = pd.DataFrame(
        standardizer.fit_transform(features_filled), columns=features_filled.columns
    )
    test_scaled = pd.DataFrame(
        standardizer.transform(test_filled), columns=features_filled.columns
    )

    # 5-fold stratified cross-validation with a fixed shuffle seed
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    svm = SVC()

    fold_scores = cross_val_score(svm, features_scaled, targets, cv=folds, scoring='accuracy')
    print(f"Cross-Validation Scores for {dataset_name}: {fold_scores}")
    print(f"Mean accuracy: {np.mean(fold_scores)}")

    # Refit on the full training set, then label the test set
    svm.fit(features_scaled, targets)
    predictions = svm.predict(test_scaled)

    # Export the predictions
    pd.DataFrame(predictions).to_csv(output_file_path, index=False, header=None)

    print(f"Test results for {dataset_name} exported to {output_file_path}")

# Run the SVM pipeline on Dataset3
train_and_predict3("Dataset3", "TrainData3.txt", "TrainLabel3.txt", "TestData3.txt")


Cross-Validation Scores for Dataset3: [0.32936508 0.32857143 0.35079365 0.34365079 0.33809524]
Mean accuracy: 0.3380952380952381
Test results for Dataset3 exported to TestResult_SVM_Dataset3.txt


#The Following is the Beginning of Part 2: Missing Value Estimation

##The Following is the Missing Value Estimation of Dataset1, 2, and 3

### First Method is the Previously Used Mean Imputation

In [None]:
def mean_imputation(file_path):
    """Load a whitespace-separated table and fill its missing entries with column means.

    Parameters
    ----------
    file_path : str
        Path to a header-less, whitespace-separated data file in which
        1.00000000000000e+99 stands in for a missing value.

    Returns
    -------
    pandas.DataFrame
        The table with every missing entry replaced by the mean of its column.
    """
    # Raw string: '\s' is not a valid str escape and triggers a warning otherwise
    dataset = pd.read_csv(file_path, delimiter=r'\s+', header=None)

    # Replace 1.00000000000000e+99 with NaN so the imputer recognises it as missing
    dataset = dataset.replace(1.00000000000000e+99, np.nan)

    # Impute missing values with the mean of each column
    imputer = SimpleImputer(strategy='mean')
    dataset_imputed = pd.DataFrame(imputer.fit_transform(dataset), columns=dataset.columns)

    return dataset_imputed

# Mean-impute each MissingData file, print the result, and save it.

# Imputing Dataset 1
file_path_dataset1 = "MissingData1.txt"
dataset1_imputed = mean_imputation(file_path_dataset1)
print("Imputed Dataset 1:")
print(dataset1_imputed)
print()


# Imputing Dataset 2
file_path_dataset2 = "MissingData2.txt"
dataset2_imputed = mean_imputation(file_path_dataset2)
print("Imputed Dataset 2:")
print(dataset2_imputed)
print()


# Imputing Dataset 3
file_path_dataset3 = "MissingData3.txt"
dataset3_imputed = mean_imputation(file_path_dataset3)
print("Imputed Dataset 3:")
print(dataset3_imputed)
print()


# Saving Imputed Datasets
# header=False: the input files have no header row, so the positional column
# labels (0, 1, 2, ...) must not be written as an extra first line.
dataset1_imputed.to_csv("Imputed_Mean_MissingData1.txt", sep='\t', index=False, header=False)
dataset2_imputed.to_csv("Imputed_Mean_MissingData2.txt", sep='\t', index=False, header=False)
dataset3_imputed.to_csv("Imputed_Mean_MissingData3.txt", sep='\t', index=False, header=False)

Imputed Dataset 1:
       0     1         2         3         4     5     6     7     8     9   \
0   -0.11  0.02 -0.360000 -0.110000  0.480000 -0.20  0.27  0.29 -0.34 -0.05   
1   -0.30 -0.37 -0.180000 -0.090000 -0.160000 -0.16 -0.10 -0.09  0.46  0.32   
2    0.50  0.18  0.410000  0.007404 -0.014589  0.15 -0.25 -0.41 -0.07 -0.13   
3    0.00 -0.11 -0.017806  0.190000  0.000000 -0.07  0.18  0.18 -0.23 -0.16   
4    0.40 -0.16  0.310000 -0.340000  0.020000  0.21 -0.70  0.11  0.08  0.05   
..    ...   ...       ...       ...       ...   ...   ...   ...   ...   ...   
237 -0.45 -0.31 -0.260000 -0.740000 -0.060000  0.09 -0.37  0.38  0.35  0.45   
238  0.12 -0.27  0.030000  0.360000  1.090000  0.41  0.15  0.01 -0.22 -0.01   
239  1.31  0.05 -0.170000  0.230000 -0.250000 -0.76 -0.06 -0.22  0.36 -0.17   
240  0.34  0.03 -0.120000 -0.030000  0.330000 -0.23 -0.08  0.20 -0.08  0.08   
241 -0.03 -0.03 -0.220000  0.060000 -0.030000 -0.31  0.01  0.22  0.46  0.24   

       10    11    12    13  
0 

### Second Method is the KNN (K-Nearest Neighbors) Method of Imputation

In [None]:
def knn_imputation(file_path, n_neighbors=5):
    """Load a whitespace-separated table and fill its missing entries with KNN imputation.

    Parameters
    ----------
    file_path : str
        Path to a header-less, whitespace-separated data file in which
        1.00000000000000e+99 stands in for a missing value.
    n_neighbors : int, default 5
        Number of neighbouring rows passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        The table with every missing entry replaced by its KNN estimate.
    """
    # Raw string: '\s' is not a valid str escape and triggers a warning otherwise
    dataset = pd.read_csv(file_path, delimiter=r'\s+', header=None)

    # Replace 1.00000000000000e+99 with NaN so the imputer recognises it as missing
    dataset = dataset.replace(1.00000000000000e+99, np.nan)

    # Impute missing values with K-Nearest Neighbors
    imputer = KNNImputer(n_neighbors=n_neighbors)
    dataset_imputed = pd.DataFrame(imputer.fit_transform(dataset), columns=dataset.columns)

    return dataset_imputed

# KNN-impute each MissingData file, print the result, and save it.

# Dataset 1 with KNN imputation
file_path_dataset1 = "MissingData1.txt"
dataset1_imputed_knn = knn_imputation(file_path_dataset1)
print("KNN Imputed Dataset 1:")
print(dataset1_imputed_knn)
print()

# Dataset 2 with KNN imputation
file_path_dataset2 = "MissingData2.txt"
dataset2_imputed_knn = knn_imputation(file_path_dataset2)
print("KNN Imputed Dataset 2:")
print(dataset2_imputed_knn)
print()

# Dataset 3 with KNN imputation
file_path_dataset3 = "MissingData3.txt"
dataset3_imputed_knn = knn_imputation(file_path_dataset3)
print("KNN Imputed Dataset 3:")
print(dataset3_imputed_knn)
print()

# Saving KNN Imputed Datasets
# header=False: the input files have no header row, so the positional column
# labels (0, 1, 2, ...) must not be written as an extra first line.
dataset1_imputed_knn.to_csv("Imputed_KNN_MissingData1.txt", sep='\t', index=False, header=False)
dataset2_imputed_knn.to_csv("Imputed_KNN_MissingData2.txt", sep='\t', index=False, header=False)
dataset3_imputed_knn.to_csv("Imputed_KNN_MissingData3.txt", sep='\t', index=False, header=False)


KNN Imputed Dataset 1:
       0     1      2      3      4     5     6     7     8     9     10  \
0   -0.11  0.02 -0.360 -0.110  0.480 -0.20  0.27  0.29 -0.34 -0.05  0.23   
1   -0.30 -0.37 -0.180 -0.090 -0.160 -0.16 -0.10 -0.09  0.46  0.32  0.02   
2    0.50  0.18  0.410  0.028  0.176  0.15 -0.25 -0.41 -0.07 -0.13 -0.15   
3    0.00 -0.11 -0.062  0.190  0.000 -0.07  0.18  0.18 -0.23 -0.16  0.03   
4    0.40 -0.16  0.310 -0.340  0.020  0.21 -0.70  0.11  0.08  0.05 -0.09   
..    ...   ...    ...    ...    ...   ...   ...   ...   ...   ...   ...   
237 -0.45 -0.31 -0.260 -0.740 -0.060  0.09 -0.37  0.38  0.35  0.45  0.13   
238  0.12 -0.27  0.030  0.360  1.090  0.41  0.15  0.01 -0.22 -0.01 -0.09   
239  1.31  0.05 -0.170  0.230 -0.250 -0.76 -0.06 -0.22  0.36 -0.17 -0.46   
240  0.34  0.03 -0.120 -0.030  0.330 -0.23 -0.08  0.20 -0.08  0.08 -0.12   
241 -0.03 -0.03 -0.220  0.060 -0.030 -0.31  0.01  0.22  0.46  0.24 -0.42   

       11    12    13  
0    0.08 -0.45  0.25  
1    0.31  0.40 