In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from collections import Counter

# Load and preprocess data
def load_data(file_path, has_labels=True):
    """
    Load whitespace-delimited data from a text file.

    The first column of every line is treated as an identifier and dropped.
    Blank lines are skipped silently. Lines that have no feature columns, or
    whose feature columns cannot all be parsed as floats (for example a
    header row), are reported on stdout and skipped.

    Parameters:
        file_path: Path to the file.
        has_labels: Whether the data includes labels (last column).
    Returns:
        features: Numpy array with one row of float features per kept line.
        labels: List of label strings aligned with the rows of features
            (empty when has_labels is False).
    """
    data = []
    labels = []
    with open(file_path, 'r') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line: without this guard values[-1] would raise an
                # uncaught IndexError for labelled data, and an empty
                # feature row would be stored for unlabelled data.
                continue

            # Skip the first column (identifier); the last column is the
            # label when has_labels is set.
            raw_features = values[1:-1] if has_labels else values[1:]
            try:
                features = [float(value) for value in raw_features]
            except ValueError:
                features = None

            if not features:
                # Either a column failed to parse or the line carried no
                # feature columns at all; keeping it would produce a ragged
                # feature matrix.
                print(f"Skipping invalid line: {line.strip()}")
                continue

            data.append(features)
            if has_labels:
                labels.append(values[-1])  # Label comes from the last column
    return np.array(data), labels

# Compute Euclidean distance
def euclidean_distance(point1, point2):
    """
    Return the straight-line (L2) distance separating two points.
    Parameters:
        point1, point2: Numpy arrays holding the coordinates of each point.
    Returns:
        The square root of the summed squared coordinate differences.
    """
    delta = point1 - point2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Predict the label using KNN
def predict_label(test_point, train_data, train_labels, k):
    """
    Predict the label for a test point based on the K-nearest neighbors.

    Euclidean distances from the test point to every training row are
    computed in one vectorized step. The k closest rows vote, and the most
    frequent label among them is returned. Rows at equal distance keep their
    training order, and when several labels tie on votes the one belonging
    to the nearest neighbor wins. If k exceeds the number of training rows,
    all rows vote.

    Parameters:
        test_point: Features of the test point.
        train_data: Features of training data, one row per training example.
        train_labels: Labels of training data, aligned with train_data.
        k: Number of nearest neighbors to consider.
    Returns:
        Predicted label (an element of train_labels).
    Raises:
        ValueError: If k is less than 1, the training set is empty, or
            train_data and train_labels differ in length.
    """
    if k < 1:
        # distances[:k] would be empty and most_common(1)[0] would fail with
        # an uninformative IndexError.
        raise ValueError(f"k must be a positive integer, got {k}")

    train = np.asarray(train_data, dtype=float)
    n_train = len(train_labels)
    if train.size == 0 or n_train == 0:
        raise ValueError("training data is empty")
    if train.ndim == 0 or train.shape[0] != n_train:
        raise ValueError("train_data and train_labels must have the same length")

    # Flatten each example so the same expression works for 1-D feature
    # vectors and for scalar or multi-dimensional examples.
    flat_train = train.reshape(n_train, -1)
    query = np.asarray(test_point, dtype=float).reshape(-1)
    distances = np.sqrt(np.sum((flat_train - query) ** 2, axis=1))

    # A stable sort keeps equally distant rows in training order.
    nearest = np.argsort(distances, kind="stable")[:k]
    k_nearest_labels = [train_labels[index] for index in nearest]
    return Counter(k_nearest_labels).most_common(1)[0][0]

# KNN Implementation
def knn(train_file, test_file, k, output_file):
    """
    Classify every row of a test file with KNN and write the results to disk.

    All predictions are computed before the output file is opened, then one
    predicted label is written per line and a confirmation is printed.

    Parameters:
        train_file: Path to the labelled training data file.
        test_file: Path to the unlabelled test data file.
        k: Number of neighbors to use.
        output_file: File that receives the predictions.
    """
    # Read both data sets; the test file carries no label column
    train_features, train_targets = load_data(train_file, has_labels=True)
    test_features, _unused = load_data(test_file, has_labels=False)

    # Classify the test rows one by one
    predicted = []
    for row in test_features:
        predicted.append(predict_label(row, train_features, train_targets, k))

    # Emit one label per line
    with open(output_file, 'w') as out:
        out.writelines(f"{label}\n" for label in predicted)
    print(f"Predictions saved to {output_file}")

# Settings for this run
output_file = 'predictions.txt'  # file that receives the predicted labels
k = 3                            # number of neighbors that vote
test_file = 'testdata.txt'       # path to the unlabelled test data
train_file = 'traindata.txt'     # path to the labelled training data

# Classify the test data with the settings above
knn(train_file=train_file, test_file=test_file, k=k, output_file=output_file)


Skipping invalid line: 228780_at	203824_at	209847_at	219271_at	228912_at	218687_s_at	205799_s_at	201839_s_at	203903_s_at	206430_at	201884_at	209211_at	212925_at	215046_at	205506_at	202950_at	222291_at	237328_at	214898_x_at	228256_s_at	tissue
Skipping invalid line: 228780_at	203824_at	209847_at	219271_at	228912_at	218687_s_at	205799_s_at	201839_s_at	203903_s_at	206430_at	201884_at	209211_at	212925_at	215046_at	205506_at	202950_at	222291_at	237328_at	214898_x_at	228256_s_at
Predictions saved to predictions.txt
