# In [7]:
import pandas as pd
import pickle
import numpy as np

# Single-sample inference script: load the fitted artifacts, read one row from
# a CSV, apply the same scaler + PCA transform, and print the predicted class.

# Input file holding the sample to classify; referenced in every message below
# so that diagnostics always name the file that was actually read.
INPUT_CSV = 'new_patient_3.csv'

# Fallback number of gene columns expected from training, used only when the
# loaded scaler does not expose its fitted feature count.
DEFAULT_EXPECTED_COLUMNS = 20531

# Step 1: Load the saved model, scaler, label encoder, and PCA
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by a trusted training run.
try:
    with open("rf_model.pkl", "rb") as file:
        model = pickle.load(file)
    with open("scaler.pkl", "rb") as file:
        scaler = pickle.load(file)
    with open("label_encoder.pkl", "rb") as file:
        le = pickle.load(file)
    with open("pca.pkl", "rb") as file:
        pca = pickle.load(file)
except FileNotFoundError as e:
    print(f"Error: Missing required file - {e}")
    # Exit with a non-zero status so a calling process can detect the failure.
    # The bare exit() helper is injected by `site` for interactive use and
    # would report success (status 0).
    raise SystemExit(1) from None

# Step 2: Read the input data
try:
    data = pd.read_csv(INPUT_CSV)
except FileNotFoundError:
    print(f"Error: {INPUT_CSV} file not found.")
    raise SystemExit(1) from None
except pd.errors.EmptyDataError:
    # A zero-byte file has no header to parse, so read_csv raises instead of
    # returning an empty DataFrame.
    print("Error: The CSV file is empty.")
    raise SystemExit(1) from None

# Rename the first column to 'sample_id' to match the training setup
data.rename(columns={data.columns[0]: "sample_id"}, inplace=True)

# Step 3: Validate the CSV has exactly one row
if len(data) == 0:
    # Header present but no data rows.
    print("Error: The CSV file is empty.")
    raise SystemExit(1)
elif len(data) != 1:
    print(f"Warning: Expected 1 row in {INPUT_CSV}, but found {len(data)} rows. Using the first row.")
row_index = 0  # Always predict on the first row
single_row = data.iloc[[row_index]]  # Double brackets keep DataFrame structure

# Step 4: Preprocess the row
# Drop 'sample_id' for prediction (as it was not used in training features)
X_single = single_row.drop(['sample_id'], axis=1)

# Validate column count.
# Prefer the feature count recorded on the fitted scaler (scikit-learn
# estimators expose it as `n_features_in_`); fall back to the training-time
# constant when the attribute is absent.
expected_columns = getattr(scaler, "n_features_in_", DEFAULT_EXPECTED_COLUMNS)
if X_single.shape[1] != expected_columns:
    print(f"Error: Expected {expected_columns} gene columns, but got {X_single.shape[1]}.")
    raise SystemExit(1)

# Scale and apply PCA transformation (same order as at training time)
X_single_scaled = scaler.transform(X_single)
X_single_pca = pca.transform(X_single_scaled)

# Step 5: Make prediction for the single row
y_pred_encoded = model.predict(X_single_pca)

# Step 6: Convert encoded prediction to class name
y_pred = le.inverse_transform(y_pred_encoded)

# Step 7: Output the prediction
sample_id = single_row['sample_id'].values[0]
print(f"Prediction for sample_id '{sample_id}': {y_pred[0]}")

# Output:
# Prediction for sample_id 'sample_125': PRAD
