<a href="https://colab.research.google.com/github/TummalaSharmila/Adv-Seminar-Project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read only the first 10,000 rows so the working sample stays small enough
# to fit in memory.
df = pd.read_csv(
    "Dataset.csv",
    nrows=10000,
    low_memory=False,
)

print("Sample loaded. Shape:", df.shape)
# Bare last expression so the notebook renders the preview as a rich table.
df.head()


In [None]:
# Two-stage sparsity filter (each stage keeps entries that are at least
# ~70% populated, i.e. drops those with more than ~30% missing values):
#   1. columns, measured against the current number of rows;
#   2. rows, measured against the number of columns that survived stage 1.
df = (
    df.dropna(axis="columns", thresh=int(0.7 * df.shape[0]))
      .pipe(lambda kept: kept.dropna(axis="index", thresh=int(0.7 * kept.shape[1])))
)

print("After cleaning missing values. Shape:", df.shape)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fitted encoders are stored per column name so the exact same
# string -> integer mapping can be re-applied to new records later.
label_encoders = {}

for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    # Cast to str first so the encoder sees one uniform type; NaN cells are
    # stringified as well ("nan") and therefore become their own category.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("Categorical encoding complete.")


In [None]:
import numpy as np

# Impute all numeric columns in one vectorised pass: DataFrame.median()
# yields one median per column and fillna() aligns that Series on the
# column labels, so each column is filled with its own median.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

print("Filled missing numeric values with column medians.")


In [None]:
from sklearn.preprocessing import StandardScaler

# Pull the two target columns out of the cleaned frame.
y_class = df['OS']
y_time = df['OS_time']

# Every remaining column is used as a model feature.
X = df.drop(columns=['OS', 'OS_time'], errors='ignore')

# Standardise the features. The scaler is fit on every row of X here, i.e.
# including the rows that are later held out as the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Feature scaling complete.")


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_class,
    test_size=0.2,
    random_state=42,
)
print("Train/test split complete.")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Seed shared by all estimators so that Restart & Run All reproduces the same
# fitted models, scores and "best model" choice. 42 matches the seed already
# used for the train/test split.
RANDOM_STATE = 42

# Random Forest (bootstrap sampling and feature sub-sampling are random,
# hence the explicit seed).
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# SVM -- probability=True enables predict_proba; the probability calibration
# shuffles the data internally, which is what random_state controls here.
svm = SVC(probability=True, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)
pred_svm = svm.predict(X_test)

# XGBoost
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Pair each fitted estimator with the test-set predictions it produced in the
# training cell, keyed by a display name.
models = {
    "Random Forest": (rf, pred_rf),
    "SVM": (svm, pred_svm),
    "XGBoost": (xgb, pred_xgb),
}

# Start below any reachable accuracy so that a model is always selected, even
# in the degenerate case where every model scores exactly 0.0 (starting from 0
# with a strict ">" would leave best_model as None).
best_model = None
best_score = float("-inf")

for name, (model, pred) in models.items():
    acc = accuracy_score(y_test, pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, pred))
    # Strict ">" means the earlier entry in `models` wins a tie.
    if acc > best_score:
        best_model = model
        best_score = acc

print(f"\n✅ Best model selected: {type(best_model).__name__} with accuracy {best_score:.4f}")


In [None]:
def predict_patient_outcome(new_data: dict, model=None):
    """Predict and print the outcome class for one new patient record.

    The record goes through the same steps as the training data: categorical
    fields are encoded with the fitted ``label_encoders``, the frame is
    aligned to the training feature columns of ``X``, gaps are filled with the
    training medians, the fitted ``scaler`` is applied and a classifier
    predicts the class.

    Args:
        new_data: Mapping of feature name -> raw value for a single patient.
            Keys that are not columns of ``X`` are ignored (a warning is
            issued). Training features that are absent, or supplied as NaN,
            are set to the median of that column in ``X``.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. Defaults to ``best_model``, the estimator chosen by
            the evaluation cell, so the prediction uses the model that was
            actually reported as best rather than a hard-coded one.

    Returns:
        None. The predicted label and its probability are printed.

    Raises:
        ValueError: If a categorical field holds a value that its encoder
            never saw during fitting.
    """
    import warnings

    import pandas as pd

    if model is None:
        model = best_model

    patient_df = pd.DataFrame([new_data])

    # Surface keys that will be dropped instead of discarding them silently;
    # a misspelled feature name would otherwise be replaced by the median.
    unknown = [col for col in patient_df.columns if col not in X.columns]
    if unknown:
        warnings.warn(
            f"Ignoring features not seen during training: {unknown}",
            stacklevel=2,
        )

    # Encode
    for col, le in label_encoders.items():
        if col in patient_df:
            # Training values were cast to str before encoding, so do the same.
            value = str(patient_df[col].iloc[0])
            if value not in le.classes_:
                raise ValueError(
                    f"Unseen category {value!r} for feature {col!r}; "
                    "it was not present when the encoder was fitted."
                )
            patient_df[col] = le.transform([value])

    # Align to the training columns in a single reindex (adds absent features
    # as NaN, drops unknown ones, fixes the order), then fill every gap with
    # the training median. One reindex avoids inserting columns one by one,
    # which is slow and fragments the frame when X is wide.
    patient_df = patient_df.reindex(columns=X.columns)
    patient_df = patient_df.fillna(X.median(numeric_only=True))

    # Scale
    patient_scaled = scaler.transform(patient_df)

    # Predict
    pred = model.predict(patient_scaled)
    # predict_proba columns follow model.classes_, so look the predicted label
    # up there instead of using the label itself as a column index (labels
    # are not guaranteed to be the integers 0..k-1, e.g. 0.0 / 1.0 floats).
    class_idx = list(model.classes_).index(pred[0])
    proba = model.predict_proba(patient_scaled)[0][class_idx]

    print("\n=== NEW PATIENT PREDICTION ===")
    # NOTE(review): this maps label 1 to "Survived". In many survival datasets
    # an OS value of 1 marks the death event -- confirm against the data
    # dictionary for Dataset.csv before relying on this wording.
    print("Prediction:", "Survived" if pred[0] == 1 else "Deceased")
    print(f"Confidence: {proba*100:.2f}%")



In [None]:
# Demo record. Training features that are not listed here are filled in by
# predict_patient_outcome using the median of the corresponding column in X.
example_patient = {
    "age": 62,
    "gender": "female",
    "cancer_type_abbreviation": "BRCA",
    "Mutation_Count": 37,
    "ENSG00000242268.2": 5.6,
    "ENSG00000259041.1": 3.2,
    # Add other features as needed based on your dataset
}

predict_patient_outcome(example_patient)
