<a href="https://colab.research.google.com/github/TummalaSharmila/Adv-Seminar-Project/blob/main/chunk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read only the first 10,000 rows of the CSV to keep the working sample small.
csv_path = "/content/drive/MyDrive/Colab Notebooks/Copy of Dataset.csv"
df = pd.read_csv(csv_path, nrows=10000, low_memory=False)
print("Sample loaded. Shape:", df.shape)

# Row filter: a row survives only if it has at least int(0.7 * n_columns)
# non-missing values (the fraction is truncated, not rounded). Columns are
# never dropped here.
n_columns = df.shape[1]
min_non_missing = int(0.7 * n_columns)
df = df.dropna(thresh=min_non_missing)
print("After dropping incomplete rows. Shape:", df.shape)



In [None]:
# --- BLOCK BREAK ---

# Integer-encode every object-dtype column, keeping each fitted encoder
# (keyed by column name) so the same mapping can be applied to new data later.
label_encoders = {}
object_columns = df.select_dtypes(include='object').columns
for name in object_columns:
    encoder = LabelEncoder()
    # astype(str) turns missing values into the literal string "nan", so
    # they end up as their own category rather than staying missing.
    df[name] = encoder.fit_transform(df[name].astype(str))
    label_encoders[name] = encoder
print("Categorical encoding complete.")

# Replace remaining gaps in each numeric column with that column's median.
numeric_columns = df.select_dtypes(include=[np.number]).columns
for name in numeric_columns:
    column = df[name]
    df[name] = column.fillna(column.median())
print("Missing values filled.")



In [None]:
# --- BLOCK BREAK ---

# Feature-target split
X = df.drop(columns=['OS', 'OS_time'], errors='ignore')
y_class = df['OS']
y_time = df.get('OS_time')  # None when the column is absent (no KeyError)

# Train/test split happens BEFORE scaling: fitting the scaler on the full
# dataset would let test-set means/variances leak into the training features.
# The same random_state and row count yield the same row partition as
# splitting an already-scaled array would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42
)
print("Train/test split complete.")

# Fit the scaler on the training rows only, then apply it everywhere else.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility with code that expects the fully scaled
# matrix; it is produced with the train-fitted scaler.
X_scaled = scaler.transform(X)
print("Feature scaling complete.")



In [None]:
# --- BLOCK BREAK ---

# Train models
# All three estimators have stochastic components (bootstrap sampling,
# probability calibration, tree construction). Seeding them with the same
# value used for train_test_split makes the accuracies, and therefore the
# model picked as "best", reproducible from run to run.
RANDOM_STATE = 42

rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# probability=True is required so predict_proba is available later;
# random_state only affects that probability estimation step.
svm = SVC(probability=True, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)
pred_svm = svm.predict(X_test)

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=RANDOM_STATE,
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)



In [None]:
# --- BLOCK BREAK ---

# Evaluate and select best model
# Maps a display name to (fitted estimator, its predictions on X_test).
models = {
    "Random Forest": (rf, pred_rf),
    "SVM": (svm, pred_svm),
    "XGBoost": (xgb, pred_xgb),
}

best_model = None
# Start below any possible accuracy: with an initial value of 0 and a strict
# ">" comparison, best_model would stay None if every model scored 0.0.
best_score = float("-inf")
for name, (model, pred) in models.items():
    acc = accuracy_score(y_test, pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, pred))
    # Strict ">" keeps the first model on ties (dict insertion order).
    if acc > best_score:
        best_model = model
        best_score = acc

print(f"\n✅ Best model selected: {type(best_model).__name__} with accuracy {best_score:.4f}")



In [None]:
# --- BLOCK BREAK ---

# Prediction function for new patient
def predict_patient_outcome(new_data: dict):
    """Predict and print the outcome class for a single new patient.

    Uses module-level state created by earlier cells: ``label_encoders``
    (fitted encoders keyed by column), ``X`` (the encoded feature frame),
    ``scaler`` (fitted scaler) and ``best_model`` (selected classifier).

    Args:
        new_data: Mapping of feature name to raw value. Features that are
            columns of ``X`` but absent here are filled with the median of
            that column in ``X``; keys that are not columns of ``X`` are
            ignored.

    Raises:
        ValueError: If a categorical value was not present when the
            corresponding encoder was fitted.
    """
    patient_df = pd.DataFrame([new_data])

    # Encode categoricals with the encoders fitted on the training data.
    for col, le in label_encoders.items():
        if col not in patient_df:
            continue
        raw_value = str(patient_df[col][0])
        try:
            patient_df[col] = le.transform([raw_value])
        except ValueError as err:
            # Same exception type as before, but say which input was bad.
            raise ValueError(
                f"Unknown category {raw_value!r} for column {col!r}; "
                "it was not present when the encoder was fitted."
            ) from err

    # Align to the training column order in one step. Inserting missing
    # columns one by one is slow and fragments the frame when X is wide.
    missing_cols = [col for col in X.columns if col not in patient_df]
    patient_df = patient_df.reindex(columns=X.columns)
    if missing_cols:
        # Only the columns that were absent are filled; values the caller
        # supplied are left untouched.
        patient_df = patient_df.fillna(X[missing_cols].median())

    # Scale
    patient_scaled = scaler.transform(patient_df)

    # Predict
    pred = best_model.predict(patient_scaled)[0]
    # predict_proba columns follow best_model.classes_, so look up the
    # column of the predicted label instead of using the label itself as an
    # index (which fails for float labels or labels that are not 0..n-1).
    class_index = list(best_model.classes_).index(pred)
    proba = best_model.predict_proba(patient_scaled)[0][class_index]

    print("\n=== NEW PATIENT PREDICTION ===")
    # NOTE(review): confirm that OS == 1 really means the patient survived;
    # in many survival datasets OS = 1 marks the death event, and if OS was
    # label-encoded from strings the meaning of 1 depends on sort order.
    print("Prediction:", "Survived" if pred == 1 else "Deceased")
    print(f"Confidence: {proba*100:.2f}%")


In [None]:
# --- BLOCK BREAK ---

# Example prediction: a partial feature set for one patient. Any feature not
# listed here is left for predict_patient_outcome to handle.
named_features = {
    'age': 60,
    'gender': 'female',
    'cancer_type_abbreviation': 'BRCA',
    'Mutation_Count': 34,
}
ensg_features = {
    'ENSG00000242268.2': 5.7,
    'ENSG00000259041.1': 2.8,
    # Add more features as needed
}
# Merge the two groups; key order matches the order they are written above.
example_patient = {**named_features, **ensg_features}

predict_patient_outcome(example_patient)
