In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv('train.csv')  # Adjust the path to your dataset

# Step 1: Report how many missing values each column contains, so gaps in the
# data are visible before any encoding or modelling happens.
missing_per_column = df.isnull().sum()
print("Missing values in each column:\n", missing_per_column)

# Step 2: Encode categorical variables
# Marital_Status is replaced by the integer codes LabelEncoder assigns to it.
label_encoder = LabelEncoder()
df['Marital_Status'] = label_encoder.fit_transform(df['Marital_Status'])

# The three therapy flags share a single Yes/No -> 1/0 mapping. Series.map
# yields NaN for any value that is not a key of the mapping.
yes_no_codes = {'Yes': 1, 'No': 0}
for therapy_col in ('Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy'):
    df[therapy_col] = df[therapy_col].map(yes_no_codes)

# Step 3: Define features and target
# The target is Survival_Status; the features are every remaining column
# except the Patient_ID identifier.
y = df['Survival_Status']
X = df.drop(columns=['Patient_ID', 'Survival_Status'])

# Step 4: Train-test split
# Split BEFORE fitting any transformer so that no statistics computed from the
# held-out rows leak into preprocessing. test_size and random_state are the
# same as before, and the split depends only on the number of rows, so the
# train/test row assignment is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Normalize features
# The scaler learns mean/std from the training rows only; the test rows are
# transformed with those learned statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Add polynomial features
# degree=2 with interaction_only=True adds pairwise products of distinct
# features (no squared terms); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Full-dataset versions, kept so any later code that refers to these names
# still works. They are produced with the train-fitted transformers above.
X_scaled = scaler.transform(X)
X_poly = poly.transform(X_scaled)

# Step 7: Define the parameter grid for Random Forest
# Each key is a RandomForestClassifier hyperparameter and each list holds the
# candidate values the search may draw from (576 combinations in total).
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_features=['sqrt', 'log2'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Step 8: Use RandomizedSearchCV for hyperparameter tuning
# 100 sampled candidates are each scored with 5-fold cross-validation on the
# training data; n_jobs=-1 runs the fits on all available cores and the fixed
# random_state makes the sampled candidates reproducible.
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train_poly, y_train)

# Step 9: Get the best model
# RandomizedSearchCV was created with its default refit=True, so
# best_estimator_ has already been refit on the whole training set using the
# winning hyperparameters. Fitting it again would only repeat that work.
best_rf = random_search.best_estimator_

# Step 10: Evaluate the model on the held-out test set
# NOTE(review): precision/recall/F1 are called with scikit-learn's defaults,
# which presumably means Survival_Status is a binary 0/1 target -- confirm.
y_pred_poly = best_rf.predict(X_test_poly)
print(f'Accuracy: {accuracy_score(y_test, y_pred_poly)}')
print(f'Precision: {precision_score(y_test, y_pred_poly)}')
print(f'Recall: {recall_score(y_test, y_pred_poly)}')
print(f'F1 Score: {f1_score(y_test, y_pred_poly)}')
