In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train a random-forest disease classifier with leakage-free validation.
#
# Order of operations matters here:
#   1. split off the validation set first,
#   2. fit the scaler on the training split only,
#   3. oversample with SMOTE *inside* the cross-validation pipeline,
# so that neither the validation set nor any CV test fold ever influences
# preprocessing or contains synthetic samples derived from training rows.

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/ml project /Disease_train.csv")

# Display the first few rows of the dataset
print(df.head())

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Extract features and target variable
X = df.drop(['patient_id', 'diagnosis'], axis=1)
y = df['diagnosis']

# Hold out the validation set BEFORE any fitting or resampling.
# `stratify=y` keeps the class ratio identical in both splits, which matters
# because the target is imbalanced (the reason SMOTE is used below).
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: statistics come from the training split only,
# then the same transform is applied to the validation split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# SMOTE handles class imbalance; it is placed in an imblearn Pipeline so it is
# re-fit on the training part of every CV fold and skipped at predict time.
smote = SMOTE(random_state=42)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(random_state=42)

resampling_pipeline = Pipeline(steps=[('smote', smote), ('clf', model)])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Same grid, re-keyed so GridSearchCV routes each parameter to the 'clf' step.
pipeline_param_grid = {f'clf__{name}': values for name, values in param_grid.items()}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    resampling_pipeline,
    pipeline_param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search to the (scaled, not yet resampled) training data
grid_search.fit(X_train, y_train)

# Get the best model from grid search.
# This is the refit pipeline; it expects already-scaled features, exactly like
# the bare classifier did, so downstream `best_model.predict(scaled_X)` calls
# keep working.
best_model = grid_search.best_estimator_

# Predict on the validation set
y_val_pred = best_model.predict(X_val)

# ROC-AUC must be computed from scores, not hard labels: use the predicted
# probability of the positive class (second column of predict_proba).
y_val_proba = best_model.predict_proba(X_val)[:, 1]

# Calculate the ROC-AUC score
roc_auc = roc_auc_score(y_val, y_val_proba)
print(f'Validation ROC-AUC Score with Optimized RandomForestClassifier: {roc_auc}')


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0   0.374540   0.950714   0.731994   0.598658   0.156019   0.155995   
1   0.020584   0.969910   0.832443   0.212339   0.181825   0.183405   
2   0.611853   0.139494   0.292145   0.366362   0.456070   0.785176   
3   0.388677   0.271349   0.828738   0.356753   0.280935   0.542696   
4   0.772245   0.198716   0.005522   0.815461   0.706857   0.729007   

   feature_7  feature_8  feature_9  feature_10  patient_id  diagnosis  
0   0.058084   0.866176   0.601115    0.708073           1          0  
1   0.304242   0.524756   0.431945    0.291229           2          0  
2   0.199674   0.514234   0.592415    0.046450           3          0  
3   0.140924   0.802197   0.074551    0.986887           7          0  
4   0.771270   0.074045   0.358466    0.115869           8          0  
Missing values in each column:
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
feature_6     0
feature_7     

In [None]:
import os

# --- Score the unlabeled test file and export one prediction per patient ---

# Read the test set
test_df = pd.read_csv("/content/drive/MyDrive/ml project /Disease_test.csv")

# Every column except the identifier is a model input
X_test = test_df.drop(columns='patient_id')

# Reuse the scaler fitted earlier so test features get the training-time scaling
X_test_scaled = scaler.transform(X_test)

# Predict a label for each test row with the tuned model
test_predictions = best_model.predict(X_test_scaled)

# Pair each patient id with its predicted label
predictions_df = test_df[['patient_id']].assign(prediction=test_predictions)

# Show a preview of the result
print("\nTest Predictions:")
print(predictions_df.head())

# Write the predictions to Drive (no index column)
output_path = '/content/drive/MyDrive/SE22UCSE133_predictions.csv'
predictions_df.to_csv(output_path, index=False)

# Report whether the file is now present on disk
print(
    f"File saved successfully at: {output_path}"
    if os.path.exists(output_path)
    else "File not found."
)


Test Predictions:
   patient_id  prediction
0           4           0
1           5           0
2           6           0
3          10           0
4          12           0
File saved successfully at: /content/drive/MyDrive/SE22UCSE133_predictions.csv


In [None]:
# Colab-only step: pass the predictions CSV at `output_path` (set in the
# prediction cell) to google.colab's `files.download`.
from google.colab import files
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.model_selection import cross_val_score

# 5-Fold Cross-Validation of the baseline (untuned) random forest on the raw
# training features.
#
# Scored with ROC-AUC rather than cross_val_score's default (accuracy for a
# classifier): the target is imbalanced, so accuracy mostly reflects the
# majority-class rate and is not comparable with the ROC-AUC used for tuning.
k = 5
scores = cross_val_score(model, X, y, cv=k, scoring='roc_auc')

# Mean and standard deviation across the k folds
mean_score = scores.mean()
std_dev = scores.std()

# Display the scores
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Score: {mean_score}')
print(f'Standard Deviation: {std_dev}')


Cross-Validation Scores: [0.95125 0.95125 0.95125 0.95    0.95   ]
Mean Score: 0.95075
Standard Deviation: 0.0006123724356958359
