In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the training data (adjust the path if your training CSV lives elsewhere)
df = pd.read_csv('train.csv')

# Quick look at the data: first rows, column/dtype overview, summary statistics.
# The bound methods are called one at a time inside the loop so that each
# report is produced and printed in the listed order.
for summarize in (df.head, df.info, df.describe):
    print(summarize())

# Replace the diagnosis column with integer class codes produced by the encoder
label_encoder = LabelEncoder()
encoded_diagnosis = label_encoder.fit_transform(df['diagnosis'])
df['diagnosis'] = encoded_diagnosis

# Target is the encoded diagnosis; features are every remaining column
# except the row identifier
y = df['diagnosis']
X = df.drop(columns=['id', 'diagnosis'])

# Train-test split
# Split the raw features BEFORE scaling so the hold-out rows never influence
# the scaler's mean/variance. Fitting the scaler on the full dataset leaks
# hold-out statistics into training and makes the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features
# Fit on the training portion only, then apply the same transformation to
# the hold-out portion.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to X_scaled keeps working; it is now
# the full feature matrix scaled with the training-only statistics.
X_scaled = scaler.transform(X)

# Model selection and training with cross-validation and hyperparameter tuning
log_reg = LogisticRegression()
# Seed the forest so that the grid-search comparison between candidate
# settings (and the resulting best model) is reproducible from run to run,
# consistent with the seeded train/test split.
rf = RandomForestClassifier(random_state=42)

# Ensemble model: soft voting over the two base classifiers
voting_clf = VotingClassifier(
    estimators=[
        ('log_reg', log_reg),
        ('rf', rf),
    ],
    voting='soft',
)

# Candidate hyperparameters; keys are '<estimator name>__<parameter>' and the
# estimator names must match those given to the VotingClassifier above
param_grid = {
    'log_reg__C': [0.1, 1, 10],
    'rf__n_estimators': [50, 100, 200],
}

# Use GridSearchCV (5-fold) to find the best parameter combination
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Predict labels for the hold-out split with the tuned ensemble
y_pred = best_model.predict(X_test)

# Report hold-out metrics; each score is computed immediately before it is
# printed so the output sequence is the same even if a later metric fails
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

# Load test dataset for submission
test_df = pd.read_csv('test.csv')  # Adjust the path to your test dataset

# Preprocess test dataset
test_ids = test_df['id']
# Select exactly the feature columns used for training, in training order,
# instead of "everything except id". This keeps the scaler/model input
# aligned with the training features even if test.csv carries extra columns
# or a different column order; a missing feature raises KeyError here.
X_test_submission = test_df[list(X.columns)]
X_test_submission_scaled = scaler.transform(X_test_submission)

# Make predictions on test dataset
y_test_pred = best_model.predict(X_test_submission_scaled)

# Map encoded class indices back to the original diagnosis labels.
# The mapping is built from the fitted encoder (index i -> classes_[i]) rather
# than hard-coded, so it always agrees with the encoding applied to the
# training labels; for labels 'B' and 'M' this yields {0: 'B', 1: 'M'}.
mapping = dict(enumerate(label_encoder.classes_))
mapped_predictions = [mapping[val] for val in y_test_pred]

# Assemble the submission table: one row per test id with its predicted label
submission_columns = {'id': test_ids, 'Prediction': mapped_predictions}
submission = pd.DataFrame(submission_columns)

# Write it out without the DataFrame index column
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")
