In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import joblib

# Read train and test data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Define features and target variable
# The label columns are listed once so the feature matrix and the target
# frame are guaranteed to be split on exactly the same set of names.
# NOTE(review): 'K_Scatch' is kept exactly as written; presumably it matches
# the header in train.csv -- confirm before "correcting" the spelling.
_label_columns = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]
y_train = train_data[_label_columns]
X_train = train_data.drop(columns=['id', *_label_columns])

# Keep the test ids aside for the submission; they are not a model input.
test_ids = test_data['id']
test_features = test_data.drop(columns="id")

# Add statistical features
def _add_row_statistics(frame):
    """Append per-row 'mean', 'std', 'max' and 'min' columns to *frame* in place.

    All four statistics are computed over the columns that exist when the
    function is called. Previously each statistic was computed after the
    preceding one had already been inserted, so 'std' included 'mean',
    'max' included 'mean' and 'std', and 'min' included all three.
    """
    # Snapshot of the original columns: selecting with a list yields a
    # separate frame, so the assignments below cannot feed back into the
    # statistics that follow them.
    base = frame[list(frame.columns)]
    frame['mean'] = base.mean(axis=1)
    frame['std'] = base.std(axis=1)
    frame['max'] = base.max(axis=1)
    frame['min'] = base.min(axis=1)


# Train and test go through the same helper so their engineered columns are
# defined identically and appear in the same order.
_add_row_statistics(X_train)
_add_row_statistics(test_features)

# Define classifiers
# Display label -> estimator class. Each class is instantiated below with no
# arguments, i.e. with whatever defaults its library provides.
_candidate_factories = (
    ('Random Forest', RandomForestClassifier),
    ('LightGBM', LGBMClassifier),
    ('XGBoost', XGBClassifier),
    ('CatBoost', CatBoostClassifier),
    ('ExtraTrees', ExtraTreesClassifier),
)
classifiers = {label: factory() for label, factory in _candidate_factories}

# Cross-validation to evaluate each classifier
# Every candidate is wrapped in MultiOutputClassifier (one fitted copy per
# target column) and scored with 5-fold CV; the mean fold score is stored
# under the candidate's label.
# NOTE(review): y_train has several columns, so 'accuracy' here is presumably
# scikit-learn's exact-match (subset) accuracy -- confirm this is the metric
# you want to select on.
cv_scores = {}
for model_label, base_estimator in classifiers.items():
    wrapped = MultiOutputClassifier(base_estimator, n_jobs=-1)
    fold_scores = cross_val_score(
        wrapped, X_train, y_train, cv=5, scoring='accuracy'
    )
    cv_scores[model_label] = fold_scores.mean()

# Select the best classifier
# max() keeps the first entry on ties, so insertion order of `classifiers`
# acts as the tie-breaker.
best_model_name, _best_cv_score = max(
    cv_scores.items(), key=lambda item: item[1]
)
best_classifier = classifiers[best_model_name]

print("Best model:", best_model_name)
print("CV Accuracy:", _best_cv_score)

# Train the best classifier on the entire training data
# fit() returns the estimator itself, so construction and fitting are chained.
best_multi_classifier = MultiOutputClassifier(
    best_classifier, n_jobs=-1
).fit(X_train, y_train)

# Save the best model
# NOTE(review): `model` receives joblib.dump's return value (per the joblib
# docs, the list of file names written), not the estimator. The binding is
# kept for compatibility -- confirm nothing treats it as a model object.
_model_path = 'best_model2.joblib'
model = joblib.dump(best_multi_classifier, _model_path)

# Make predictions on the test data
# predict_proba on the multi-output wrapper is indexed per target below:
# test_probs[k] is the probability array for the k-th column of y_train.
test_probs = best_multi_classifier.predict_proba(test_features)

# Prepare submission file
# Column 1 of each per-target array is written out.
# NOTE(review): this presumably is the positive-class probability, which
# assumes every target was fitted with classes [0, 1] -- confirm.
_target_probabilities = {
    target: test_probs[position][:, 1]
    for position, target in enumerate(y_train.columns)
}
submission2_df = pd.DataFrame({'id': test_ids, **_target_probabilities})

# Save submission file
submission2_df.to_csv('submission2.csv', index=False)
