In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Load datasets
root = "/kaggle/input/hackathon-uno-2024-iitmz/"
train = pd.read_csv(root + 'train.csv')
test = pd.read_csv(root + 'test.csv')

# Encode the target labels as integer class ids.
disposition_mapping = {'CONFIRMED': 0, 'CANDIDATE': 1, 'FALSE POSITIVE': 2}
encoded_disposition = train['Disposition'].map(disposition_mapping)

# Series.map turns every label that is missing from the mapping into NaN without
# complaining. Fail fast here, naming the offending labels, instead of carrying
# NaN targets into the modelling steps below.
unmapped_rows = encoded_disposition.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(train.loc[unmapped_rows, 'Disposition'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} training rows have a Disposition that is not in "
        f"disposition_mapping: {unexpected_labels}"
    )
train['Disposition'] = encoded_disposition

# Separate the identifier and target columns from the feature columns.
X_train = train.drop(columns=['row_id', 'Disposition'])
y_train = train['Disposition']
test_row_ids = test['row_id']   # kept so the submission can be keyed by row_id
X_test = test.drop(columns=['row_id'])

# Train/validation split, preprocessing and feature selection.
#
# The split is done FIRST, on the raw features. Every fitted step below
# (imputer means, scaler statistics, and above all the label-aware feature
# selector) is then learned from the training split only, so the validation
# rows and their labels cannot influence preprocessing. Fitting these steps on
# the full training set before splitting leaks validation information and makes
# the validation metrics optimistic.
#
# train_test_split picks rows from the number of samples, the seed and the
# stratification labels, so the same rows end up in each split as when the
# split is applied to an already-transformed matrix.
X_train_raw, X_val_raw, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Handle missing values + scale data + add pairwise interaction terms
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Feature selection driven by random-forest feature importances
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Fit every step on the training split only, in pipeline order.
X_train_split = imputer.fit_transform(X_train_raw)
X_train_split = scaler.fit_transform(X_train_split)
X_train_split = poly.fit_transform(X_train_split)
X_train_split = selector.fit_transform(X_train_split, y_train_split)


def apply_fitted_preprocessing(features):
    """Run `features` through the already-fitted imputer, scaler, poly and selector.

    Only `transform` is called, so nothing is re-learned from `features`.
    Returns the selected feature matrix in the same column space as
    `X_train_split`.
    """
    transformed = imputer.transform(features)
    transformed = scaler.transform(transformed)
    transformed = poly.transform(transformed)
    return selector.transform(transformed)


X_val = apply_fitted_preprocessing(X_val_raw)
X_test = apply_fitted_preprocessing(X_test)
# Full training matrix in the same feature space, kept under its original name.
# It contains the validation rows, so do not use it for validation scoring.
X_train = apply_fitted_preprocessing(X_train)

# Hyper-parameters of the two tree ensembles, kept as plain dicts so they can be
# read and tuned in one place. Both models use the same fixed seed.
rf_params = {
    'n_estimators': 870,       # trees in the forest
    'max_depth': 42,           # depth limit per tree
    'min_samples_split': 5,    # samples required to split an internal node
    'min_samples_leaf': 2,     # samples required in each leaf
    'random_state': 42,
}
et_params = {
    'n_estimators': 500,       # trees in the forest
    'max_depth': 35,           # depth limit per tree
    'min_samples_split': 4,    # samples required to split an internal node
    'min_samples_leaf': 2,     # samples required in each leaf
    'random_state': 42,
}

rf = RandomForestClassifier(**rf_params)
et = ExtraTreesClassifier(**et_params)

# Combine both ensembles in a soft-voting classifier.
ensemble_members = [('rf', rf), ('et', et)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

# Train the ensemble on the training split and predict the held-out validation rows
# (fit returns the fitted estimator, so the two calls can be chained).
y_val_pred = voting_clf.fit(X_train_split, y_train_split).predict(X_val)

# Report the confusion matrix, the per-class report and the macro-averaged F1.
print("Voting Classifier Performance on Validation Set:")
print(
    confusion_matrix(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
    sep='\n',
)
voting_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Voting Classifier Macro F1 Score: {voting_macro_f1:.4f}")

# Predict on the test data
test_predictions = voting_clf.predict(X_test)

# Decode integer class ids back to label strings. The inverse is derived from
# disposition_mapping so the encoding and decoding tables cannot drift apart.
reverse_mapping = {code: label for label, code in disposition_mapping.items()}

# Build the submission directly from the row ids and the decoded predictions.
# The predictions are wrapped in a Series that shares the row ids' index so the
# two columns line up row by row; no copy of the full test frame is needed.
submission = pd.DataFrame({
    'row_id': test_row_ids,
    'Disposition': pd.Series(test_predictions, index=test_row_ids.index).map(reverse_mapping),
})

# Create and save the submission file
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission file saved as: {submission_path}")
