In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load data
# Only a reproducible 10% sample of the feature rows is used (frac=0.1,
# random_state=0); the label file is read in full.
features = pd.read_csv('train.csv').sample(frac=0.1, random_state=0)
targets = pd.read_csv('train_labels.csv')

# Convert 'Id' to str type so it can be joined against 'session_id'
# (assumes 'session_id' is read from the CSV as strings -- TODO confirm).
features['Id'] = features['Id'].astype(str)

# Merge datasets
# An inner join keeps only labels whose feature row survived the sampling
# above. A right join would retain every label row, leaving the unmatched
# ones with all-NaN features that the imputers later replace with the column
# mean / mode, so the model would mostly be trained on identical made-up rows.
data = pd.merge(features, targets, left_on='Id', right_on='session_id', how='inner')

# Fail early with a clear message instead of fitting on an empty frame.
if data.empty:
    raise ValueError(
        "No rows matched between features['Id'] and targets['session_id']; "
        "check that the two key columns use the same format."
    )

# Separate into features and target variable
X = data.drop(columns=['session_id', 'correct'])
y = data['correct']

# Preprocessing setup.
# Columns are split by dtype: int64/float64 columns go through the numeric
# branch, object columns through the categorical branch. The key columns
# 'Id' and 'session_id' are removed from the categorical group so they are
# not one-hot encoded.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
object_columns = X.select_dtypes(include=['object']).columns
categorical_features = object_columns.difference(['Id', 'session_id'])

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. Columns listed in neither group
# are not passed to a transformer here.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Classifier: XGBoost with its default constructor arguments; the values
# that matter are supplied by the grid below.
model = XGBClassifier()

# Chain preprocessing and the classifier so both are fitted together inside
# each cross-validation fold.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Deliberately small search space: only the tree depth varies, the other
# two parameters are pinned to a single value each (2 candidates in total).
param_grid = {
    'model__learning_rate': [0.1],
    'model__n_estimators': [100],
    'model__max_depth': [3, 5],
}

# 5-fold grid search scored by F1, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [3]:
# Run the cross-validated search on the training frame.
grid_search.fit(X, y)

# Report the winning parameter combination and its cross-validation score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# The best pipeline found by the search is reused for every prediction below.
best_model = grid_search.best_estimator_

# In-sample check: predictions on the same rows the search was fitted on,
# so this F1 is a training-data figure, not a held-out one.
y_pred = best_model.predict(X)
f1 = f1_score(y, y_pred)
print(f"F1 Score on training data: {f1}")

# Load the test data (adjust the filename as per your dataset).
# NOTE(review): assumes test.csv carries the same feature columns as X and
# an 'Id' column -- confirm against the actual file.
test_data = pd.read_csv('test.csv')

# Score the test rows with the best pipeline.
test_predictions = best_model.predict(test_data)

# Write one prediction per test 'Id' to disk.
submission = pd.DataFrame({'Id': test_data['Id'], 'Prediction': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")

Fitting 5 folds for each of 2 candidates, totalling 10 fits
