In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import optuna

In [None]:
# Transformer that coerces every input value to its string representation
class StringConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts its input to a NumPy array of strings."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new NumPy array built from ``X`` with ``dtype=str``."""
        as_strings = np.array(X, dtype=str)
        return as_strings

In [None]:
# Transformer that turns each text entry into a single sentiment-polarity feature
class TextSentimentExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one TextBlob polarity value per input text."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a column vector (shape ``(n, 1)``) of polarity scores.

        ``X`` is iterated entry by entry and each entry is handed to
        ``TextBlob`` unchanged, so entries are expected to be text.
        """
        polarities = []
        for entry in X:
            polarities.append(TextBlob(entry).sentiment.polarity)
        # reshape(-1, 1) turns the flat list of scores into a single feature column
        return np.array(polarities).reshape(-1, 1)

In [None]:
# Read the three input CSVs (paths are relative to the working directory)
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Split the training columns by dtype: object columns are treated as text,
# int64/float64 columns as numeric features (the target 'matched_score' is excluded)
text_columns = [name for name, kind in train.dtypes.items() if kind == 'object']
num_columns = [
    name
    for name, kind in train.dtypes.items()
    if name != 'matched_score' and kind in ('int64', 'float64')
]

In [None]:
# Fill missing values in the text columns only.
# Numeric columns deliberately keep their NaNs: the numeric branch of the
# preprocessor imputes them with the median. Writing the string 'missing'
# into a numeric column would turn it into an object column that the
# median imputer and the scaler cannot process.
text_fill_values = {col: 'missing' for col in text_columns}
# fillna with a dict only touches the listed columns and returns a new frame,
# so re-running this cell is harmless.
train = train.fillna(text_fill_values)
test = test.fillna(text_fill_values)

In [None]:
# Build one free-text feature per row by joining all text columns with spaces
def _join_row_as_text(row):
    """Cast every value in the row to str and join them with single spaces."""
    return ' '.join(row.astype(str))


train['combined_text'] = train[text_columns].apply(_join_row_as_text, axis=1)
test['combined_text'] = test[text_columns].apply(_join_row_as_text, axis=1)

In [None]:
# Numeric branch: median-impute missing values, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Full preprocessor: numeric features plus two views of the combined text
# (TF-IDF terms and a single sentiment-polarity column)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_columns),
        ('txt', TfidfVectorizer(), 'combined_text'),
        ('sentiment', TextSentimentExtractor(), 'combined_text'),
    ]
)

In [None]:
# Target vector comes straight from the training frame
y_train = train['matched_score']

# Fit the preprocessor on the training frame only, then reuse the fitted
# transformers on the test frame
X_train = preprocessor.fit_transform(train)
X_test = preprocessor.transform(test)

In [None]:
# Optuna objective for hyperparameter tuning
def objective(trial):
    """Score one Optuna trial of GradientBoostingRegressor hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean 5-fold cross-validated MSE on ``X_train`` / ``y_train``
        (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
    }
    # Fixed seed: with subsample < 1.0 the boosting is stochastic, so without a
    # seed the same hyperparameters would score differently on every call and
    # trials would be compared partly on noise.
    model = GradientBoostingRegressor(random_state=42, **params)
    fold_scores = cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring='neg_mean_squared_error'
    )
    # The scorer reports negated MSE; flip the sign so the study minimizes MSE.
    return -fold_scores.mean()


In [None]:
# Run a small Optuna study that minimizes the objective's cross-validated MSE
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=10)


In [None]:
# Report the winning hyperparameters and the objective value they achieved
best_params = study.best_params
lowest_mse = study.best_value
print("Best hyperparameters for Gradient Boosting:", best_params)
print("Lowest MSE encountered during the study:", lowest_mse)

In [None]:
# Train the final model on the full training set with the best parameters.
# A fixed random_state keeps the fitted model (and therefore the submission)
# reproducible across reruns; the tuned 'subsample' can make boosting stochastic.
best_model = GradientBoostingRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

In [None]:
# Score the preprocessed test features with the tuned model
predictions = best_model.predict(X=X_test)

In [None]:
# Write the predictions into the submission template and save it without the index
submission_path = 'final_submission.csv'
sample_submission['matched_score'] = predictions
sample_submission.to_csv(submission_path, index=False)
print("Final submission saved.")