In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob


# Custom transformer to convert all inputs to string
class StringConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts its input to a NumPy array of strings."""

    def fit(self, X, y=None):
        """Return the transformer unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Build a new NumPy array from ``X`` with string dtype.

        ``y`` is accepted for signature compatibility and is not used.
        """
        as_strings = np.array(X, dtype=np.str_)
        return as_strings

# Custom transformer to extract sentiment from text
class TextSentimentExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each text entry to its TextBlob polarity."""

    def fit(self, X, y=None):
        """Return the transformer unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X):
        """Return the sentiment polarity of every entry of ``X`` as one column.

        ``X`` is iterated entry by entry and each entry is handed to
        ``TextBlob``; the result is a NumPy array reshaped to ``(-1, 1)``.
        """
        # Assumes iterating X yields the individual text entries (e.g. a Series).
        polarity_scores = [TextBlob(entry).sentiment.polarity for entry in X]
        sentiment_column = np.array(polarity_scores).reshape(-1, 1)
        return sentiment_column

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


# Identify text and numeric columns (the target 'matched_score' is not a feature)
text_columns = [col for col in train.columns if train[col].dtype == 'object']
num_columns = [col for col in train.columns if train[col].dtype in ['int64', 'float64'] and col != 'matched_score']

# Fill missing values in the text columns only.
# A frame-wide fillna('missing') would also write the string 'missing' into
# numeric columns (and the target) wherever they contain NaN, which the median
# imputer and the scaler below cannot process. Numeric gaps are deliberately
# left as NaN so that SimpleImputer handles them inside the pipeline.
text_fill_values = {col: 'missing' for col in text_columns}
train = train.fillna(text_fill_values)
test = test.fillna(text_fill_values)

# Create a single text feature by concatenating all text columns
train['combined_text'] = train[text_columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)
test['combined_text'] = test[text_columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)

# Define preprocessing for numeric columns and the combined text column:
#   - numeric columns: median imputation followed by standard scaling
#   - combined text:   TF-IDF features plus one sentiment-polarity column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_columns),
        ('txt', TfidfVectorizer(), 'combined_text'),
        ('sentiment', TextSentimentExtractor(), 'combined_text')
    ]
)

# Separate features and target from the labelled training data
X = train.drop(columns=['matched_score'])  # Features
y = train['matched_score']  # Target

# Hold out 20% of the labelled data for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessing on the training split only, then apply it to the hold-out split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Objective function for regression with XGBoost
def objective_xgb(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBoost regressor.

    Samples a hyperparameter set from ``trial``, scores the resulting model on
    the module-level ``X_train_processed`` / ``y_train`` data, stores the MSE
    on the trial under the ``"mse"`` user attribute and returns it.
    """
    # Search space; the suggest_* calls are issued in this order.
    tuned = {
        'max_depth': trial.suggest_int('max_depth', 15, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.04),
        'n_estimators': trial.suggest_int('n_estimators', 350, 600),
        'subsample': trial.suggest_float('subsample', 0.85, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.85, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.85, 1.0),
    }

    # Fixed settings shared by every trial, combined with the sampled ones.
    regressor = XGBRegressor(
        objective='reg:squarederror',
        verbosity=1,
        random_state=42,
        **tuned,
    )

    fold_scores = cross_val_score(
        regressor,
        X_train_processed,
        y_train,
        cv=5,
        scoring='neg_mean_squared_error',
    )

    # The scorer reports negated MSE, so flip the sign to get a value to minimize.
    mean_mse = -fold_scores.mean()
    trial.set_user_attr("mse", mean_mse)
    return mean_mse

# Enable detailed logging
def logging_callback(study, trial):
    """Print a one-line progress report after a trial finishes.

    The line shows the finished trial's number and parameters together with
    the study's best objective value so far.
    """
    best_so_far = study.best_value
    message = f"Trial {trial.number}: Best value so far = {best_so_far}, Parameters = {trial.params}"
    print(message)

# Optuna optimization
study = optuna.create_study(direction='minimize')  # Minimize MSE
study.optimize(objective_xgb, n_trials=30, callbacks=[logging_callback])  # Add logging callback

# Best hyperparameters
best_params = study.best_params
print('Best Parameters:', best_params)

# Train the model with the best hyperparameters.
# study.best_params contains only the values sampled through trial.suggest_*,
# so the fixed settings used inside objective_xgb are passed explicitly here;
# otherwise the final model would silently fall back to library defaults
# (notably a different random_state than the one used during tuning).
best_model = XGBRegressor(
    objective='reg:squarederror',
    verbosity=1,
    random_state=42,
    **best_params,
)
best_model.fit(X_train_processed, y_train)

# Predict on the held-out split
test_predictions = best_model.predict(X_test_processed)

# Calculate and print MSE on the held-out split
final_mse = mean_squared_error(y_test, test_predictions)
print(f"Final MSE on the test set: {final_mse}")