In [None]:
# Install Optuna
!pip install optuna

import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


# Read the training split, the test split and the submission template.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Columns removed from both splits before modelling.
drop_columns = [
    'address',
    'educational_institution_name',
    'degree_names',
    'passing_years',
    'educational_results',
    'result_types',
    'major_field_of_studies',
]

# Training features exclude the dropped columns and the target itself;
# the test features only exclude the dropped columns.
X = train.drop(columns=[*drop_columns, 'matched_score'])
y = train['matched_score']
X_test = test.drop(columns=drop_columns)

# Partition the remaining features by dtype: object columns are handled as
# text, every other column goes through the numeric pipeline.
text_columns = [name for name, dtype in X.dtypes.items() if dtype == 'object']
num_columns = list(X.select_dtypes(exclude=['object']).columns)

# Function to concatenate text data for Tfidf processing
def combine_text_columns(data_frame, to_combine):
    """Join the selected columns of each row into one space-separated string.

    Missing cells (NaN/None) contribute an empty string instead of the
    literal text "nan", so missing values do not introduce a spurious token
    into whatever consumes the combined text.

    Args:
        data_frame: DataFrame holding at least the columns in ``to_combine``.
        to_combine: Column labels to merge, in the order they should appear.

    Returns:
        A Series indexed like ``data_frame`` with one combined string per row.
    """
    subset = data_frame[to_combine]
    # Stringify first, then blank out the positions that were missing in the
    # original data; this works whether or not astype(str) preserves NaN.
    as_text = subset.astype(str).where(subset.notna(), "")
    return as_text.apply(" ".join, axis=1)

# Preprocessing: numeric columns are median-imputed and then standardised;
# text columns are merged into one string per row and TF-IDF vectorised.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
text_steps = [
    ('combine', FunctionTransformer(combine_text_columns, kw_args={'to_combine': text_columns})),
    ('vectorizer', TfidfVectorizer()),
]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(numeric_steps), num_columns),
        ('txt', Pipeline(text_steps), text_columns),
    ],
    remainder='passthrough',  # pass through any column not listed above
)

# Fit on the training features, then reuse the fitted transformer on the test set.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

print("Processed feature shape:", X_processed.shape)
print("Target shape:", y.shape)

# Sanity check: exactly one processed row per target value.
assert X_processed.shape[0] == y.shape[0], "Mismatch in the number of samples between features and target"



# Running minimum of the cross-validated MSE over all trials; updated by
# `objective` on every call.
lowest_mse = np.inf


# Optuna optimization for Gradient Boosting
def objective(trial):
    """Evaluate one Gradient Boosting configuration sampled from ``trial``.

    Samples hyperparameters, scores the resulting model with 5-fold
    cross-validation on ``X_processed``/``y``, and returns the mean MSE so
    that a study created with ``direction='minimize'`` can minimise it.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The mean cross-validated mean squared error (positive).

    Side effects:
        Updates the module-level ``lowest_mse`` and prints a progress line.
    """
    global lowest_mse
    gbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
    }
    # Fix the seed: with subsample < 1 the model is stochastic, and an
    # unseeded estimator would make trial scores differ from run to run
    # for reasons unrelated to the hyperparameters being compared.
    gbm = GradientBoostingRegressor(random_state=42, **gbm_params)
    score = cross_val_score(gbm, X_processed, y, n_jobs=-1, cv=5, scoring='neg_mean_squared_error').mean()
    mse = -score  # scorer returns negated MSE; flip the sign back
    lowest_mse = min(lowest_mse, mse)
    print(f'Trial {trial.number}: MSE = {mse}, Lowest MSE so far = {lowest_mse}')
    return mse

# Run the hyperparameter search. TPE is Optuna's default sampler; it is
# created explicitly here only to seed it, so repeated runs propose the same
# sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Best hyperparameters and lowest MSE ever faced
best_gbm_params = study.best_params
print("Best hyperparameters for Gradient Boosting:", best_gbm_params)
print("Lowest MSE encountered during the study:", lowest_mse)

# Set up the optimized GBM in the stacking ensemble
estimators = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    # Seeded like the other stochastic estimators so the final fit is reproducible.
    ('gbm', GradientBoostingRegressor(random_state=42, **best_gbm_params)),
    # with_mean=False lets the scaler accept sparse input, which the
    # TF-IDF features may produce.
    ('svr', make_pipeline(StandardScaler(with_mean=False), SVR(C=1, epsilon=0.1)))
]
stack_reg = StackingRegressor(
    estimators=estimators,
    final_estimator=ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
)

# Train the stacked model
stack_reg.fit(X_processed, y)

# Prepare final predictions for submission
final_predictions = stack_reg.predict(X_test_processed)

# Save submission
sample_submission['matched_score'] = final_predictions
sample_submission.to_csv('finsubmission.csv', index=False)
print("Final submission saved.")
