# In [None]:
### Regression

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt


### Model Part --> Regression

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.pipeline import Pipeline



### Regression
# Candidate regressors as (display name, unfitted estimator) pairs.
models = [
    ('Linear Regression', LinearRegression()),

    # Uncomment to add more models as needed
    # ('Decision Tree Regressor', DecisionTreeRegressor()),
    # ('Random Forest Regressor', RandomForestRegressor()),

    ('Gradient Boosting Regressor', GradientBoostingRegressor()),
    ('XGBoost Regressor', XGBRegressor()),
    ('LightGBM Regressor', LGBMRegressor()),
    ('CatBoost Regressor', CatBoostRegressor(verbose=0)),  # Set verbose=0 to suppress output
]


# Fit every candidate on the training split, then report MSE / MAE / R² on
# both splits and plot predicted vs. actual values for the test split.
# `models` is a list of tuples, so it is iterated directly: a list has no
# `.items()` method and calling it raises AttributeError.
for name, model in models:
    # Single-step pipeline: the features passed in are already scaled.
    pipeline = Pipeline(steps=[('regressor', model)])

    # Fit the model
    pipeline.fit(x_train_scaled, y_train)

    # Metrics on the training set
    y_train_pred = pipeline.predict(x_train_scaled)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    print(f'{name} Train MSE: {mse_train:.4f}')
    print(f'{name} Train MAE: {mae_train:.4f}')
    print(f'{name} Train R²: {r2_train:.4f}')

    # Metrics on the test set
    y_test_pred = pipeline.predict(x_test_scaled)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)

    print(f'{name} Test MSE: {mse_test:.4f}')
    print(f'{name} Test MAE: {mae_test:.4f}')
    print(f'{name} Test R²: {r2_test:.4f}')

    # Plot predictions vs actual values for the test set. The dashed red
    # diagonal marks a perfect prediction; its end points are computed once.
    diagonal = [min(y_test), max(y_test)]
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_test_pred, alpha=0.6, color='blue')
    plt.plot(diagonal, diagonal, color='red', linestyle='--')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'{name} Predictions vs Actual Values (Test Set)')
    plt.grid(True)
    plt.show()







# Define the objective function for Optuna
def objective(trial):
    """Train one sampled boosting configuration and return its test-set MSE.

    The trial first picks a model family ("GradientBoosting", "XGBoost",
    "LightGBM" or "CatBoost") and then samples that family's hyperparameters.
    The model is fitted on the module-level ``x_train_scaled`` / ``y_train``
    and scored on ``x_test_scaled`` / ``y_test``.

    The fitted estimator is stored on the trial under the ``'best_model'``
    user attribute so that it can be read back from ``study.best_trial``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean squared error of the fitted model on the test split.

    Raises:
        ValueError: If the sampled model name is not one of the known families.
    """
    # NOTE(review): hyperparameters are selected on the same test split that
    # is later used for reporting, which makes the reported score optimistic.
    # Consider a separate validation split or cross-validation.

    # Choose which model to optimize
    model_name = trial.suggest_categorical(
        "model", ["GradientBoosting", "XGBoost", "LightGBM", "CatBoost"]
    )

    # `suggest_float` (optionally with log=True) replaces the deprecated
    # `suggest_uniform` / `suggest_loguniform` methods.
    if model_name == "GradientBoosting":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        # Seeded like the other three families so trials are reproducible.
        model = GradientBoostingRegressor(**params, random_state=42)

    elif model_name == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }
        model = XGBRegressor(**params, random_state=42)

    elif model_name == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", -1, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 20, 150),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        # LightGBM only applies `subsample` (bagging) when the bagging
        # frequency is non-zero; without this the sampled value has no effect.
        model = LGBMRegressor(**params, subsample_freq=1, random_state=42)

    elif model_name == "CatBoost":
        params = {
            "iterations": trial.suggest_int("iterations", 100, 500),
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        }
        model = CatBoostRegressor(**params, random_state=42, verbose=0)

    else:
        # Unreachable with the choices above; guards against `model` being
        # unbound if the choice list and the branches drift apart.
        raise ValueError(f"Unknown model name: {model_name!r}")

    # Train the model
    model.fit(x_train_scaled, y_train)

    # Keep the fitted estimator with the trial so the winning model can be
    # retrieved afterwards via study.best_trial.user_attrs['best_model'].
    # NOTE(review): this relies on a storage that accepts arbitrary objects
    # (e.g. the default in-memory storage) — confirm if a database is used.
    trial.set_user_attr("best_model", model)

    # Predict on the test set
    y_pred = model.predict(x_test_scaled)

    # Evaluate using Mean Squared Error
    return mean_squared_error(y_test, y_pred)

# Run a 50-trial TPE search that minimises the value returned by `objective`.
# NOTE(review): `optuna` and `TPESampler` are not imported in the visible part
# of this file — confirm they are brought into scope elsewhere.
study = optuna.create_study(sampler=TPESampler(), direction="minimize")
study.optimize(objective, n_trials=50)

# Report the best objective value and the hyperparameters that produced it.
print("Best trial:", f"MSE: {study.best_value:.4f}", sep="\n")
print("Best hyperparameters:", study.best_trial.params)


##########################################################################################################


# Fixed CatBoost hyperparameters for the final regression model.
param_rest = dict(
    objective='RMSE',  # Root Mean Squared Error loss for regression
    colsample_bylevel=0.09130944091779239,
    depth=11,
    boosting_type='Plain',
    learning_rate=0.08515526764930864,
    bootstrap_type='MVS',
    min_data_in_leaf=100,
)

# Build the final regressor: silent logging, fixed seed, 1000 iterations.
final_model_new = CatBoostRegressor(
    iterations=1000,
    random_state=212,
    logging_level='Silent',
    **param_rest,
)

# Train, then predict on the held-out features.
# NOTE(review): X_train_new / y_train_resample / X_test_new are defined
# outside this section — confirm they line up with y_test used below.
final_model_new.fit(X_train_new, y_train_resample)
y_pred = final_model_new.predict(X_test_new)

# Score the predictions and report both metrics.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

##########################################################################################################

# Plot the feature importances of the best model found by the Optuna study.
best_trial = study.best_trial

# Use the fitted estimator stored on the trial when there is one. `.get`
# avoids a KeyError when the objective did not store a 'best_model' attribute.
best_model = best_trial.user_attrs.get('best_model')

if best_model is None:
    # Nothing was stored: rebuild the winning model from its sampled
    # hyperparameters and refit it on the training data.
    best_params = dict(best_trial.params)
    best_model_name = best_params.pop("model")  # the model choice is not a constructor argument
    model_factories = {
        "GradientBoosting": (GradientBoostingRegressor, {}),
        "XGBoost": (XGBRegressor, {"random_state": 42}),
        "LightGBM": (LGBMRegressor, {"random_state": 42}),
        "CatBoost": (CatBoostRegressor, {"random_state": 42, "verbose": 0}),
    }
    model_class, fixed_kwargs = model_factories[best_model_name]
    best_model = model_class(**best_params, **fixed_kwargs)
    best_model.fit(x_train_scaled, y_train)

# `feature_importances_` is exposed by all four candidate model families,
# whereas `get_feature_importance()` exists only on CatBoost models.
feature_importance = best_model.feature_importances_
positions = range(len(feature_importance))

plt.figure(figsize=(10, 6))
plt.barh(positions, feature_importance, align="center")
plt.yticks(positions, [f"Feature {i}" for i in positions])
plt.xlabel('Feature Importance')
plt.ylabel('Features')
# Title names the model actually plotted, which is not necessarily CatBoost.
plt.title(f'{type(best_model).__name__} Feature Importance')
plt.show()


##########################################################################
## PCA

from sklearn.decomposition import PCA


# Project the scaled features with PCA; n_components=0.95 is passed as a
# variance fraction rather than a fixed component count.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Report the per-component and the total explained variance.
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained by the selected components: {np.sum(pca.explained_variance_ratio_)}")

# A scatter plot is only drawn when exactly two components were kept;
# otherwise just report how many components were retained.
if X_pca.shape[1] != 2:
    print(f"Number of components retained: {X_pca.shape[1]}")
else:
    plt.figure(figsize=(8, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')  # colour by target
    plt.colorbar()
    plt.title('PCA of Iris Dataset (95% Variance Retained)')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()

#####################################################################
### Cross Validation

# LogisticRegression is imported here because this section uses it before
# the only other import of it further down in the file.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a logistic regression on the scaled features.
model = LogisticRegression(max_iter=200)
cv_scores = cross_val_score(model, X_scaled, y, cv=5)

# Show the result; previously the scores were computed but never reported.
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")



### Feature importance

# cross_val_score only fits clones of the estimator, so `model` may still be
# unfitted here; fit it when it exposes neither kind of fitted attribute.
if not (hasattr(model, 'feature_importances_') or hasattr(model, 'coef_')):
    model.fit(X_scaled, y)

# Get feature importances
if hasattr(model, 'feature_importances_'):
    # Tree-based estimators expose importances directly.
    importances = model.feature_importances_
else:
    # Linear models such as LogisticRegression have no `feature_importances_`;
    # use the mean absolute coefficient per feature instead. The reshape lets
    # both 1-D and (n_outputs, n_features) coefficient arrays be averaged.
    # NOTE(review): coefficient magnitudes are only comparable when the
    # features are on a common scale — confirm the model was fitted on
    # scaled data.
    coefficients = abs(model.coef_)
    importances = coefficients.reshape(-1, coefficients.shape[-1]).mean(axis=0)

# Display the feature importances
features = data.feature_names
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


##################################################################################################################################################


from sklearn.feature_selection import mutual_info_classif

# Score every feature column of X against the target y by mutual information.
# X must expose `.columns`, since the column labels are used as feature names.
mi = mutual_info_classif(X, y)

# Tabulate the scores, highest mutual information first, and print the table.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi})
    .sort_values('Mutual Information', ascending=False)
)
print(importance_df)



from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Rank the features of X with recursive feature elimination around a
# logistic regression. Eliminating down to a single feature ranks all of them.
model = LogisticRegression(max_iter=200)
rfe = RFE(estimator=model, n_features_to_select=1).fit(X, y)

# The rank assigned to each feature by the fitted selector.
ranking = rfe.ranking_

# Tabulate the features in ascending rank order and print the table.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Ranking': ranking})
    .sort_values('Ranking')
)
print(importance_df)

