# In [None]:
import warnings
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, HuberRegressor, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Install a blanket "ignore" filter so library warnings do not clutter the output.
warnings.simplefilter("ignore")

# Load the three CSV inputs from the current working directory.
data_train, data_test, data_sample = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

def cols(data):
    """Return a copy of ``data`` extended with seven engineered columns.

    The input frame is left untouched; the new columns are appended to a
    fresh DataFrame in the order ``col1`` .. ``col7``:

    * ``col1`` - mean of ``fruitmass`` and ``fruitset``
    * ``col2`` - mean of ``MinOfUpperTRange`` and ``MinOfLowerTRange``
    * ``col3`` - mean of ``MaxOfUpperTRange`` and ``MaxOfLowerTRange``
    * ``col4`` - ``AverageOfUpperTRange`` squared
    * ``col5`` - ``seeds`` squared
    * ``col6`` - ``bumbles`` squared
    * ``col7`` - squared difference of ``fruitmass`` and ``fruitset``

    Args:
        data: DataFrame containing every source column named above.

    Returns:
        A new DataFrame with all original columns plus ``col1`` .. ``col7``.

    Raises:
        KeyError: If any of the required source columns is missing.
    """
    fruit_mass = data['fruitmass']
    fruit_set = data['fruitset']

    # ``assign`` builds a new frame, so the caller's DataFrame is not mutated
    # (the previous in-place writes silently altered the argument).
    return data.assign(
        col1=(fruit_mass + fruit_set) / 2,
        col2=(data['MinOfUpperTRange'] + data['MinOfLowerTRange']) / 2,
        col3=(data['MaxOfUpperTRange'] + data['MaxOfLowerTRange']) / 2,
        col4=data['AverageOfUpperTRange'] ** 2,
        col5=data['seeds'] ** 2,
        col6=data['bumbles'] ** 2,
        col7=(fruit_mass - fruit_set) ** 2,
    )



# Single seed shared by the train/validation split and both random forests so
# that CV scores, hold-out metrics and the submission are reproducible.
RANDOM_STATE = 42


def _make_forest():
    """Build the random forest used both as a base learner and as the meta-learner."""
    return RandomForestRegressor(
        n_estimators=300,
        min_samples_split=15,
        min_samples_leaf=10,
        max_features='log2',
        max_depth=9,
        criterion='absolute_error',
        random_state=RANDOM_STATE,
    )


def _snap_to_nearest(predictions, allowed_values):
    """Replace every prediction with the closest entry of ``allowed_values``.

    Args:
        predictions: 1-D array-like of raw model outputs.
        allowed_values: 1-D array-like of candidate target values.

    Returns:
        NumPy array with one element of ``allowed_values`` per prediction.
        On an exact tie the candidate that appears first wins.
    """
    preds = np.asarray(predictions)
    allowed = np.asarray(allowed_values)
    # Broadcast to a (n_predictions, n_allowed) distance matrix and pick the
    # closest candidate per row; this replaces a Python-level loop.
    nearest_idx = np.abs(preds[:, None] - allowed[None, :]).argmin(axis=1)
    return allowed[nearest_idx]


# --- Feature engineering -----------------------------------------------------
data = cols(data_train)

X = data.drop(["yield", 'id', 'Row#'], axis=1)
y = data['yield']

# `date` is the feature-engineered test frame still carrying its id columns;
# `data_test` is rebound to the model-ready feature matrix.
date = cols(data_test)
data_test = date.drop(['id', 'Row#'], axis=1)

# --- Hold-out split ----------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# --- Model definition --------------------------------------------------------
estimators = [
    ('random_forest', _make_forest()),
    ('lasso', Lasso(alpha=0.1)),
    ('huber', HuberRegressor(epsilon=1.4, alpha=0.01)),
    ('linear', LinearRegression()),
]

stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=_make_forest(),
)

pipeline = Pipeline([
    ('stacking', stacking_regressor),
])

# --- Cross-validation on the training portion --------------------------------
# The scorer returns negated MAE, so flip the sign before reporting.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print("Cross-Validation MAE Scores:", -cv_scores)
print("Average Cross-Validation MAE Score:", -cv_scores.mean())

pipeline.fit(X_train, y_train)

# --- Hold-out evaluation -----------------------------------------------------
# Snap only to yields seen in the training portion: using every yield in
# data_train would include the hold-out labels themselves and make the
# metrics below optimistic.
y_pred = _snap_to_nearest(pipeline.predict(X_test), y_train.unique())

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R^2 Score:", r2)

# --- Submission --------------------------------------------------------------
# For the real test set every labelled yield is a legitimate snapping target.
unique_yield_values = data_train["yield"].unique()
y_test_prob_rf = _snap_to_nearest(pipeline.predict(data_test), unique_yield_values)

subm = pd.read_csv("sample_submission.csv")
subm['yield'] = y_test_prob_rf
subm.to_csv("submission_colab.csv", index=False)
