<a href="https://colab.research.google.com/github/UznetDev/Data-science-home-work/blob/main/06_Noy_2024_home_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
!pip install dill



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
import joblib
import dill as pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Read the training table and discard the two identifier columns,
# which carry no predictive signal.
train_df = pd.read_csv('data/train.csv').drop(columns=['Row#', 'id'])
# The competition test table is loaded untouched.
test_df = pd.read_csv('data/test.csv')

train_df.head(1)

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,12.5,0.25,0.25,0.25,0.75,69.7,42.1,58.2,50.2,24.3,41.2,16.0,0.26,0.477941,0.423927,34.043022,6079.08526


## Functions

In [None]:
def print_metrics(model, X_train, X_test, y_train, y_test, cv_mae):
    """Print a small report of train/test errors and cross-validated MAE.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, X_test : feature tables passed straight to ``model.predict``.
    y_train, y_test : true targets matching ``X_train`` / ``X_test``.
    cv_mae : array-like of float
        Per-fold scores in the *negated* MAE convention (values <= 0), e.g.
        the result of ``cross_val_score(..., scoring='neg_mean_absolute_error')``.

    Returns
    -------
    None
        The report is written to stdout.
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    # Accept plain sequences as well as ndarrays.
    cv_scores = np.asarray(cv_mae, dtype=float)
    # Scores are negated MAE, so flip the sign of the mean only.
    # The standard deviation is a spread and must stay non-negative.
    cv_mean = -cv_scores.mean()
    cv_std = cv_scores.std()

    text = f"""
    Model Performance Metrics:
    --------------------------
    CV Mean MAE: {cv_mean:.2f}, {cv_std:.2f}

           Train  | Test
         |-----------------|
     MAE | {train_mae:.2f} | {test_mae:.2f} |
         |-----------------|
    RMSE | {train_rmse:.2f} | {test_rmse:.2f} |
         |-----------------|

        R^2 Score (Test): {r2:.2f}
    """
    print(text)

## Feature Engineering

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested columns.

    Parameters
    ----------
    columns : column label(s) used to index the incoming frame as ``X[columns]``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns``."""
        selected = X[self.columns]
        return selected

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives ratio and interaction features.

    Reads ``fruitset``, ``seeds``, ``fruitmass`` and ``AverageOfUpperTRange``
    from the input frame, appends five derived columns, and drops
    ``AverageOfUpperTRange`` from the result.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a new frame with the engineered columns added."""
        seeds = X['seeds']
        fruitset = X['fruitset']
        fruitmass = X['fruitmass']
        upper_temp = X['AverageOfUpperTRange']

        # The 1e-5 offset in the denominators guards the ratios against
        # division by zero.
        enriched = X.assign(
            FruitToSeedRatio=fruitset / (seeds + 1e-5),
            fruitset_seeds=seeds * fruitset / 100,
            fruitmass_seeds=seeds * fruitmass / 100,
            fruitset_fruitmass_seeds=seeds * fruitset * fruitmass / 10000,
            FruitSetToUpperTempRatio=fruitset / (upper_temp + 1e-5),
        )
        return enriched.drop(columns='AverageOfUpperTRange')


class OutlierReplacer(BaseEstimator, TransformerMixin):
    """Clip columns to the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).

    The fences are computed in ``fit`` and applied in ``transform``, so
    values outside the range are replaced by the nearest fence.

    Parameters
    ----------
    columns : iterable of column labels, or None
        Columns to process. ``None`` means every column of the frame
        passed to ``fit``.

    Attributes
    ----------
    bounds_ : dict
        Maps each processed column to its ``(lower_bound, upper_bound)`` pair.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Compute per-column IQR fences from ``X`` and store them in ``bounds_``."""
        # Resolve the target columns locally instead of overwriting the
        # constructor parameter: mutating ``self.columns`` would make a later
        # re-fit on a frame with different columns reuse the stale list.
        target_columns = list(X.columns) if self.columns is None else list(self.columns)

        self.bounds_ = {}
        for column in target_columns:
            q1 = X[column].quantile(0.25)
            q3 = X[column].quantile(0.75)
            iqr = q3 - q1
            self.bounds_[column] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column clipped to its fences."""
        X_transformed = X.copy()
        for column, (lower_bound, upper_bound) in self.bounds_.items():
            # Vectorised equivalent of the element-wise "replace by nearest
            # bound" rule; NaN values are left untouched.
            X_transformed[column] = X_transformed[column].clip(
                lower=lower_bound, upper=upper_bound
            )
        return X_transformed

In [None]:
# Separate the target from the predictors.
y = train_df['yield']
X = train_df.drop(columns=['yield'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model

In [None]:
# Full pipeline: select raw columns -> clip outliers -> derive features -> forest.
model = Pipeline(steps=[
    (
        'column_selector',
        ColumnSelector(columns=['seeds', 'fruitmass', 'fruitset', 'AverageOfUpperTRange']),
    ),
    ('outlier_replacer', OutlierReplacer()),
    ('feature_engineer', FeatureEngineer()),
    (
        'model',
        RandomForestRegressor(
            n_estimators=497,
            criterion='absolute_error',
            max_depth=9,
            max_features=0.8092853952180284,
            min_samples_split=10,
            min_samples_leaf=4,
            random_state=42,
        ),
    ),
])

# Shuffled 3-fold CV scored with negated MAE (higher is better, values <= 0).
cv = KFold(n_splits=3, shuffle=True, random_state=42)
neg_mae = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

In [None]:
# Fit on the training split only. Fitting on the full X would let the model
# see the X_test rows, so the hold-out metrics reported afterwards would be
# optimistic. Refit on all of X before exporting if a full-data model is wanted.
model.fit(X_train, y_train)

In [None]:
# Report train/test errors together with the cross-validated MAE.
print_metrics(
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_mae=neg_mae,
)


    Model Performance Metrics:
    --------------------------
    CV Mean MAE: 240.90, -1.81

           Train  | Test
         |-----------------|
     MAE | 219.06 | 218.33 |
         |-----------------|
    RMSE | 345.68 | 347.27 |
         |-----------------|

        R^2 Score (Test): 0.94
    


In [None]:
# Serialise the pipeline to disk; `pickle` is the dill module imported under
# that alias at the top of the notebook.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)