<a href="https://colab.research.google.com/github/UznetDev/Aiogram-Bot-Template/blob/main/FEUTURE_SELECTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import *
from sklearn.model_selection import *
from sklearn.pipeline import *
from sklearn.compose import *
from sklearn.impute import *
from sklearn.preprocessing import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.feature_selection import *
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
import warnings
warnings.filterwarnings('ignore')

In [5]:
def feature_funk(df):
    df['Surface Area'] = 2 * (df['Length'] * df['Diameter'] + df['Length'] * df['Height'] + df['Diameter'] * df['Height'])
    df['Vol'] = df['Length'] * df['Diameter'] * df['Height']
    df['Vol_Hgt'] = (4/3) * np.pi * (df['Length']/2) * (df['Diameter']/2) * (df['Height']/2)

    df['Log_Wt'] = df['Weight'].agg(lambda x: np.log(x + 1) if x > 0 else np.nan)
    df['Length Bins'] = pd.qcut(df['Length'], q=4, labels=False)

    df['Wt_Red_Aft_Shk'] = df['Weight'] - df['Shucked Weight']
    df['Vis_Wt_Prop'] = df['Viscera Weight'] / df['Weight']

    df['Len_to_Dia_Ratio'] = df['Length'] / df['Diameter']
    df['Hgt_to_Wt_Ratio'] = df['Height'] / df['Weight']

    sex = {'I': 1, 'F': 3, 'M': 2}
    df['Sex'] = df['Sex'].replace(sex)
    df = df[df['Sex'].isin([1, 2, 3])]
    # df.loc[:, 'Sex'] = pd.to_numeric(df['Sex'], errors='coerce')
    df['Sex'] = pd.to_numeric(df['Sex'], errors='coerce')

    df.dropna(subset=['Sex'], inplace=True)

    df['Len_x_Dia'] = df['Length'] * df['Diameter']

    return df

df = feature_funk(pd.read_csv('train.csv'))

In [None]:
X = df.drop(columns=['id', 'Age'])
y = df['Age']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
            ('scaler', StandardScaler())
        ]), X.columns),
        # ('cat', Pipeline([
        #     ('imputer', SimpleImputer(strategy='most_frequent')),
        #     ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
        # ]), categorical_cols)
    ])

base_models = [
    ('lr', LinearRegression(fit_intercept=True)),
    ('huber', HuberRegressor(alpha=0.0001)),
    ('ransac', RANSACRegressor(min_samples=0.5, residual_threshold=3.0)),
    ('theil_sen', TheilSenRegressor()),
    ('ridge', Ridge(alpha=1.0, fit_intercept=True)),
    ('lasso', Lasso(alpha=0.01, fit_intercept=True)),
    ('elasticnet', ElasticNet())
]

cv = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=HuberRegressor(alpha=0.0001),
    cv=cv
)

selector = RFECV(HuberRegressor(alpha=0.0001), step=1, cv=cv)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector),
    ('model', stacking_model)

])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')


mean_cv_mae = -np.mean(cv_scores)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'CV RMAE: {np.mean(-cv_scores):.4f}')
print(f'Test MAE: {mae:.4f}')
print(f'Test MSE: {mse:.4f}')
print(f'Test R^2 Score: {r2:.4f}')

In [None]:
train_df = pd.read_csv('train.csv')

X = train_df.drop(columns=['id', 'Age'])
y = train_df['Age']


categorical_cols = ['Sex']
numerical_cols = X.columns.difference(categorical_cols)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
            ('scaler', RobustScaler())
        ]), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
        ]), categorical_cols)
    ])


# model = GradientBoostingRegressor(random_state=42)
base_models = [
    ('lr', LinearRegression(fit_intercept=True)),
    ('huber', HuberRegressor(alpha=0.0001)),
    ('ransac', RANSACRegressor(min_samples=0.5, residual_threshold=3.0)),
    ('theil_sen', TheilSenRegressor()),
    ('ridge', Ridge(alpha=1.0, fit_intercept=True)),
    ('lasso', Lasso(alpha=0.01, fit_intercept=True)),
    ('elasticnet', ElasticNet())
]

cv = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=HuberRegressor(alpha=0.0001),
    cv=cv
)


selector = RFECV(HuberRegressor(alpha=0.0001), step=1, cv=cv)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector),
    ('model', stacking_model)

])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')


mean_cv_mae = -np.mean(cv_scores)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'CV RMAE: {np.mean(-cv_scores):.4f}')
print(f'Test MAE: {mae:.4f}')
print(f'Test MSE: {mse:.4f}')
print(f'Test R^2 Score: {r2:.4f}')

In [10]:

model = HuberRegressor()

X = df.drop(columns=['id', 'Age'])
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = np.mean(-cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error'))

y_pred = model.fit(X_train, y_train).predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print(f'Cross-validated RMSE: {cv_rmse:.4f}')
print(f'Test MAE: {mae:.4f}')
print(f'Test MSE: {mse:.4f}')
print(f'Test R^2 Score: {r2:.4f}')

Cross-validated RMSE: 1.3763
Test MAE: 1.3293
Test MSE: 3.9524
Test R^2 Score: 0.5869


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso, HuberRegressor, TheilSenRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor

train_df = pd.read_csv('train.csv')

def feature_funk(df):
    df['Surface Area'] = 2 * (df['Length'] * df['Diameter'] + df['Length'] * df['Height'] + df['Diameter'] * df['Height'])
    df['Volume'] = df['Length'] * df['Diameter'] * df['Height']
    df['Log Weight'] = df['Weight'].apply(lambda x: np.log(x + 1) if x > 0 else np.nan)
    df['Length Bins'] = pd.qcut(df['Length'], q=4, labels=False)

    df['Weight Reduction After Shucking'] = df['Weight'] - df['Shucked Weight']
    df['Viscera Weight Proportion'] = df['Viscera Weight'] / df['Weight']

    df['Length to Diameter Ratio'] = df['Length'] / df['Diameter']
    df['Height to Weight Ratio'] = df['Height'] / df['Weight']

    sex = {'I': 1, 'F': 3, 'M': 2}
    df['Sex'] = df['Sex'].replace(sex)
    df = df[df['Sex'].isin([1, 2, 3])]
    df['Sex'] = pd.to_numeric(df['Sex'])
    return df

train_df = feature_funk(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = pd.to_numeric(df['Sex'])


In [None]:
file_path = 'train.csv'
train_df = pd.read_csv(file_path)

# Feature engineering function
def feature_funk(df):
    df['Surface Area'] = 2 * (df['Length'] * df['Diameter'] + df['Length'] * df['Height'] + df['Diameter'] * df['Height'])
    df['Volume'] = df['Length'] * df['Diameter'] * df['Height']
    df['Log Weight'] = df['Weight'].apply(lambda x: np.log(x + 1) if x > 0 else np.nan)
    df['Length Bins'] = pd.qcut(df['Length'], q=4, labels=False)

    df['Weight Reduction After Shucking'] = df['Weight'] - df['Shucked Weight']
    df['Viscera Weight Proportion'] = df['Viscera Weight'] / df['Weight']

    df['Length to Diameter Ratio'] = df['Length'] / df['Diameter']
    df['Height to Weight Ratio'] = df['Height'] / df['Weight']

    sex = {'I': 1, 'F': 3, 'M': 2}
    df['Sex'] = df['Sex'].replace(sex)
    df = df[df['Sex'].isin([1, 2, 3])]
    df.loc[:, 'Sex'] = pd.to_numeric(df['Sex'])
    return df

# Apply feature engineering function
train_df = feature_funk(train_df)

# Separate features and target variable
X = train_df.drop(columns=['id', 'Age'])
y = train_df['Age']

# Split the data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include='object').columns.tolist()
numerical_cols = X_train.drop(columns=categorical_cols).columns.tolist()

# Updated numerical preprocessing pipeline with feature engineering
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Initial imputation
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('imputer2', SimpleImputer(strategy='mean')),s
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols)
    ])

base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('lasso', Lasso(alpha=0.01)),
    ('ths', TheilSenRegressor(fit_intercept=True, random_state=42))
]
meta_model = HuberRegressor()

stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking', stacking_regressor)
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
mean_cv_mae = -np.mean(cv_scores)
print(f'Cross-validated Mean Absolute Error (MAE) on training data: {mean_cv_mae:.4f}')


pipeline.fit(X_train, y_train)

test_score = pipeline.score(X_test, y_test)
print(f'Test R^2 Score: {test_score:.4f}')


KeyboardInterrupt: 