In [21]:
import pandas as pd

# Read the training table, then discard the 'uid' column before inspecting it.
df = pd.read_csv('train.csv')
df = df.drop(columns=['uid'])

# Summarise dtypes and non-null counts to see how much is missing per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   day                            44921 non-null  object 
 1   hour                           44787 non-null  float64
 2   minute                         44930 non-null  float64
 3   C_motion                       44883 non-null  float64
 4   feed_water_motion              44803 non-null  float64
 5   faucet_hole                    44834 non-null  float64
 6   vapour_pressure                44921 non-null  float64
 7   vapour_enthalpy                44963 non-null  float64
 8   vapour_pressure_at_division    44923 non-null  float64
 9   vapour_motion                  44923 non-null  float64
 10  feed_water_enth                44904 non-null  float64
 11  vapour_temperature             44883 non-null  float64
 12  output_electricity_generation  50400 non-null 

In [22]:

# 'day' is kept aside because it is handled separately from the numeric columns.
day_column = df['day']

# Every other column (the target included) is coerced to numeric -- values that
# cannot be parsed become NaN -- and each NaN is replaced by its column mean.
df_numeric = (
    df.drop(columns=['day'])
    .apply(pd.to_numeric, errors='coerce')
    .pipe(lambda frame: frame.fillna(frame.mean()))
)

# Missing days take the most frequent day; the column is appended last.
df_numeric['day'] = day_column.fillna(day_column.mode()[0])
df = df_numeric
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   hour                           50400 non-null  float64
 1   minute                         50400 non-null  float64
 2   C_motion                       50400 non-null  float64
 3   feed_water_motion              50400 non-null  float64
 4   faucet_hole                    50400 non-null  float64
 5   vapour_pressure                50400 non-null  float64
 6   vapour_enthalpy                50400 non-null  float64
 7   vapour_pressure_at_division    50400 non-null  float64
 8   vapour_motion                  50400 non-null  float64
 9   feed_water_enth                50400 non-null  float64
 10  vapour_temperature             50400 non-null  float64
 11  output_electricity_generation  50400 non-null  float64
 12  day                            50400 non-null 

In [24]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Separate features and target
X_train = df.drop(columns=["output_electricity_generation"])
y_train = df["output_electricity_generation"]


# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = X_train.select_dtypes(include=["number"]).columns.tolist()

# One-hot encode the categorical columns and pass the numeric ones through
# unchanged. sparse_threshold=0 forces a dense ndarray: with the default
# threshold the result can be a scipy sparse matrix when the one-hot block
# dominates, and the DataFrame construction below would then fail.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numerical_cols),
    ],
    sparse_threshold=0,
)

# Fit the preprocessor on the training features and transform them
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Rebuild column names in the order the ColumnTransformer emits them:
# the one-hot columns of the "cat" transformer first, then the numeric columns.
cat_encoder = preprocessor.named_transformers_["cat"]
cat_features = cat_encoder.get_feature_names_out(categorical_cols)
all_features = list(cat_features) + numerical_cols

# Keep the original row index so the frame stays aligned with y_train.
X_train_preprocessed_df = pd.DataFrame(
    X_train_preprocessed, columns=all_features, index=X_train.index
)
X_train_preprocessed_df

Unnamed: 0,day_Friday,day_Saturday,hour,minute,C_motion,feed_water_motion,faucet_hole,vapour_pressure,vapour_enthalpy,vapour_pressure_at_division,vapour_motion,feed_water_enth,vapour_temperature
0,0.0,1.0,7.000000,29.232028,387.374315,2681.205431,0.678352,25.050893,2775.784134,26.465138,2682.758692,1289.082351,601.850322
1,0.0,1.0,12.000000,28.000000,238.153011,1604.213100,0.679706,15.241147,2903.619864,23.010401,1602.318002,1144.454102,603.933579
2,1.0,0.0,15.000000,13.000000,338.270228,2398.561685,0.662971,23.078376,2796.443584,24.316160,2310.108638,1258.098883,602.443281
3,0.0,1.0,14.000000,29.232028,254.294492,1620.034059,0.679977,15.054982,2818.023511,15.895513,1619.387667,1142.463833,588.938717
4,0.0,1.0,18.000000,16.000000,203.189410,1485.672389,0.686342,12.963420,2816.144355,13.682371,1487.685052,1099.539481,504.208816
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,0.0,1.0,3.000000,53.000000,358.326096,2466.188446,0.679246,23.079785,2795.566204,24.382871,2463.927286,1263.867952,592.742441
50396,0.0,1.0,18.000000,6.000000,205.657731,1520.898592,0.683778,13.324238,2833.890795,14.059122,2310.108638,1107.134960,506.487329
50397,1.0,0.0,11.886507,4.000000,356.572958,2432.402583,0.678961,22.805262,2784.953650,24.089748,2310.108638,1258.890934,592.742441
50398,1.0,0.0,17.000000,23.000000,344.533133,2447.181803,0.661900,23.493103,2803.209426,23.010401,2445.952298,1264.949843,603.189838


In [25]:
# Show the column labels of the preprocessed frame: the one-hot 'day' columns
# come first, followed by the numeric columns.
X_train_preprocessed_df.columns

Index(['day_Friday', 'day_Saturday', 'hour', 'minute', 'C_motion',
       'feed_water_motion', 'faucet_hole', 'vapour_pressure',
       'vapour_enthalpy', 'vapour_pressure_at_division', 'vapour_motion',
       'feed_water_enth', 'vapour_temperature'],
      dtype='object')

In [28]:
from autofeat import AutoFeatModel


# Initialize AutoFeatModel (customize parameters if needed).
# Only the numeric columns are offered for feature engineering; the one-hot
# 'day' indicator columns are left out. The list is taken from numerical_cols
# (computed when the preprocessor was built) rather than re-typed by hand, so it
# cannot drift from the columns actually present in X_train_preprocessed_df.
model = AutoFeatModel(
    feateng_steps=2,  # Number of feature engineering steps (default: 2)
    verbose=1,
    feateng_cols=list(numerical_cols),
)

# Fit on training data and generate new features.
# NOTE(review): features are engineered and selected against y_train on the full
# data set, before any train/validation split -- validation scores computed on a
# later split of X_train_new may therefore be optimistic.
X_train_new = model.fit_transform(X_train_preprocessed_df, y_train)



2025-03-16 10:33:13,377 INFO: [AutoFeat] The 2 step feature engineering process could generate up to 3003 features.
2025-03-16 10:33:13,377 INFO: [AutoFeat] With 50400 data points this new feature matrix would use about 0.61 gb of space.
2025-03-16 10:33:13,381 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/             11 features transformed

2025-03-16 10:33:14,635 INFO: [feateng] Generated 48 transformed features from 11 original features - done.
2025-03-16 10:33:14,645 INFO: [feateng] Step 2: first combination of features


[feateng]            1200/           1711 feature tuples combined

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


[feateng]            1500/           1711 feature tuples combined

2025-03-16 10:33:16,671 INFO: [feateng] Generated 1675 feature combinations from 1711 original feature tuples - done.


[feateng]            1700/           1711 feature tuples combined

2025-03-16 10:33:17,225 INFO: [feateng] Generated altogether 1734 new features in 2 steps
2025-03-16 10:33:17,226 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-03-16 10:33:17,731 INFO: [feateng] Generated a total of 549 additional features
2025-03-16 10:33:17,919 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.


2025-03-16 10:33:23,810 INFO: [featsel] Feature selection run 2/5
2025-03-16 10:33:29,069 INFO: [featsel] Feature selection run 3/5
2025-03-16 10:33:32,926 INFO: [featsel] Feature selection run 4/5
2025-03-16 10:33:36,150 INFO: [featsel] Feature selection run 5/5
2025-03-16 10:33:40,393 INFO: [featsel] 55 features after 5 feature selection runs
  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
2025-03-16 10:33:40,952 INFO: [featsel] 44 features after correlation filtering
2025-03-16 10:33:42,115 INFO: [featsel] 43 features after noise filtering
2025-03-16 10:33:42,118 INFO: [AutoFeat] Computing 39 new features.


[AutoFeat]    38/   39 new features

2025-03-16 10:33:46,751 INFO: [AutoFeat]    39/   39 new features ...done.
2025-03-16 10:33:46,762 INFO: [AutoFeat] Final dataframe with 52 feature columns (39 new).
2025-03-16 10:33:46,763 INFO: [AutoFeat] Training final regression model.
2025-03-16 10:33:46,939 INFO: [AutoFeat] Trained model: largest coefficients:
2025-03-16 10:33:46,939 INFO: 517.6524979939186
2025-03-16 10:33:46,940 INFO: -0.000435 * feed_water_enth**2/vapour_pressure
2025-03-16 10:33:46,940 INFO: 0.000240 * feed_water_enth*hour
2025-03-16 10:33:46,941 INFO: -0.000021 * vapour_temperature**3/vapour_pressure_at_division
2025-03-16 10:33:46,941 INFO: -0.000016 * vapour_temperature**3/vapour_pressure
2025-03-16 10:33:46,945 INFO: [AutoFeat] Final score: 0.9401


In [30]:
# Rebind X_train to a copy of the AutoFeat output (X_train_new), replacing the
# earlier pre-AutoFeat feature frame; working on a copy leaves X_train_new
# itself untouched by later changes to X_train.
X_train=X_train_new.copy()
# Display the frame for inspection.
X_train

Unnamed: 0,day_Friday,day_Saturday,hour,minute,C_motion,feed_water_motion,faucet_hole,vapour_pressure,vapour_enthalpy,vapour_pressure_at_division,...,vapour_pressure/vapour_motion,C_motion**3*feed_water_enth**3,feed_water_enth**2/vapour_motion,feed_water_enth**2/vapour_pressure,feed_water_enth**3*vapour_motion**2,sqrt(vapour_motion)/vapour_pressure,sqrt(feed_water_enth)/vapour_motion,sqrt(feed_water_enth)/vapour_temperature,C_motion**2*vapour_pressure_at_division**2,sqrt(vapour_pressure)/vapour_pressure_at_division
0,0.0,1.0,7.000000,29.232028,387.374315,2681.205431,0.678352,25.050893,2775.784134,26.465138,...,0.009338,1.245187e+17,619.412142,66334.294563,1.541719e+16,2.067605,0.013383,0.059656,1.051018e+08,0.189120
1,0.0,1.0,12.000000,28.000000,238.153011,1604.213100,0.679706,15.241147,2903.619864,23.010401,...,0.009512,2.024713e+16,817.425248,85936.786640,3.848510e+15,2.626375,0.021113,0.056016,3.003036e+07,0.169662
2,1.0,0.0,15.000000,13.000000,338.270228,2398.561685,0.662971,23.078376,2796.443584,24.316160,...,0.009990,7.707893e+16,685.168123,68584.235628,1.062696e+16,2.082624,0.015354,0.058876,6.765775e+07,0.197564
3,0.0,1.0,14.000000,29.232028,254.294492,1620.034059,0.679977,15.054982,2818.023511,15.895513,...,0.009297,2.452100e+16,805.998240,86697.122163,3.910471e+15,2.672977,0.020872,0.057392,1.633891e+07,0.244099
4,0.0,1.0,18.000000,16.000000,203.189410,1485.672389,0.686342,12.963420,2816.144355,13.682371,...,0.008714,1.115156e+16,812.663318,93261.425344,2.942080e+15,2.975335,0.022289,0.065765,7.729027e+06,0.263147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50395,0.0,1.0,3.000000,53.000000,358.326096,2466.188446,0.679246,23.079785,2795.566204,24.382871,...,0.009367,9.288389e+16,648.299246,69210.446276,1.225634e+16,2.150712,0.014429,0.059977,7.633550e+07,0.197029
50396,0.0,1.0,18.000000,6.000000,205.657731,1520.898592,0.683778,13.324238,2833.890795,14.059122,...,0.005768,1.180421e+16,530.601808,91993.840565,7.242133e+15,3.607230,0.014403,0.065695,8.360003e+06,0.259635
50397,1.0,0.0,11.886507,4.000000,356.572958,2432.402583,0.678961,22.805262,2784.953650,24.089748,...,0.009872,9.045020e+16,686.031106,69493.012232,1.064705e+16,2.107566,0.015359,0.059859,7.378385e+07,0.198237
50398,1.0,0.0,17.000000,23.000000,344.533133,2447.181803,0.661900,23.493103,2803.209426,23.010401,...,0.009605,8.277761e+16,654.182057,68109.271504,1.210921e+16,2.105153,0.014541,0.058963,6.285074e+07,0.210642


In [36]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Inputs: X_train_new is the AutoFeat-augmented feature frame (52 feature
# columns according to the AutoFeat log) and df["output_electricity_generation"]
# is the matching target; both are still unsplit at this point.

# Split data into train and validation sets.
# The split reads from the unsplit sources instead of from X_train / y_train
# themselves, so re-running this cell always yields the same 80/20 split rather
# than splitting an already-split training subset again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_new,
    df["output_electricity_generation"],
    test_size=0.2,
    random_state=42,
)

# Baseline model (using all features)
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
y_pred = baseline_model.predict(X_val)
baseline_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
baseline_r2 = r2_score(y_val, y_pred)

print(f"Baseline RMSE: {baseline_rmse:.4f}, R²: {baseline_r2:.4f}")

# Feature selection methods
def evaluate_feature_selection(method_name, X_train_selected, X_val_selected,
                               y_fit=None, y_eval=None):
    """Fit a linear regression on a reduced feature set and report its validation error.

    Parameters
    ----------
    method_name : str
        Label printed in front of the scores.
    X_train_selected : array-like
        Training features after feature selection.
    X_val_selected : array-like
        Validation features reduced with the same selector.
    y_fit : array-like, optional
        Training target. Defaults to the notebook-level ``y_train``.
    y_eval : array-like, optional
        Validation target. Defaults to the notebook-level ``y_val``.

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed on the validation data.
    """
    # Fall back to the notebook globals so existing three-argument calls behave
    # exactly as before; passing the targets explicitly avoids the hidden state.
    if y_fit is None:
        y_fit = y_train
    if y_eval is None:
        y_eval = y_val

    regressor = LinearRegression()
    regressor.fit(X_train_selected, y_fit)
    predictions = regressor.predict(X_val_selected)

    rmse = np.sqrt(mean_squared_error(y_eval, predictions))
    r2 = r2_score(y_eval, predictions)
    print(f"{method_name} - RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return rmse, r2

# 1. Variance Thresholding
variance_selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
X_train_var = variance_selector.fit_transform(X_train)
X_val_var = variance_selector.transform(X_val)
print(f"Variance Thresholding selected {X_train_var.shape[1]} features")
# Only evaluated when at least 10 features survive, to stay comparable with the
# 10-feature selectors below.
if X_train_var.shape[1] >= 10:
    evaluate_feature_selection("Variance Thresholding", X_train_var, X_val_var)

# 2. Pearson's Correlation
# f_regression scores each feature from its Pearson correlation with the target,
# so this selector picks exactly the same features as the "ANOVA F-test" one
# below; the two reported scores are expected to be identical.
pearson_selector = SelectKBest(score_func=f_regression, k=10)
X_train_pearson = pearson_selector.fit_transform(X_train, y_train)
X_val_pearson = pearson_selector.transform(X_val)
evaluate_feature_selection("Pearson's Correlation", X_train_pearson, X_val_pearson)

# 3. ANOVA F-test
anova_selector = SelectKBest(score_func=f_regression, k=10)
X_train_anova = anova_selector.fit_transform(X_train, y_train)
X_val_anova = anova_selector.transform(X_val)
evaluate_feature_selection("ANOVA F-test", X_train_anova, X_val_anova)

# 4. Mutual Information
def _seeded_mutual_info(X, y):
    """Mutual-information scores with a fixed seed so the selection is reproducible."""
    return mutual_info_regression(X, y, random_state=42)

mi_selector = SelectKBest(score_func=_seeded_mutual_info, k=10)
X_train_mi = mi_selector.fit_transform(X_train, y_train)
X_val_mi = mi_selector.transform(X_val)
evaluate_feature_selection("Mutual Information", X_train_mi, X_val_mi)

# 5. Feature Importance (Random Forest)
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
# argsort is ascending, so the last 10 positions are the most important features.
top_10_indices = np.argsort(importances)[-10:]
# X_train / X_val may be pandas DataFrames, which do not support
# [:, positions] indexing; convert to ndarrays before selecting by position.
X_train_rf = np.asarray(X_train)[:, top_10_indices]
X_val_rf = np.asarray(X_val)[:, top_10_indices]
evaluate_feature_selection("Feature Importance (RF)", X_train_rf, X_val_rf)

Baseline RMSE: 70.7760, R²: 0.8399
Variance Thresholding selected 35 features
Variance Thresholding - RMSE: 70.7760, R²: 0.8399
Pearson's Correlation - RMSE: 32.1076, R²: 0.9671
ANOVA F-test - RMSE: 32.1076, R²: 0.9671
Mutual Information - RMSE: 46.5727, R²: 0.9307
Feature Importance (RF) - RMSE: 71.0940, R²: 0.8385


(np.float64(71.0939902283128), 0.8384631755615306)