In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

In [2]:
# Read the labelled training set and the unlabelled test set from the
# working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [3]:
# Target column for the regression task.
y = train['efficiency']
# Training features: everything except the target and the row identifier.
X = train.drop(columns=['efficiency', 'id'])

# Keep the test identifiers separately; the test features drop the id column.
test_ids = test['id']
X_test_final = test.drop(columns='id')

In [4]:
# Columns that are forced to a numeric dtype in both frames.
cols_to_convert = ['humidity', 'wind_speed', 'pressure']

# errors='coerce' turns values that cannot be parsed as numbers into NaN;
# those gaps are left for the imputation step that follows.
for frame in (X, X_test_final):
    for col in cols_to_convert:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

In [6]:
# Imputation
# Column groups are taken from the training frame's dtypes and reused for the
# test frame, so both frames are imputed over the same column sets.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Both imputers learn their statistics from the training frame only:
# numeric gaps are filled with the training mean, categorical gaps with the
# constant label 'Unknown'.
num_imputer = SimpleImputer(strategy='mean').fit(X[num_cols])
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown').fit(X[cat_cols])

# Apply the fitted imputers to the training and the test frame alike.
for frame in (X, X_test_final):
    frame[num_cols] = num_imputer.transform(frame[num_cols])
    frame[cat_cols] = cat_imputer.transform(frame[cat_cols])

In [7]:
# Label encode categorical features
# Each encoder is fitted on the training column only, so the integer codes of
# the training data are exactly what LabelEncoder.fit_transform produces.
# Test values are mapped through the learned classes instead of calling
# le.transform, because le.transform raises ValueError for any label that did
# not occur in training (for example the imputer's 'Unknown' fill value when
# only the test set had missing entries). Such unseen labels get the sentinel
# code -1; labels seen in training receive the same code as before.
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

    # classes_ is sorted; the position of a label is its integer code.
    label_to_code = {label: code for code, label in enumerate(le.classes_)}
    X_test_final[col] = (
        X_test_final[col]
        .map(label_to_code)   # unseen labels become NaN here
        .fillna(-1)           # ...and are replaced by the sentinel code
        .astype('int64')      # restore an integer dtype after the NaN detour
    )

In [8]:
# Feature Engineering
# The same derived columns are added, in the same order, to the training and
# the test frame so that both keep an identical feature layout.
for frame in (X, X_test_final):
    # Difference between module temperature and the 'temperature' column.
    frame['temp_diff'] = frame['module_temperature'] - frame['temperature']

    # Irradiance scaled by humidity; the +1 in the denominator presumably
    # guards against division by zero when humidity is 0.
    frame['irradiance_per_humidity'] = frame['irradiance'] / (frame['humidity'] + 1)

    # Quadratic term of the panel age.
    frame['age_squared'] = frame['panel_age'] ** 2

    # Wind speed relative to pressure, with the same +1 offset as above.
    frame['wind_pressure_ratio'] = frame['wind_speed'] / (frame['pressure'] + 1)

    # Interaction between cloud coverage and irradiance.
    frame['cloud_irradiance'] = frame['cloud_coverage'] * frame['irradiance']


In [9]:
# Split dataset
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)