In [2]:
import pandas as pd

# Load the train and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 📊 Check data: shapes of both tables on one line, then the training columns.
print(*(frame.shape for frame in (train, test)))
print(train.columns)

(20000, 17) (12000, 16)
Index(['id', 'temperature', 'irradiance', 'humidity', 'panel_age',
       'maintenance_count', 'soiling_ratio', 'voltage', 'current',
       'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure',
       'string_id', 'error_code', 'installation_type', 'efficiency'],
      dtype='object')


In [3]:
# Preview the first five training rows (rendered as the cell's output).
train.head(n=5)


Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type,efficiency
0,0,7.817315,576.17927,41.24308670850264,32.135501,4.0,0.803199,37.403527,1.963787,13.691147,62.494044,12.82491203459621,1018.8665053152532,A1,,,0.562096
1,1,24.785727,240.003973,1.3596482765960705,19.97746,8.0,0.479456,21.843315,0.241473,27.545096,43.851238,12.012043660984917,1025.6238537572883,D4,E00,dual-axis,0.396447
2,2,46.652695,687.612799,91.26536837560256,1.496401,4.0,0.822398,48.222882,4.1918,43.363708,,1.814399755560454,1010.9226539809572,C3,E00,,0.573776
3,3,53.339567,735.141179,96.1909552117616,18.491582,3.0,0.837529,46.295748,0.960567,57.720436,67.361473,8.736258932034128,1021.8466633134252,A1,,dual-axis,0.629009
4,4,5.575374,12.241203,27.495073003585222,30.722697,6.0,0.551833,0.0,0.898062,6.786263,3.632,0.52268384077164,1008.5559577591928,B2,E00,fixed,0.341874


In [4]:
# 🧹 Preprocessing - Handle categorical variables
from sklearn.preprocessing import LabelEncoder

cat_cols = ['string_id', 'error_code', 'installation_type']
for col in cat_cols:
    # Fit ONE encoder on the values of both tables, then transform each table
    # with it. Fitting a second encoder on `test` would let the same category
    # receive different integer codes in train and test whenever the two
    # tables do not contain exactly the same set of values.
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])


In [5]:
from sklearn.model_selection import KFold

def target_encode(train, test, column, target):
    """Mean-target-encode `column` using 5-fold out-of-fold statistics.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing both `column` and `target`.
    test : pandas.DataFrame
        Frame containing `column`.
    column : str
        Name of the column to encode.
    target : str
        Name of the target column in `train`.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The encoding for `train` (indexed like `train`) and for `test`
        (indexed like `test`).

    Each training row is encoded with per-category target means computed on
    the other four folds. Test rows are encoded with per-category means from
    the full training frame.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    global_mean = train[target].mean()
    train_encoded = pd.Series(index=train.index, dtype=float)

    for train_idx, val_idx in kf.split(train):
        fold_train, fold_val = train.iloc[train_idx], train.iloc[val_idx]
        means = fold_train.groupby(column)[target].mean()
        # Fall back to the mean of the fitting folds only, so the held-out
        # rows' own targets never contribute to their encoding.
        fold_fallback = fold_train[target].mean()
        fold_encoded = fold_val[column].map(means).fillna(fold_fallback)
        # Assign raw values so the write is purely positional and does not
        # depend on the index labels of `train`.
        train_encoded.iloc[val_idx] = fold_encoded.to_numpy()

    # Test rows use statistics from the whole training frame; categories not
    # present in train fall back to the overall training mean.
    category_means = train.groupby(column)[target].mean()
    test_encoded = test[column].map(category_means).fillna(global_mean)

    return train_encoded, test_encoded

# Add a '<column>_te' target-encoded feature to both tables for each
# categorical column, using 'efficiency' as the target.
for col in ['string_id', 'error_code', 'installation_type']:
    encoded_train, encoded_test = target_encode(train, test, col, 'efficiency')
    train[f'{col}_te'] = encoded_train
    test[f'{col}_te'] = encoded_test


In [6]:
# ✅ Force the measurement columns to numeric dtype in both tables; entries
# that cannot be parsed become NaN (errors='coerce').
numeric_cols = [
    'temperature', 'irradiance', 'humidity', 'panel_age', 'maintenance_count',
    'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage',
    'wind_speed', 'pressure'
]

for frame in (train, test):
    for col in numeric_cols:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')


In [7]:
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer

def impute_data(df, imputer=None, fit=True):
    """Fill missing values in `df` with an IterativeImputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; an 'efficiency' column, if
        present, is dropped before imputation and is absent from the result.
    imputer : IterativeImputer, optional
        Imputer to use. When omitted, a new
        ``IterativeImputer(max_iter=10, random_state=42)`` is created and
        fitted on `df` (the original single-argument behaviour).
    fit : bool, default True
        When True the imputer is fitted on `df` before transforming. Pass
        False together with an already fitted `imputer` to apply statistics
        learned on another frame.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column names as the input (minus
        'efficiency') and a fresh default integer index.
    """
    df = df.copy()
    # Drop the target column so it is never used as a predictor for imputation.
    if 'efficiency' in df.columns:
        df = df.drop(columns=['efficiency'])

    if imputer is None:
        imputer = IterativeImputer(max_iter=10, random_state=42)
        fit = True  # a freshly created imputer has nothing to transform with

    values = imputer.fit_transform(df) if fit else imputer.transform(df)
    return pd.DataFrame(values, columns=df.columns)

# Learn the imputation model on the training features only and reuse it for
# the test features, so both tables are filled with the same fitted model
# instead of test being imputed by a second model fitted on test itself.
feature_imputer = IterativeImputer(max_iter=10, random_state=42)
train = impute_data(train, imputer=feature_imputer)
test = impute_data(test, imputer=feature_imputer, fit=False)

In [8]:
# Only the last expression of a cell is rendered, so a bare `train.columns`
# on its own line shows nothing. Print it explicitly, then let the test
# columns render as the cell's result.
print(train.columns)
test.columns

Index(['id', 'temperature', 'irradiance', 'humidity', 'panel_age',
       'maintenance_count', 'soiling_ratio', 'voltage', 'current',
       'module_temperature', 'cloud_coverage', 'wind_speed', 'pressure',
       'string_id', 'error_code', 'installation_type', 'string_id_te',
       'error_code_te', 'installation_type_te'],
      dtype='object')

In [9]:
import numpy as np
def add_engineered_features(df):
    """Return a copy of `df` with the engineered feature columns appended.

    The same definitions are applied to the train and test tables, so the two
    can no longer drift apart through copy-paste edits. Columns are appended
    in a fixed order. Any +/-inf produced by a zero denominator or by
    ``log1p(-1)`` is converted to NaN before returning.
    """
    df = df.copy()

    # Polynomial terms.
    df['irradiance_squared'] = df['irradiance'] ** 2
    df['temperature_squared'] = df['temperature'] ** 2

    # Products and ratios of the raw measurements.
    df['Power'] = df['voltage'] * df['current']
    df['efficiency_ratio'] = df['Power'] / df['irradiance']
    df['weather_impact'] = df['cloud_coverage'] * df['wind_speed'] / df['pressure']
    df['maintenance_effectiveness'] = df['maintenance_count'] / (df['panel_age'] * df['soiling_ratio'])
    df['error_severity'] = df['error_code'] * df['string_id']

    df['energy_efficiency'] = df['Power'] / (df['irradiance'] * df['panel_age'])
    df['maintenance_score'] = df['maintenance_count'] / (df['panel_age'] + 1)
    df['error_impact'] = df['error_code'] * df['string_id'] / (df['panel_age'] + 1)

    #### remove if acc falls
    # log1p is NaN for inputs below -1 (numpy emits a RuntimeWarning) and
    # -inf at exactly -1; the NaNs are kept as missing values.
    df['log_irradiance'] = np.log1p(df['irradiance'])
    df['log_temperature'] = np.log1p(df['temperature'])
    df['log_power'] = np.log1p(df['Power'])

    df['output_efficiency'] = df['efficiency_ratio'] * df['weather_impact']
    df['grid_stability'] = df['voltage'] * df['current'] / (df['panel_age'] + 1)

    df['panel_health_index'] = df['Power'] / (df['panel_age'] * df['soiling_ratio'])
    df['error_correction_factor'] = df['error_code'] / (df['panel_age'] + 1)

    # Infinite values would be rejected by the scaler later in the notebook;
    # treat them as missing instead.
    return df.replace([np.inf, -np.inf], np.nan)


train = add_engineered_features(train)
test = add_engineered_features(test)




  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
# Remove the identifier column from the feature tables.
# errors='ignore' keeps the cell re-runnable: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [12]:
# 🎯 Target & Features
# The target is re-read from the raw training file because impute_data()
# removed 'efficiency' from `train`; X is the engineered training table and
# X_test the engineered test table.
new_df = pd.read_csv('train.csv')
eff = new_df['efficiency']
X, y, X_test = train, eff, test


In [13]:
# List every test-set column that holds at least one +/-inf value.
has_inf = np.isinf(X_test).any()
inf_cols_test = X_test.columns[has_inf].tolist()

print("Columns with infinity values in test:")
if inf_cols_test:
    print("\n".join(f"- {col}" for col in inf_cols_test))


Columns with infinity values in test:


In [14]:
# ⚖️ Feature Scaling (optional with LGBM but improves stability)
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training features, then apply the same
# fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)


In [15]:
# 🔀 Train-validation split
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled training rows for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)


In [17]:
# Combine results to get better predictions: load two prediction files that
# the blending cells further down read the 'efficiency' column from.
# NOTE(review): neither file is written by any cell visible in this notebook,
# and the recorded output of this cell is a FileNotFoundError for the first
# one — both files must be present in the working directory before running.
tusk_df = pd.read_csv('FINAL_STACK_test_predictions_stacked.csv')
aayu_df = pd.read_csv('submission_enriched_stackedv2.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'FINAL_STACK_test_predictions_stacked.csv'

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb

# 1. Prepare Data: seeded 80/20 split of the (unscaled) feature table.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Train a Base XGBoost to Get Feature Importance
base_model = xgb.XGBRegressor(n_estimators=200, max_depth=4, learning_rate=0.05,
                              subsample=0.8, colsample_bytree=0.8,
                              reg_alpha=1.0, reg_lambda=1.0, n_jobs=-1, random_state=42)
base_model.fit(X_train, y_train)

# 3. Select the N_TOP_FEATURES most important features.
# The variable names still say "top10" because later cells reference them,
# but the number actually kept is N_TOP_FEATURES (13), not 10.
N_TOP_FEATURES = 13
importances = base_model.feature_importances_
# argsort is ascending, so the tail holds the most important features
# (least important of the selection first).
top10_idx = np.argsort(importances)[-N_TOP_FEATURES:]
top10_features = X.columns[top10_idx]

X_train_sel = X_train[top10_features]
X_val_sel = X_val[top10_features]

# 4. Train Final Model on the selected features only
xgb_final_model = xgb.XGBRegressor(n_estimators=300, max_depth=3, learning_rate=0.03,
                                subsample=0.7, colsample_bytree=0.7,
                                reg_alpha=0.5, reg_lambda=0.5, n_jobs=-1, random_state=42)
xgb_final_model.fit(X_train_sel, y_train)

# 5. Evaluate on the held-out validation rows
preds = xgb_final_model.predict(X_val_sel)
rmse = np.sqrt(mean_squared_error(y_val, preds))
r2 = r2_score(y_val, preds)

print("✅ Final RMSE:", round(rmse, 4))
print("✅ R² Score:", round(r2, 4))
# Report the real number of selected features instead of a fixed "Top 10".
print(f"📌 Top {len(top10_features)} Features Used:", list(top10_features))


✅ Final RMSE: 0.1056
✅ R² Score: 0.4451
📌 Top 10 Features Used: ['error_severity', 'temperature', 'module_temperature', 'temperature_squared', 'humidity', 'panel_health_index', 'maintenance_score', 'grid_stability', 'panel_age', 'soiling_ratio', 'log_irradiance', 'irradiance', 'irradiance_squared']


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid (4*4*4*3*3*3*3 = 5184 combinations).
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.5, 1.0],
    'reg_lambda': [0, 0.5, 1.0]
}

# Initialize XGBoost model.
# n_jobs=1 on the estimator: the grid search below already runs candidates in
# parallel (n_jobs=-1). Letting every fit also spawn one thread per core
# creates thread contention and slows the whole search down.
xgb_model = xgb.XGBRegressor(n_jobs=1, random_state=42)

# Grid Search: 5-fold CV over every combination, scored by (negated) RMSE.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           cv=5, scoring='neg_root_mean_squared_error',
                           verbose=1, n_jobs=-1)

# Fit grid search on the selected training features.
grid_search.fit(X_train_sel, y_train)

# Best parameters; the score is negated back into a positive RMSE.
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.7}
Best RMSE: 0.10167348038030266


In [None]:

# 'id' was dropped from `test`, so re-read the raw test file to recover it.
test_id = pd.read_csv('test.csv')

# Predict with the final XGBoost model on the selected feature columns.
xgb_final_preds = xgb_final_model.predict(test[top10_features])

# ✅ Build and write the submission: one row per test id with its prediction.
xgb_submission = test_id[['id']].assign(efficiency=xgb_final_preds)
xgb_submission.to_csv('xgb_submission.csv', index=False)
print("✅ XGBoost submission file created: xgb_submission.csv")


✅ XGBoost submission file created: xgb_submission.csv


In [None]:
# Weighted blend of the predictions held in `tusk_df` (40%) with this
# notebook's XGBoost predictions (60%).
stack_weight, xgb_weight = 0.4, 0.6
combined_preds = tusk_df['efficiency'] * stack_weight + xgb_final_preds * xgb_weight

# Write the blended predictions as a submission file.
combined_submission = pd.DataFrame(
    {'id': test_id['id'], 'efficiency': combined_preds}
)
combined_submission.to_csv('combined_submission_lgb_xgb_auto.csv', index=False)

#### Best-performing model so far: 89.90413

In [None]:
def _write_submission(predictions, csv_name):
    """Pair `predictions` with the test ids, save them to `csv_name`, return the frame."""
    submission = pd.DataFrame({'id': test_id['id'], 'efficiency': predictions})
    submission.to_csv(csv_name, index=False)
    return submission


# Blend 1: predictions from 'submission_enriched_stackedv2.csv' (40%) with
# this notebook's XGBoost predictions (60%).
aayu_df = pd.read_csv('submission_enriched_stackedv2.csv')
combined_aayu_xgb = aayu_df['efficiency'] * 0.4 + xgb_final_preds * 0.6
aayu_combined_submission = _write_submission(
    combined_aayu_xgb, 'aayu_combined_submission_xgb.csv'
)

# Blend 2: predictions from 'FINAL_STACK_test_predictions_stacked.csv' (25%),
# blend 1 (25%) and the XGBoost predictions (50%).
# NOTE(review): the middle term is the already blended
# `aayu_combined_submission`, not the raw `aayu_df` predictions, so XGBoost
# is effectively weighted above 50% here — confirm this is intended.
tusk_df = pd.read_csv('FINAL_STACK_test_predictions_stacked.csv')
combined_tusk_aayu_xgb = (
    tusk_df['efficiency'] * 0.25
    + aayu_combined_submission['efficiency'] * 0.25
    + xgb_final_preds * 0.5
)
final_combined_submission = _write_submission(
    combined_tusk_aayu_xgb, 'final_combined_submission_tusk_aayu_xgb.csv'
)
print("✅ Final combined submission file created: final_combined_submission_tusk_aayu_xgb.csv")

✅ Final combined submission file created: final_combined_submission_tusk_aayu_xgb.csv
