In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Read the train/test splits and the submission template (paths are relative
# to the current working directory)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Name of the target column, and every training column other than the row id
# and the target itself
target = 'efficiency'
features = [col for col in train_df.columns if col not in ('id', target)]

# Data cleaning
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Coerce the twelve numerical columns listed below to a numeric dtype.
         Values that cannot be parsed become NaN, and the distinct offending
         raw values are printed per column.
      2. Cap ``cloud_coverage`` at 100 (no lower bound is applied).
      3. Clip ``soiling_ratio`` into the range [0, 1].
      4. Replace empty-string and missing entries in ``error_code`` and
         ``installation_type`` with ``'Unknown'``.

    Args:
        df: DataFrame containing every column referenced below.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        KeyError: if one of the referenced columns is absent from ``df``.
    """
    df = df.copy()
    # Define numerical columns explicitly
    numerical_cols = ['temperature', 'irradiance', 'humidity', 'panel_age', 'maintenance_count',
                     'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage',
                     'wind_speed', 'pressure']

    # Convert all numerical columns to numeric, handling non-numeric values
    for col in numerical_cols:
        raw = df[col]
        coerced = pd.to_numeric(raw, errors='coerce')
        # The report has to be built before the column is overwritten: once
        # coercion has run, the bad values are NaN and can no longer be seen.
        # A value counts as non-numeric if it was present but failed to parse.
        non_numeric = raw[coerced.isna() & raw.notna()].unique()
        if len(non_numeric) > 0:
            print(f"Non-numeric values in {col}: {non_numeric}")
        df[col] = coerced

    # Cap cloud_coverage at 100
    df['cloud_coverage'] = df['cloud_coverage'].clip(upper=100)

    # Ensure soiling_ratio is between 0 and 1
    df['soiling_ratio'] = df['soiling_ratio'].clip(0, 1)

    # Handle missing categorical values
    for col in ['error_code', 'installation_type']:
        df[col] = df[col].replace('', 'Unknown').fillna('Unknown')

    return df

# Feature engineering
def engineer_features(df):
    """Return a copy of ``df`` with six derived columns appended.

    Added columns, in this order:
      * ``power_output``: ``voltage * current``
      * ``temp_diff``: ``module_temperature - temperature``
      * ``degradation_rate``: ``panel_age / (maintenance_count + 1)``
      * ``has_error``: 0 where ``error_code`` equals ``'E00'``, otherwise 1
      * ``soiling_impact``: ``1 - soiling_ratio``
      * ``effective_irradiance``: ``irradiance * (1 - cloud_coverage / 100)``

    Every derived column except ``has_error`` is additionally passed through
    ``pd.to_numeric(..., errors='coerce')``. The input frame is left untouched.
    """
    out = df.copy()

    derived = {
        'power_output': out['voltage'] * out['current'],
        'temp_diff': out['module_temperature'] - out['temperature'],
        # The +1 keeps the denominator non-zero when maintenance_count is 0
        'degradation_rate': out['panel_age'] / (out['maintenance_count'] + 1),
        'has_error': out['error_code'].apply(lambda code: 1 if code != 'E00' else 0),
        'soiling_impact': 1 - out['soiling_ratio'],
        'effective_irradiance': out['irradiance'] * (1 - out['cloud_coverage'] / 100),
    }

    for name, values in derived.items():
        # has_error is already an integer flag; the rest are coerced to numeric
        if name != 'has_error':
            values = pd.to_numeric(values, errors='coerce')
        out[name] = values

    return out

# Clean both splits first, then derive the engineered features on each
train_df = clean_data(train_df)
test_df = clean_data(test_df)
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Model inputs: the twelve raw numerical columns followed by five engineered
# ones, plus the three categorical columns
numerical_cols = [
    'temperature', 'irradiance', 'humidity', 'panel_age', 'maintenance_count',
    'soiling_ratio', 'voltage', 'current', 'module_temperature', 'cloud_coverage',
    'wind_speed', 'pressure',
    'power_output', 'temp_diff', 'degradation_rate', 'soiling_impact',
    'effective_irradiance',
]
categorical_cols = ['string_id', 'error_code', 'installation_type']

# Debugging: report any training column whose dtype is neither int64 nor
# float64, together with a sample of its distinct values
for col in numerical_cols:
    if train_df[col].dtype in ('int64', 'float64'):
        continue
    print(f"Column {col} is not numeric: {train_df[col].dtype}")
    print(f"Unique values in {col}: {train_df[col].unique()[:10]}")

# Numeric branch: fill missing values with the column median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode, dropping the first level of each column;
# categories not seen during fit are ignored instead of raising
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by an XGBoost regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(objective='reg:squarederror', random_state=42)),
])

# Hold out 20% of the training rows for validation (fixed seed)
X, y = train_df[numerical_cols + categorical_cols], train_df[target]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the baseline pipeline on the 80% split
model.fit(X_train, y_train)

# Score the hold-out split; the reported score is 100 * (1 - RMSE)
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
score = 100 * (1 - rmse)
print(f'Validation Score: {score:.4f}')

# Hyperparameter tuning: 3 x 3 x 3 = 27 regressor settings, each evaluated
# with 5-fold cross-validation on the training split
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Re-score the hold-out split with the best pipeline found by the search
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_val)
rmse_best = np.sqrt(mean_squared_error(y_val, y_pred_best))
score_best = 100 * (1 - rmse_best)
print(f'Best Validation Score: {score_best:.4f}')
print(f'Best Parameters: {grid_search.best_params_}')

# Generate predictions on test set, using the same feature columns the model
# was trained on
test_features = test_df[numerical_cols + categorical_cols]
test_predictions = best_model.predict(test_features)

# Prepare submission file
submission = pd.DataFrame({
    'id': test_df['id'],
    'efficiency': test_predictions,
})

# Ensure submission matches required format. The expected row count comes from
# the loaded submission template rather than a hard-coded number, and the check
# is an explicit raise because `assert` statements are skipped under `python -O`.
expected_shape = (len(sample_submission), 2)
if submission.shape != expected_shape:
    raise ValueError(
        f"Submission shape {submission.shape} does not match required {expected_shape}"
    )
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Validation Score: 88.7587
