In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Read the competition's train and test tables from the Kaggle input directory.
print("Loading data...")
train_df = pd.read_csv('/kaggle/input/playground-series-s6e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s6e1/test.csv')

# Quick look at what was loaded.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("\nTrain columns:", list(train_df.columns))
print("\nFirst few rows:")
print(train_df.head())

# Split the training table into features (X) and the target (y).
# When the target column is absent, X is the whole table and y is None.
if 'exam_score' in train_df.columns:
    X = train_df.drop(columns=['exam_score'])
    y = train_df['exam_score']
else:
    X = train_df
    y = None

# Remember the test identifiers for the submission file; fall back to the
# row index when the table has no 'id' column.
if 'id' in test_df.columns:
    test_ids = test_df['id']
else:
    test_ids = test_df.index

# Feature Engineering Function
def engineer_features(df):
    """Return a model-ready copy of ``df`` with categoricals encoded and gaps filled.

    Object-dtype columns other than ``id`` have missing values replaced by the
    string ``'missing'``, are cast to ``str``, and are then replaced by integer
    codes numbered in sorted order of the column's distinct strings.

    Numeric columns other than ``id`` and ``exam_score`` have missing values
    replaced by that column's median.

    Both the codes and the medians are derived from ``df`` alone, so two
    separate calls only produce comparable codes when the two frames contain
    the same set of labels in each categorical column.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the same columns as ``df``.
    """
    df = df.copy()

    # Categorical columns -> integer codes.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col == 'id':
            continue
        labels = df[col].fillna('missing').astype(str)
        # sort=True numbers the labels in sorted order, so the code assigned
        # to a label does not depend on the order of the rows.
        codes, _ = pd.factorize(labels, sort=True)
        df[col] = codes

    # Numeric columns -> median imputation. Assign the result back instead of
    # calling fillna(inplace=True) on a column selection: the in-place form
    # acts on a temporary object and does not update the frame when pandas
    # Copy-on-Write is active.
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        if col in ('id', 'exam_score'):
            continue
        df[col] = df[col].fillna(df[col].median())

    return df

# Apply feature engineering
print("\nEngineering features...")

# Only columns present in both tables can be used as model inputs; the order
# follows the training frame.
shared_columns = X.columns.intersection(test_df.columns)

# Encode train and test in ONE call. engineer_features derives its integer
# codes from the frame it is given, so encoding the two tables separately can
# map the same category to different codes whenever their label sets differ.
# NOTE: as a consequence the imputation medians are computed over train and
# test rows together.
n_train_rows = len(X)
combined = pd.concat(
    [X[shared_columns], test_df[shared_columns]],
    ignore_index=True,
)
combined_processed = engineer_features(combined)

# Split back into the two tables and restore their original row labels.
X_processed = combined_processed.iloc[:n_train_rows].copy()
X_processed.index = X.index
test_processed = combined_processed.iloc[n_train_rows:].copy()
test_processed.index = test_df.index

# Remove ID column if present: it identifies a row and is not a feature.
if 'id' in X_processed.columns:
    X_processed = X_processed.drop('id', axis=1)
if 'id' in test_processed.columns:
    test_processed = test_processed.drop('id', axis=1)

# Align train and test features (same columns, same order).
common_features = X_processed.columns.intersection(test_processed.columns)
X_processed = X_processed[common_features]
test_processed = test_processed[common_features]

print(f"\nFeatures used: {len(common_features)}")
print(f"Feature names: {list(common_features)[:10]}...")

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the scaler on the training split only, then apply that same transform
# to the training, validation and test feature matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(frame) for frame in (X_train, X_val, test_processed)
)

# Fit three regressors on the training split, score each on the hold-out
# rows, then score a fixed-weight blend of their predictions.
print("\nTraining models...")


def _validation_rmse(predictions):
    """Return the root-mean-squared error of ``predictions`` against ``y_val``."""
    return np.sqrt(mean_squared_error(y_val, predictions))


# Model 1: Random Forest (fitted on unscaled features)
print("Training Random Forest...")
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)
rf_pred_val = rf_model.predict(X_val)
rf_rmse = _validation_rmse(rf_pred_val)
print(f"Random Forest Validation RMSE: {rf_rmse:.4f}")

# Model 2: Gradient Boosting (fitted on unscaled features)
print("Training Gradient Boosting...")
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_samples_split': 5,
    'random_state': 42,
}
gb_model = GradientBoostingRegressor(**gb_params)
gb_model.fit(X_train, y_train)
gb_pred_val = gb_model.predict(X_val)
gb_rmse = _validation_rmse(gb_pred_val)
print(f"Gradient Boosting Validation RMSE: {gb_rmse:.4f}")

# Model 3: Ridge Regression (fitted on the standardised features)
print("Training Ridge Regression...")
ridge_model = Ridge(alpha=10.0, random_state=42)
ridge_model.fit(X_train_scaled, y_train)
ridge_pred_val = ridge_model.predict(X_val_scaled)
ridge_rmse = _validation_rmse(ridge_pred_val)
print(f"Ridge Regression Validation RMSE: {ridge_rmse:.4f}")

# Blend: 40% random forest, 40% gradient boosting, 20% ridge
ensemble_pred_val = (0.4 * rf_pred_val + 0.4 * gb_pred_val + 0.2 * ridge_pred_val)
ensemble_rmse = _validation_rmse(ensemble_pred_val)
print(f"\nEnsemble Validation RMSE: {ensemble_rmse:.4f}")

# 5-fold shuffled cross-validation of the random forest on all training rows.
print("\nPerforming cross-validation...")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    rf_model,
    X_processed,
    y,
    scoring='neg_mean_squared_error',
    cv=kfold,
)
# The scorer returns negated MSE per fold. The headline figure is the square
# root of the mean fold MSE; the spread is the standard deviation of the
# per-fold RMSE values.
cv_rmse = np.sqrt(-cv_scores.mean())
fold_rmse_spread = np.sqrt(-cv_scores).std()
print(f"Cross-Validation RMSE: {cv_rmse:.4f} (+/- {fold_rmse_spread:.4f})")

# Train final models on full dataset
print("\nTraining final models on full dataset...")
rf_model.fit(X_processed, y)
gb_model.fit(X_processed, y)

# Ridge is trained on standardised inputs. The scaler is refitted on the full
# training data here, so the scaled test matrix must be rebuilt from this same
# fit. Reusing a test matrix scaled with an earlier fit would feed Ridge
# inputs on a different scale from the one it was trained on.
X_full_scaled = scaler.fit_transform(X_processed)
ridge_model.fit(X_full_scaled, y)
X_test_scaled = scaler.transform(test_processed)

# Make predictions on test set
print("Making predictions on test set...")
rf_pred_test = rf_model.predict(test_processed)
gb_pred_test = gb_model.predict(test_processed)
ridge_pred_test = ridge_model.predict(X_test_scaled)

# Ensemble predictions: 40% random forest, 40% gradient boosting, 20% ridge
final_predictions = (0.4 * rf_pred_test + 0.4 * gb_pred_test + 0.2 * ridge_pred_test)

# Ensure predictions are in reasonable range (adjust bounds as needed)
final_predictions = np.clip(final_predictions, 0, 100)

# Assemble the submission table (one predicted exam_score per test id) and
# write it to disk without the row index.
submission = pd.DataFrame({'id': test_ids, 'exam_score': final_predictions})
submission.to_csv('submission.csv', index=False)

print("\nSubmission file created: submission.csv")
print(f"Number of predictions: {len(submission)}")
print("\nSample predictions:")
print(submission.head(10))
print("\nPrediction statistics:")
print(submission['exam_score'].describe())

# Report the random forest's feature importances, but only when there are at
# most 20 features.
if len(common_features) <= 20:
    importance_table = pd.DataFrame({
        'feature': common_features,
        'importance': rf_model.feature_importances_,
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    print("\nTop feature importances:")
    print(feature_importance.head(10))

print("\n✓ Script completed successfully!")
print("Submit the 'submission.csv' file to Kaggle.")

Loading data...
Train shape: (630000, 13)
Test shape: (270000, 12)

Train columns: ['id', 'age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty', 'exam_score']

First few rows:
   id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2     