In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# End-to-end script: fit a degree-2 polynomial Ridge regression on the
# training CSV, predict 'pop' for the test CSV, and write the predictions
# to disk.

# Input/output locations, resolved relative to the working directory.
train_file = "CS98XRegressionTrain.csv"  # Ensure the correct file path
test_file = "CS98XRegressionTest.csv"  # Ensure the correct file path
output_file = "test_predictions_final.csv"

# Define features and target variable
features = ['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']
target = 'pop'

# Load the training dataset. Only rows missing a value in a column the model
# actually uses are dropped; a gap in an unused column must not cost us an
# otherwise usable training row.
df = pd.read_csv(train_file).dropna(subset=features + [target])
X = df[features]
y = df[target]

# Standardize features (fitted on the training data only)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Expand to degree-2 polynomial terms (squares and pairwise interactions)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)

# Train Ridge Regression; the L2 penalty (alpha) limits overfitting on the
# expanded feature set
model = Ridge(alpha=1.0)
model.fit(X_poly, y)

# Load the test dataset
df_test = pd.read_csv(test_file)

# Prepare test data features, reusing the transformers fitted on the training
# data so both sets go through an identical transformation
X_test = df_test[features]
X_test_scaled = scaler.transform(X_test)
X_test_poly = poly.transform(X_test_scaled)

# Predict on test data
df_test[target] = model.predict(X_test_poly)

# Retain only the 'Id' and 'pop' columns
df_test = df_test[['Id', target]]

# Save predictions
df_test.to_csv(output_file, index=False)

# Report the path that was actually written, so the message cannot drift from
# the real output file name.
print(f"Predictions saved to '{output_file}'!")


Predictions saved to 'test_predictions.csv'!
