In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.linear_model import Lasso

In [126]:
# Load the training data
train_data = pd.read_csv('CS98XRegressionTrain.csv')

In [127]:
train_data.head()

Unnamed: 0,Id,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,1,My Happiness,Connie Francis,adult standards,1996,107,31,45,-8,13,28,150,75,3,44
1,2,Unchained Melody,The Teddy Bears,,2011,114,44,53,-8,13,47,139,49,3,37
2,3,How Deep Is Your Love,Bee Gees,adult standards,1979,105,36,63,-9,13,67,245,11,3,77
3,4,Woman in Love,Barbra Streisand,adult standards,1980,170,28,47,-16,13,33,232,25,3,67
4,5,Goodbye Yellow Brick Road - Remastered 2014,Elton John,glam rock,1973,121,47,56,-8,15,40,193,45,3,63


In [128]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         453 non-null    int64 
 1   title      453 non-null    object
 2   artist     453 non-null    object
 3   top genre  438 non-null    object
 4   year       453 non-null    int64 
 5   bpm        453 non-null    int64 
 6   nrgy       453 non-null    int64 
 7   dnce       453 non-null    int64 
 8   dB         453 non-null    int64 
 9   live       453 non-null    int64 
 10  val        453 non-null    int64 
 11  dur        453 non-null    int64 
 12  acous      453 non-null    int64 
 13  spch       453 non-null    int64 
 14  pop        453 non-null    int64 
dtypes: int64(12), object(3)
memory usage: 53.2+ KB


In [129]:
train_data["top genre"].value_counts()

top genre
adult standards       68
album rock            66
dance pop             61
brill building pop    16
glam rock             16
                      ..
bow pop                1
australian rock        1
boogaloo               1
british comedy         1
alternative rock       1
Name: count, Length: 86, dtype: int64

In [130]:
missing_rows = train_data[train_data.isna().any(axis=1)] #printing the missing values row
print(missing_rows)

      Id                         title                            artist  \
1      2              Unchained Melody                   The Teddy Bears   
8      9           Someone Elses Roses                        Joan Regan   
26    27           Drinks On The House                Green Martini Keys   
72    73                   Pachuko Hop           Ike Carpenter Orchestra   
90    91     Little Things Means A Lot                   Jayne Mansfield   
98    99           The Lady Is A Tramp         Mel Torme & the Mel-Tones   
220  221     If I Give My Heart To You                   The Teddy Bears   
252  253  Happy Days And Lonely Nights  Suzi Miller And Johnson Brothers   
265  266              Stairway Of Love                        Terry Dene   
266  267                           You                         Ten Sharp   
314  315                 No Other Love          Bob Sharples & His Music   
330  331           I've Waited So Long                    Anthony Newley   
362  363    

In [131]:
train_data_cleaned = train_data.dropna() #removed missing values rows and created new csv file storing the remaining rows 
train_data_cleaned.to_csv("cleaned_file.csv", index=False)

In [132]:
# Load the cleaned training data
cleaned_data = pd.read_csv('cleaned_file.csv')


In [133]:
# Load the test data
test_data = pd.read_csv('CS98XRegressionTest.csv')

In [134]:
# Define features (X) and target (y) for training
X_train = cleaned_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']]
y_train = cleaned_data['pop']

In [135]:
# Define features for testing
X_test = test_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']]

In [136]:
# Feature Engineering: Add new interaction and polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(cleaned_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']])
X_test_poly = poly.transform(test_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']])


In [137]:
# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [138]:
# Define target (y) for training
y_train = cleaned_data['pop']

In [139]:
# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [140]:
# Train Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_split, y_train_split)

In [141]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}


In [142]:
# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")
    
    # Train the model
    model.fit(X_train_split, y_train_split)
    
    # Predict on the validation set
    y_val_pred = model.predict(X_val_split)
    
    # Evaluate the model
    mse = mean_squared_error(y_val_split, y_val_pred)
    r2 = r2_score(y_val_split, y_val_pred)
    
    print(f"{name} - Validation MSE: {mse:.4f}, R²: {r2:.4f}")
    print("-" * 50)

Training Linear Regression...
Linear Regression - Validation MSE: 93.1013, R²: 0.3721
--------------------------------------------------
Training Random Forest...


Random Forest - Validation MSE: 96.4691, R²: 0.3494
--------------------------------------------------
Training XGBoost...
XGBoost - Validation MSE: 108.2198, R²: 0.2702
--------------------------------------------------


In [143]:
# Choose the best model (Linear Regression)
best_model = models["Linear Regression"]

In [144]:
# Feature Selection using Recursive Feature Elimination (RFE)
print("Performing Feature Selection using RFE...")
rfe = RFE(estimator=best_model, n_features_to_select=15)  # Select top 15 features
X_train_selected = rfe.fit_transform(X_train_scaled, y_train)
X_test_selected = rfe.transform(X_test_scaled)


Performing Feature Selection using RFE...


In [145]:
# Train the best model on the selected features
best_model.fit(X_train_selected, y_train)

In [146]:
# Predict the popularity for the test data
y_pred = best_model.predict(X_test_selected)

In [147]:
# Create a new DataFrame with Id and predicted popularity
predictions = pd.DataFrame({'Id': test_data['Id'], 'pop': y_pred})


In [148]:
# Save the predictions to a new CSV file
predictions.to_csv('Linear_final_3.csv', index=False)
print("Linear predictions")

Linear predictions


In [149]:
# Choose the best model (Random Forest)
best_model = models["Random Forest"]

In [150]:
# Train the best model on the full training data
best_model.fit(X_train, y_train)

In [151]:
# Predict the popularity for the test data
y_pred = best_model.predict(X_test)


In [152]:
# Create a new DataFrame with Id and predicted popularity
predictions = pd.DataFrame({'Id': test_data['Id'], 'pop': y_pred})


In [153]:
# Save the predictions to a new CSV file
predictions.to_csv('Random_Forest.csv', index=False)

print("Random Forest")

Random Forest


In [154]:
# Choose the best model (XGBoost)
best_model = models["XGBoost"]


In [155]:
# Train the best model on the full training data
best_model.fit(X_train, y_train)


In [156]:
# Predict the popularity for the test data
y_pred = best_model.predict(X_test)

In [157]:
# Create a new DataFrame with Id and predicted popularity
predictions = pd.DataFrame({'Id': test_data['Id'], 'pop': y_pred})

In [158]:
# Save the predictions to a new CSV file
predictions.to_csv('XGBoost.csv', index=False)

print("XGBoost")


XGBoost
