In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Dataset/universal_top_spotify_songs.csv")
df.head(n=5)

In [None]:

def forward_stepwise_selection(X, y):
    """Greedily add features that lower the hold-out MSE of a linear regression.

    Starting from an empty selection, each round fits one ``LinearRegression``
    per remaining candidate (current selection plus that candidate) and scores
    it by mean squared error on a fixed 80/20 hold-out split. The candidate
    with the lowest MSE is added only if it strictly improves on the best MSE
    accepted so far; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    list
        Column labels of the selected features, in the order they were added.
        Empty when ``X`` has no columns.
    """
    selected_features = []
    remaining_features = list(X.columns)
    if not remaining_features:
        return selected_features

    # The partition depends only on the row count and the fixed seed, so one
    # split serves every candidate model instead of re-splitting per candidate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    def holdout_mse(features):
        # Fit on the training rows and score on the held-out rows.
        model = LinearRegression()
        model.fit(X_train[features], y_train)
        return mean_squared_error(y_test, model.predict(X_test[features]))

    best_score = float('inf')  # no model accepted yet
    while remaining_features:
        scores = [
            (holdout_mse(selected_features + [feature]), feature)
            for feature in remaining_features
        ]
        candidate_score, candidate_feature = min(scores, key=lambda pair: pair[0])

        # Compare the round's best candidate against the running best. The
        # running best must not be overwritten before this check, otherwise
        # the comparison is a value against itself and can never succeed.
        if not candidate_score < best_score:
            break

        best_score = candidate_score
        selected_features.append(candidate_feature)
        remaining_features.remove(candidate_feature)

    return selected_features

In [None]:
def backward_stepwise_selection(X, y):
    """Greedily drop features while doing so lowers the hold-out MSE.

    Starting from all columns of ``X``, each round fits one
    ``LinearRegression`` per remaining feature with that feature left out and
    scores it by mean squared error on a fixed 80/20 hold-out split. The
    removal giving the lowest MSE is applied only if it strictly improves on
    the best MSE accepted so far (initially the full model's MSE); otherwise
    the search stops. At least one feature is always kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    list
        Column labels of the features that were kept.
    """
    remaining_features = list(X.columns)
    if len(remaining_features) <= 1:
        # Nothing can be removed; no model needs to be fitted.
        return remaining_features

    # The partition depends only on the row count and the fixed seed, so one
    # split serves every candidate model instead of re-splitting per candidate.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    def holdout_mse(features):
        # Fit on the training rows and score on the held-out rows.
        model = LinearRegression()
        model.fit(X_train[features], y_train)
        return mean_squared_error(y_test, model.predict(X_test[features]))

    # Baseline is the model the search starts from: all features included.
    best_score = holdout_mse(remaining_features)

    while len(remaining_features) > 1:
        scores = [
            (
                holdout_mse([kept for kept in remaining_features if kept != feature]),
                feature,
            )
            for feature in remaining_features
        ]
        candidate_score, worst_feature = min(scores, key=lambda pair: pair[0])

        # Compare the round's best removal against the running best. The
        # running best must not be overwritten before this check, otherwise
        # the comparison is a value against itself and can never succeed.
        if not candidate_score < best_score:
            break

        best_score = candidate_score
        remaining_features.remove(worst_feature)

    return remaining_features

In [None]:

# Build the design matrix from the float64/int64 columns, leaving the target
# 'daily_rank' out of the features.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
y = df['daily_rank']
X = df.loc[:, numeric_columns].drop(columns=['daily_rank'])

# Forward search: report the features it selects.
selected_features_forward = forward_stepwise_selection(X, y)
print("Selected features (Forward Stepwise):", selected_features_forward)

# Backward search on the same data: report the features it keeps.
selected_features_backward = backward_stepwise_selection(X, y)
print("Selected features (Backward Stepwise):", selected_features_backward)
