In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the raw interaction log from the working directory and preview ten rows.
df = pd.read_csv(filepath_or_buffer="personalized_music_recommendation_dataset.csv")
df.head(n=10)

In [None]:
# Column dtypes, non-null counts and memory usage of the freshly loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quantiles) of the numeric columns.
df.describe()

In [None]:
# One histogram (15 bins) per float64/int64 column, laid out on a 15x15 inch canvas.
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
df.loc[:, numeric_features].hist(figsize=(15, 15), bins=15)
plt.show()

In [None]:
# Pairwise correlation of all float/int columns, annotated to two decimals.
numeric_df = df.select_dtypes(include=[float, int])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(20, 15))
heatmap_ax = sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Horizontal bar chart of context types, most frequent first.
context_order = df['context_type'].value_counts().index
plt.figure(figsize=(10, 2))
context_ax = sns.countplot(data=df, y='context_type', order=context_order)
context_ax.set_title('Context Distribution')
plt.show()

In [None]:
# Horizontal bar chart of genres, most frequent first.
genre_order = df['genre'].value_counts().index
plt.figure(figsize=(10, 2))
genre_ax = sns.countplot(data=df, y='genre', order=genre_order)
genre_ax.set_title('Genre Distribution')
plt.show()

In [None]:
# Horizontal bar chart of artists, most frequent first.
artist_order = df['artist'].value_counts().index
plt.figure(figsize=(10, 2))
artist_ax = sns.countplot(data=df, y='artist', order=artist_order)
artist_ax.set_title('Artist Distribution')
plt.show()

In [None]:
# Horizontal bar chart of subscription types, most frequent first.
subscription_order = df['subscription_type'].value_counts().index
plt.figure(figsize=(10, 1))
subscription_ax = sns.countplot(data=df, y='subscription_type', order=subscription_order)
subscription_ax.set_title('subscription_type Distribution')
plt.show()

In [None]:
# Show the raw timestamp column as loaded from the CSV.
df["timestamp"]

In [None]:
# Parse into a separate column so the raw `timestamp` values stay untouched;
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
df['timestamp_parsed'] = pd.to_datetime(df['timestamp'], errors='coerce')  # parsed copy

In [None]:
# Calendar/time-of-day features read off the parsed timestamp.
stamp = df['timestamp_parsed'].dt
df['hour_of_day'] = stamp.hour + stamp.minute / 60.0 + stamp.second / 3600.0  # fractional hour
df['day_of_week'] = stamp.weekday  # 0=Mon .. 6=Sun
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['month'] = stamp.month
df['minutes_since_midnight'] = stamp.hour * 60 + stamp.minute


In [None]:
# Re-inspect dtypes and non-null counts now that the time-derived columns exist.
df.info()

In [None]:
# Cyclic encoding of the time of day: map the 24h clock onto the unit circle so
# that 23:59 and 00:00 end up next to each other. The angle itself is only an
# intermediate value and is not kept as a column.
time_angle = (df['hour_of_day'] / 24.0) * 2 * np.pi
df['time_sin'] = np.sin(time_angle)
df['time_cos'] = np.cos(time_angle)

In [None]:
# Seconds elapsed since the same user's previous interaction.
# Rows are ordered per user by time first so that shift(1) really is "the previous play".
df = df.sort_values(by=['user_id', 'timestamp_parsed']).reset_index(drop=True)
previous_play = df.groupby('user_id')['timestamp_parsed'].shift(1)
seconds_since_previous = (df['timestamp_parsed'] - previous_play).dt.total_seconds()
# -1 is the sentinel for "no previous interaction" (first row of each user).
df['time_since_prev_secs'] = seconds_since_previous.fillna(-1)

In [None]:
SESSION_GAP_SECS = 30 * 60  # 30 minutes in seconds; a longer gap between a user's plays is flagged as a session break

In [None]:
# Sessionise each user's history: a row whose gap to the previous play exceeds
# SESSION_GAP_SECS opens a new session; the running count of such breaks per
# user is the session number (the first session of every user is 0).
df = df.sort_values(by=['user_id', 'timestamp_parsed']).reset_index(drop=True)
starts_session = df['time_since_prev_secs'] > SESSION_GAP_SECS
df['session_id'] = starts_session.astype(int).groupby(df['user_id']).cumsum().astype(int)
df['session_uid'] = df['user_id'].astype(str) + "_sess_" + df['session_id'].astype(str)
df['is_new_session'] = starts_session

In [None]:
# For every interaction, count how many earlier interactions of the same user
# fall inside the preceding 24 hours.
df['plays_last_24h'] = 0
window = pd.Timedelta('24 hours')
for uid, user_rows in df.groupby('user_id'):
    user_rows = user_rows.sort_values('timestamp_parsed')
    stamps = user_rows['timestamp_parsed'].reset_index(drop=True)
    prior_counts = []
    window_start = 0  # position of the oldest play still inside the window
    for pos in range(len(stamps)):
        current = stamps[pos]
        # Slide the left edge forward until it is within 24h of the current play.
        while window_start < pos and (current - stamps[window_start]) > window:
            window_start += 1
        prior_counts.append(pos - window_start)  # plays strictly before `pos` in the window
    # user_rows.index is in the same (time-sorted) order as prior_counts.
    df.loc[user_rows.index, 'plays_last_24h'] = prior_counts

In [None]:
# Mean of `liked` per user, attached to every row of that user.
# `map` is used instead of `merge` so the cell is idempotent: merging a second
# time would create `user_like_rate_x` / `user_like_rate_y`, and the feature
# would then silently drop out of the model's feature list.
# NOTE(review): this is a target encoding built from every row's label,
# including rows that later fall into the test split; scores relying on it are
# optimistic unless it is re-fit on training rows only.
user_like_rate = df.groupby('user_id')['liked'].mean().rename('user_like_rate')
df['user_like_rate'] = df['user_id'].map(user_like_rate)

In [None]:
# Mean of `liked` per song, attached to every row of that song.
# `map` is used instead of `merge` so the cell is idempotent: merging a second
# time would create `song_global_like_x` / `song_global_like_y`, and the feature
# would then silently drop out of the model's feature list.
# NOTE(review): this is a target encoding built from every row's label,
# including rows that later fall into the test split; scores relying on it are
# optimistic unless it is re-fit on training rows only.
song_global_like = df.groupby('song_id')['liked'].mean().rename('song_global_like')
df['song_global_like'] = df['song_id'].map(song_global_like)

In [None]:
# Sum of `play_count` over all rows of a song, attached to every row of that song.
# `map` is used instead of `merge` so re-running the cell overwrites the column
# rather than producing `song_play_count_x` / `song_play_count_y` duplicates.
song_play_count = df.groupby('song_id')['play_count'].sum().rename('song_play_count')
df['song_play_count'] = df['song_id'].map(song_play_count)

In [None]:
# Mean `listening_time_mins` per user, attached to every row of that user.
# `map` is used instead of `merge` so re-running the cell overwrites the column
# rather than producing `avg_listening_time_user_x` / `_y` duplicates.
avg_listen = df.groupby('user_id')['listening_time_mins'].mean().rename('avg_listening_time_user')
df['avg_listening_time_user'] = df['user_id'].map(avg_listen)

In [None]:
# 1 when the row's repeat_count is greater than one, else 0.
df['is_repeat'] = df['repeat_count'].gt(1).astype(int)

In [None]:
# Preview the first five rows, including the engineered columns added so far.
display(df.head(5))

In [None]:
# Class balance of the target, as a bar chart and as normalised shares.
plt.figure(figsize=(2, 4))
liked_ax = sns.countplot(data=df, x='liked')
liked_ax.set_title('Target distribution: liked (0 vs 1)')
plt.show()

liked_share = df['liked'].value_counts(normalize=True)
print("Liked distribution (normalized):")
print(liked_share)

In [None]:
# Names of all float64/int64 columns (now including the engineered ones).
numeric_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)

In [None]:
# Pearson correlation of each numeric column with `liked`, strongest first
# (ordered by absolute value), drawn as a single-row heatmap.
corr_with_target = (
    df[numeric_cols]
    .corr()['liked']
    .drop('liked')
    .sort_values(key=lambda col: col.abs(), ascending=False)
)
print("\nNumeric features correlation with 'liked' (abs-sorted):")

single_row = corr_with_target.to_frame().T
plt.figure(figsize=(25, 2))
target_ax = sns.heatmap(
    single_row,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
    cbar=False,
)
target_ax.set_title("Correlation of Numeric Features with Target ('liked')")
plt.yticks(rotation=0)
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# The ten song ids that appear in the most rows.
print("\nTop 10 songs by occurrence:")
display(df['song_id'].value_counts().iloc[:10])

In [None]:
# The ten user ids that appear in the most rows.
print("\nTop 10 most active users:")
display(df['user_id'].value_counts().iloc[:10])

In [None]:
# Audio descriptors to plot, restricted to those actually present in the frame.
wanted_audio_feats = ('energy', 'danceability', 'valence', 'acousticness', 'loudness')
audio_feats = [name for name in wanted_audio_feats if name in df.columns]


In [None]:
# Pairplot of the audio features on a reproducible sample of at most 1000 rows,
# coloured by `liked` when that column is available.
sample_n = min(1000, len(df))
has_liked = 'liked' in df.columns
pairplot_columns = audio_feats + (['liked'] if has_liked else [])
sample_for_plot = df[pairplot_columns].sample(n=sample_n, random_state=1)
sns.pairplot(
    sample_for_plot,
    vars=audio_feats,
    hue='liked' if has_liked else None,
    plot_kws={'alpha': 0.4},
)
plt.suptitle('Pairplot of audio features (sampled)', y=1.02)
plt.show()

In [None]:
# Section banner in the notebook output, followed by the scikit-learn pieces
# used by the modelling cells: splitting, scaling, two linear regressors, a KNN
# classifier, and the regression/classification metrics.
print("\n=== Modeling & Evaluation ===")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
# import joblib

In [None]:
# Columns we would like to model on: audio descriptors, listening behaviour,
# cyclic time encoding, and the engineered per-user / per-song aggregates.
candidate_features = [
    'energy',
    'danceability',
    'valence',
    'acousticness',
    'loudness',
    'tempo',
    'duration_sec',
    'popularity',
    'recent_skip_rate',
    'listening_time_mins',
    'sessions_per_day',
    'time_sin',
    'time_cos',
    'song_global_like',
    'user_like_rate',
    'avg_listening_time_user',
    'song_play_count',
    'is_repeat',
    'time_since_prev_secs',
    'plays_last_24h',
]

# Keep only the candidates that exist in the frame, preserving the order above.
available_columns = set(df.columns)
features = [name for name in candidate_features if name in available_columns]
print(f"Using {len(features)} features: {features}")

In [None]:
# Name of the label column the models below are fit against.
target = 'liked'

In [None]:
# Work on a copy so the modelling steps cannot alter the EDA frame, then split
# it into the design matrix and the label vector.
df_model = df.copy()
X = df_model.loc[:, features]
y = df_model.loc[:, target]
print("Dataset for modeling -> shape:", X.shape)

In [None]:
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `user_like_rate` and `song_global_like` in X are means of `liked` taken over
# the WHOLE frame, so they carry test-set labels into the features (target
# leakage). Re-fit both encodings on the training rows only and apply them to
# the test rows; users/songs never seen in training fall back to the overall
# training like-rate.
# NOTE: the training rows still see an encoding that includes their own label;
# an out-of-fold encoding would remove that remaining optimism as well.
X_train, X_test = X_train.copy(), X_test.copy()
train_like_rate = y_train.mean()
for key_col, rate_col in (('user_id', 'user_like_rate'), ('song_id', 'song_global_like')):
    if rate_col not in X_train.columns or key_col not in df_model.columns:
        continue
    train_keys = df_model.loc[X_train.index, key_col]
    test_keys = df_model.loc[X_test.index, key_col]
    train_rates = y_train.groupby(train_keys).mean()
    X_train[rate_col] = train_keys.map(train_rates).fillna(train_like_rate)
    X_test[rate_col] = test_keys.map(train_rates).fillna(train_like_rate)

print("Train size:", len(X_train), "Test size:", len(X_test))

In [None]:
# Standardise features using statistics learned from the training rows only;
# the test rows are transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Average like ratio per song, computed on the TRAINING rows only so that the
# table matches its "(train subset)" heading and does not peek at test labels.
# (Previously this was computed over all of df_model.)
train_song_ids = df_model.loc[y_train.index, 'song_id']
song_popularity = y_train.groupby(train_song_ids).mean().sort_values(ascending=False)
print("\nTop 10 songs by average liked ratio (train subset):")
display(song_popularity.head(10))


In [None]:
# Ordinary least squares on the 0/1 label; the continuous prediction is turned
# into a class by thresholding at 0.5.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
y_pred_lr_class = np.where(y_pred_lr >= 0.5, 1, 0)

lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_accuracy = accuracy_score(y_test, y_pred_lr_class)

print("\nLinear Regression Results:")
print("R^2:", round(lr_r2, 4))
print("MSE:", round(lr_mse, 4))
print("Accuracy (threshold 0.5):", round(lr_accuracy, 4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr_class))

In [None]:
# L2-regularised linear regression on the 0/1 label; predictions are
# thresholded at 0.5 to obtain a class.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
y_pred_ridge_class = np.where(y_pred_ridge >= 0.5, 1, 0)

ridge_r2 = r2_score(y_test, y_pred_ridge)
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_accuracy = accuracy_score(y_test, y_pred_ridge_class)

print("\nRidge Regression Results:")
print("R^2:", round(ridge_r2, 4))
print("MSE:", round(ridge_mse, 4))
print("Accuracy (threshold 0.5):", round(ridge_accuracy, 4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_ridge_class))


In [None]:
# 7-nearest-neighbours classifier on the standardised features.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn, zero_division=0)

print("\nKNN Results:")
print("Accuracy:", round(knn_accuracy, 4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification report:\n", knn_report)


In [None]:
# Rank features by the magnitude of their Ridge coefficient. The inputs were
# standardised, so the magnitudes are on a comparable scale.
coeffs = ridge.coef_
feat_importance = (
    pd.DataFrame({'feature': features, 'coefficient': coeffs})
    .assign(abs_coef=lambda table: table['coefficient'].abs())
    .sort_values('abs_coef', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(data=feat_importance, y='feature', x='coefficient')
plt.title('Feature importance (Ridge coefficients)')
plt.tight_layout()
plt.show()

print("\nFeature importance (top rows):")
top_coefficients = feat_importance[['feature', 'coefficient']].head(20)
display(top_coefficients.round(4))


In [None]:
# Side-by-side metrics for the three models. R2/MSE only apply to the two
# regressors, so KNN gets NaN there; accuracy is reported for all three.
regression_predictions = (y_pred_lr, y_pred_ridge)
class_predictions = (y_pred_lr_class, y_pred_ridge_class, y_pred_knn)

results_summary = pd.DataFrame({
    'Model': ['LinearRegression', 'Ridge', 'KNN'],
    'R2': [round(r2_score(y_test, pred), 4) for pred in regression_predictions] + [np.nan],
    'MSE': [round(mean_squared_error(y_test, pred), 4) for pred in regression_predictions] + [np.nan],
    'Accuracy': [round(accuracy_score(y_test, pred), 4) for pred in class_predictions],
})
print("\nModel performance summary:")
display(results_summary)

In [None]:
# RBF-kernel support vector classifier, trained and evaluated on the SAME
# standardised train/test split as the models above.
#
# This cell used to re-split X with test_size=0.8 and overwrite X_train,
# X_test, y_train, y_test and scaler. That trained the SVM on only 20% of the
# data, scored it on a different test set than the other models, and changed
# the results of earlier cells if they were re-run afterwards.
# NOTE: SVC training time grows faster than linearly with the number of rows;
# if fitting on the full training split is too slow, subsample the TRAINING rows
# into cell-local variables rather than touching the shared split.
from sklearn.svm import SVC

# C=1.0 and the default gamma are the library defaults for the RBF kernel.
svm = SVC(kernel='rbf', C=1.0, random_state=42)

print("Training the SVM model... (this may take a moment)")
svm.fit(X_train_scaled, y_train)

# Predict on the shared, held-out test rows.
y_pred_svm = svm.predict(X_test_scaled)

print("\nSVM Classifier Results:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_svm), 4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification report:\n", classification_report(y_test, y_pred_svm, zero_division=0))

In [None]:
# Requires the SVM cell (for y_pred_svm) and the earlier summary cell (for
# results_summary) to have been run.
#
# Append the SVM row to the existing summary instead of replacing the table, so
# the output really is an updated comparison. Any previous 'SVM' row is removed
# first, which keeps the cell safe to re-run. The SVM has no R2/MSE, so those
# columns are NaN for it, as they are for KNN.
svm_row = pd.DataFrame({
    'Model': ['SVM'],
    'Accuracy': [round(accuracy_score(y_test, y_pred_svm), 4)],
})
results_summary = pd.concat(
    [results_summary[results_summary['Model'] != 'SVM'], svm_row],
    ignore_index=True,
)

print("\nUpdated Model performance summary:")
display(results_summary.sort_values(by='Accuracy', ascending=False))

# The accuracies are only comparable when every model was scored on the same
# test rows; check that before reading a ranking off this table.