In [34]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Training data: the top-songs CSV, addressed by an absolute path on the
# author's Windows machine.
file_path = r"C:\Users\HP\OneDrive\Desktop\Music-Popularity-Prediction-Pipeline-Summative\data\train\top_songs.csv"
# The encoding is passed explicitly as ISO-8859-1 instead of relying on the
# pandas default.
data = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

# Preprocessing the dataset
# The columns below hold counts that are stored as text with comma separators,
# so the commas are stripped and the values converted to floats.
numeric_cols = ['Spotify Streams', 'Spotify Playlist Count', 'Spotify Playlist Reach',
                'YouTube Views', 'YouTube Likes', 'TikTok Posts', 'TikTok Likes', 'TikTok Views',
                'YouTube Playlist Reach', 'AirPlay Spins', 'SiriusXM Spins',
                'Deezer Playlist Reach', 'Pandora Streams', 'Pandora Track Stations',
                'Soundcloud Streams', 'Shazam Counts']

for col in numeric_cols:
    column = data[col]
    # The .str accessor only exists on text columns. A column that pandas
    # already parsed as numeric (no commas present, or entirely empty) is simply
    # cast, instead of raising AttributeError. Unparseable text still raises.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.str.replace(',', '', regex=False)
    data[col] = column.astype(float)

# Dropping "TIDAL Popularity" column because it is empty in the set.
# errors='ignore' keeps this a no-op when the column is already gone
# (e.g. when the cell is re-run).
data.drop(columns=['TIDAL Popularity'], inplace=True, errors='ignore')

# Fill missing numeric values with the mean of their column.
# NOTE(review): these means (and the median below) are computed on the full
# dataset, before the train/test split further down, so held-out rows influence
# the imputation and the label threshold. Consider deriving them from the
# training split only.
data.fillna(data.mean(numeric_only=True), inplace=True)

# Fill missing 'Artist' values with 'Unknown'.
# Assign the result back: calling fillna(..., inplace=True) on data['Artist']
# acts on an intermediate object, emits a FutureWarning, and stops updating
# `data` in pandas 3.0.
data['Artist'] = data['Artist'].fillna('Unknown')

# Creating a binary target column 'High_Potential' based on 'Track Score':
# 1 when the score is strictly above the median, 0 otherwise.
track_score_median = data['Track Score'].median()
data['High_Potential'] = (data['Track Score'] > track_score_median).astype(int)

# Build the label vector (y) and the feature matrix (X).
# Left out of the features: the label itself, 'Track Score' (the label is
# computed from it), and the other listed columns.
excluded_columns = ['High_Potential', 'Track', 'Album Name', 'Artist',
                    'Release Date', 'ISRC', 'All Time Rank', 'Track Score']
y = data['High_Potential']
X = data.drop(columns=excluded_columns)

# One-hot encode whichever object-dtype columns are still present, dropping the
# first level of each encoded column.
categorical_columns = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest of 100 trees on the training split (random_state fixed
# for repeatability).
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Persist the fitted model as a pickle file in the current working directory.
model_filename = 'music_popularity_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Score the model on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Report the metrics.
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)

print("\nModel training complete and saved as 'music_popularity_model.pkl'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Artist'].fillna('Unknown', inplace=True)


Model Accuracy: 0.7630434782608696

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.80      0.77       448
           1       0.79      0.73      0.76       472

    accuracy                           0.76       920
   macro avg       0.76      0.76      0.76       920
weighted avg       0.77      0.76      0.76       920


Confusion Matrix:
 [[357  91]
 [127 345]]

Model training complete and saved as 'music_popularity_model.pkl'.
