In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load datasets/Spotify_YoutubeClean.csv, index it by the 'id' column, remove
# rows containing missing values, then discard the columns listed below.
# NOTE(review): dropna() runs while the soon-to-be-dropped columns are still
# present, so a missing value in any of them also removes the row — confirm
# this is intended.
unused_columns = [
    'Artist', 'Url_spotify', 'Track', 'Album', 'Uri', 'Url_youtube',
    'Title', 'Channel', 'Views', 'Likes', 'Comments', 'Description',
    'Stream', 'Popularity Score',
]
df = (
    pd.read_csv('datasets/Spotify_YoutubeClean.csv')
    .set_index('id')
    .dropna()
    .drop(columns=unused_columns)
)

In [4]:
# 'Popularity' is the prediction target (y); every remaining column of df
# becomes a feature (X).
y = df['Popularity']
X = df.drop(columns=['Popularity'])

In [5]:
# Encode the target classes as integer labels.
# The encoder is fitted on the original 'Popularity' column of df rather than
# on `y` itself: `y` is overwritten by this cell, so fitting on `y` would make
# a re-run fit the encoder on already-encoded integers and
# `label_encoder.classes_` would no longer hold the original class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Popularity'])

In [6]:
# Expand the categorical feature columns into one-hot indicator columns.
categorical_columns = ['Album_type', 'Licensed', 'official_video']
X_encoded = pd.get_dummies(data=X, columns=categorical_columns)

In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Build a random forest with a fixed seed and fit it on the training split.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = rf_classifier.predict(X_test)

# Evaluate the predictions.
# The report text is stored under its own name (`report`): assigning it to
# `classification_report` would shadow the imported function, and re-running
# this cell would then fail with "TypeError: 'str' object is not callable".
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.5320813771517997
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.58      0.60      1086
           1       0.50      0.25      0.33       796
           2       0.49      0.72      0.58      1551
           3       0.65      0.23      0.34       401

    accuracy                           0.53      3834
   macro avg       0.57      0.45      0.46      3834
weighted avg       0.55      0.53      0.51      3834



In [9]:
# 5-fold cross-validation of the classifier over the full encoded dataset.
cv_scores = cross_val_score(estimator=rf_classifier, X=X_encoded, y=y, cv=5)
mean_cv_score = cv_scores.mean()

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", mean_cv_score)

Cross-Validation Scores: [0.52529995 0.50130412 0.52895149 0.52217006 0.49061033]
Mean Cross-Validation Score: 0.5136671883150756
