In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Load the dataset, index it by 'id', discard incomplete rows and remove the
# columns that are not used as model inputs.
# NOTE(review): dropna() runs before the column drop, so a row is discarded
# when ANY column is missing -- including the columns removed right after.
# Confirm that this is intended.
UNUSED_COLUMNS = [
    'Artist', 'Url_spotify', 'Track', 'Album', 'Uri', 'Url_youtube',
    'Title', 'Channel', 'Views', 'Likes', 'Comments', 'Description',
    'Stream', 'Popularity Score',
]
df = (
    pd.read_csv('datasets/Spotify_YoutubeClean.csv')
    .set_index('id')
    .dropna()
    .drop(columns=UNUSED_COLUMNS)
)

In [11]:
# 'Popularity' is the prediction target; every other remaining column is a feature.
y = df['Popularity']
X = df.drop(columns=['Popularity'])

In [12]:
# Encode the target labels as integer class codes.
# The encoder is fitted on the raw 'Popularity' column of `df` rather than on
# the current value of `y`: with `y = fit_transform(y)` a second run of this
# cell would re-fit the encoder on the already-encoded integers and
# label_encoder.classes_ would no longer hold the original labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Popularity'])

In [13]:
# Columns that are expanded into one-hot indicator columns
categorical_columns = ['Album_type', 'Licensed', 'official_video']

# Fit the encoder and produce a dense array of indicator values
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(X[categorical_columns])

# Wrap the encoded array in a DataFrame labelled with the generated feature names
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns),
)

# Give both frames a fresh positional index so the column-wise concat below
# pairs rows by position instead of by the 'id' index
X = X.reset_index(drop=True)
one_hot_df = one_hot_df.reset_index(drop=True)

# Swap the original categorical columns for their one-hot counterparts
X_encoded = pd.concat(
    [X.drop(columns=categorical_columns), one_hot_df],
    axis=1,
)


In [14]:
# Hold out 20% of the rows as the test set; random_state pins the split so it
# is the same on every run.
split_parts = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Hyperparameter grid explored by the search
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Exhaustive grid search with 5-fold cross-validation on the training split
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best Parameters:", grid_search.best_params_)

# GridSearchCV refits the best configuration on the whole training split
# (refit=True is the default), so reuse that fitted forest instead of training
# an identical one a second time.
rf_classifier = grid_search.best_estimator_

# Predict on the held-out test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the performance.
# The report text is stored under its own name: assigning it to
# `classification_report` would shadow the imported function and make this
# cell fail with "'str' object is not callable" when it is run again.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Best Parameters: {'max_depth': 20, 'n_estimators': 200}
Accuracy: 0.5359937402190923
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.19      0.29       796
           1       0.62      0.58      0.60      1086
           2       0.48      0.76      0.59      1551
           3       0.70      0.23      0.35       401

    accuracy                           0.54      3834
   macro avg       0.60      0.44      0.46      3834
weighted avg       0.57      0.54      0.51      3834



In [18]:
# Score each fold with macro-averaged F1
scoring = make_scorer(f1_score, average='macro')

# 5-fold stratified cross-validation, shuffled with a fixed seed.
# NOTE(review): the folds are drawn from all of X_encoded, which includes the
# rows held out as X_test and the rows used to select the hyperparameters --
# confirm this is the intended evaluation set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_classifier, X_encoded, y, cv=cv, scoring=scoring)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())



Cross-Validation Scores: [0.46828388 0.4671059  0.47358995 0.47674914 0.47120847]
Mean Cross-Validation Score: 0.4713874673373576
