In [None]:
# Third-Party Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Tools for splitting data into train/test sets and performing hyperparameter optimization
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

# LabelEncoder: converts categorical labels into numbers; StandardScaler: normalizes data
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Metrics for evaluating model performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


from sklearn.ensemble import RandomForestClassifier



In [None]:
# Read the Spotify tracks CSV into a DataFrame.
# The path is relative, so the file is resolved against the notebook's working directory.
file_path = 'spotify_tracks.csv'
spotify_tracks = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Summarize the raw dataset: column dtypes / non-null counts, then null counts per column.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
spotify_tracks.info()
print(spotify_tracks.isnull().sum())

In [None]:
# Count how many tracks carry each genre label (most frequent first)
spotify_tracks['track_genre'].value_counts()

In [None]:
# Restrict the dataset to the genres this notebook classifies.
# NOTE(review): 'punk_rock' is spelled with an underscore while 'hip-hop' uses a hyphen —
# confirm both spellings against the value_counts() output of the previous cell.
selected_genres = ['pop', 'country', 'hip-hop', 'punk_rock', 'latin', 'edm']

# isin() silently skips labels that match no rows, so report any such label explicitly.
missing_genres = sorted(set(selected_genres) - set(spotify_tracks['track_genre'].dropna()))
if missing_genres:
    print('Warning: selected genres not found in track_genre:', missing_genres)

# .copy() makes the filtered frame independent of the original, so the in-place
# drop()/dropna() calls in later cells do not raise SettingWithCopyWarning.
spotify_tracks = spotify_tracks[spotify_tracks['track_genre'].isin(selected_genres)].copy()

In [None]:
# Remove columns that are not used as model inputs.
columns_to_drop = ['Unnamed: 0', 'track_id', 'track_name', 'artists', 'album_name', 'time_signature']

# Rebind instead of inplace=True, and ignore columns that are already gone:
# the cell can then be re-run without a KeyError, and it also works when the CSV
# has no 'Unnamed: 0' index column. `axis=1` was redundant alongside `columns=`.
# Trade-off: a misspelled column name is skipped silently rather than raising.
spotify_tracks = spotify_tracks.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Remove every row that has at least one missing value, printing the per-column
# null counts before and after so the effect of the drop is visible.
print("Missing values before dropping:", spotify_tracks.isnull().sum())

# Rebind the name rather than using inplace=True: spotify_tracks is a filtered
# subset at this point, and mutating such a frame in place can emit
# SettingWithCopyWarning. The resulting rows are the same.
spotify_tracks = spotify_tracks.dropna()

print("Missing values after dropping:", spotify_tracks.isnull().sum())

In [None]:
# Split the cleaned frame into the label column (genre) and the predictor columns.
target = spotify_tracks.loc[:, 'track_genre']
features = spotify_tracks.drop('track_genre', axis=1)

In [None]:
# Two-stage split with a fixed seed:
#   1) hold out 20% of all rows as the test set;
#   2) take 25% of the remaining 80% as the validation set,
# which yields a 60% / 20% / 20% train / validation / test partition.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [None]:
# Map genre strings to integer class ids.
# The encoder learns its classes from the training labels only; the validation and
# test labels are then converted with that same mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val, y_test = [label_encoder.transform(split_labels) for split_labels in (y_val, y_test)]

In [None]:
# Standardize the features.
# The scaler's statistics are estimated from the training split only and then
# applied unchanged to all three splits, so validation/test data never influence them.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = [scaler.transform(split) for split in (X_train, X_val, X_test)]

In [None]:
# Baseline model: a Random Forest with default hyperparameters, evaluated on the
# validation split. random_state is fixed so repeated runs build the same forest.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions are made for X_val, so every metric below is computed against y_val.
# Scoring them against y_test would pair each prediction with the label of an
# unrelated row and produce meaningless numbers.
y_pred = rf_model.predict(X_val)

# Fix the class order explicitly so matrix rows/columns always line up with the
# genre names, even if some genre happens to be absent from this split.
genre_names = [str(name) for name in label_encoder.classes_]
class_ids = np.arange(len(genre_names))

# Confusion Matrix (rows = actual genre, columns = predicted genre)
cm = confusion_matrix(y_val, y_pred, labels=class_ids)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=genre_names, yticklabels=genre_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Classification Report as a heatmap
report = classification_report(
    y_val, y_pred, labels=class_ids, target_names=genre_names, output_dict=True
)
report_df = pd.DataFrame(report).transpose()

plt.figure(figsize=(8, 4))
# iloc[:-1, :-1] leaves out the last summary row and the 'support' column, whose
# counts are on a different scale than the 0-1 metric values.
sns.heatmap(report_df.iloc[:-1, :-1], annot=True, cmap='YlGnBu')
plt.title("Classification Report")
plt.show()

# Text summary of the same validation-set results
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print('\nClassification Report: \n',
      classification_report(y_val, y_pred, labels=class_ids, target_names=genre_names))
print('\nConfusion Matrix: \n', cm)


In [None]:
# Exhaustive hyperparameter search for the Random Forest.
# The grid below spans 4 * 5 * 3 * 3 * 2 = 360 candidate settings; with 10-fold
# cross-validation that is 3600 model fits, so this cell can take a long time.
param_grid = dict(
    n_estimators=[10, 100, 200, 300],        # number of trees in the forest
    max_depth=[3, 5, 10, 20, None],          # None = no depth limit
    max_features=['sqrt', 'log2', None],     # features considered at each split
    min_samples_leaf=[1, 2, 4],              # minimum samples required in a leaf
    bootstrap=[True, False],                 # sample rows with replacement or not
)

rf_model = RandomForestClassifier(random_state=42)

# Score every candidate by cross-validated accuracy on the training split,
# using all available CPU cores (n_jobs=-1) and logging progress (verbose=2).
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

# Estimator refit with the best-scoring parameter combination
best_rf = grid_search.best_estimator_

print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)



In [None]:
# Per-feature importance scores of the tuned forest (best estimator from the grid search)
feature_importance = best_rf.feature_importances_

# Take the column names from `features`, the DataFrame the splits were drawn from.
# No variable named `X` is defined in the cells shown (referencing it raises NameError),
# and the X_* splits no longer carry column labels after scaling with the default
# StandardScaler output. Splitting and scaling keep the column order, so these names
# line up with the importance scores.
feature_names = features.columns

In [None]:
# Bar chart of the feature importances, most important feature on the left.
sorted_indices = np.argsort(feature_importance)[::-1]   # indices in descending importance
bar_positions = range(len(feature_importance))
ordered_names = np.array(feature_names)[sorted_indices]

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(bar_positions, feature_importance[sorted_indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ordered_names, rotation=90)   # vertical labels so long names do not overlap
ax.set_title('Random Forest Feature Importance for Music Tracks')
plt.show()
