<a href="https://colab.research.google.com/github/Serag11/Machine-Learning/blob/main/Random_forest_XGBoost_on_spotify_dataset_NTI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets
import pandas as pd
import opendatasets as od
od.download('https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset')
df = pd.read_csv('/content/-spotify-tracks-dataset/dataset.csv')

In [None]:
!pip install ydata-profiling
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Profiling Report")
profile.to_notebook_iframe()

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Number of rows per distinct `artists` value, most frequent first.
artists_column = df['artists']
artists_column.value_counts()

In [None]:
# For every track_id, count how many distinct genres it is listed under.
genres_per_track = df.groupby('track_id')['track_genre']
track_genre_counts = genres_per_track.nunique()
print(track_genre_counts.head(10))

In [None]:
# Tracks that appear under more than one genre have an ambiguous label, so
# collect their ids and exclude every row that belongs to them.
tracks_to_exclude = track_genre_counts[track_genre_counts > 1].index

# .copy() makes df_filtered an independent frame instead of a slice of df, so
# the later column drops / de-duplication do not raise SettingWithCopyWarning
# and never touch the original df.
df_filtered = df[~df['track_id'].isin(tracks_to_exclude)].copy()
print(f"Original DataFrame shape: {df.shape}")
print(f"Filtered DataFrame shape: {df_filtered.shape}")
display(df_filtered.head())


In [None]:
# Remove the CSV index column, free-text metadata and the features that are not
# used for modelling in this notebook.
# Reassigning instead of inplace=True avoids mutating a frame derived from a
# slice of df, and errors='ignore' makes re-running this cell a no-op instead
# of a KeyError once the columns are already gone.
columns_to_drop = ['Unnamed: 0', 'artists', 'album_name', 'track_name', 'duration_ms', 'explicit']
df_filtered = df_filtered.drop(columns=columns_to_drop, errors='ignore')
df_filtered.info()

In [None]:
# Keep one row per track_id (the first occurrence is retained).
# Reassigning instead of inplace=True avoids mutating a frame derived from a
# slice of df and keeps the cell safe to re-run.
df_filtered = df_filtered.drop_duplicates(subset='track_id')
df_filtered.info()

In [None]:
# Compare the number of distinct track ids before and after filtering.
unique_ids_original = df['track_id'].nunique()
unique_ids_filtered = df_filtered['track_id'].nunique()
print(f"number of unique values in the original df {unique_ids_original}")
print(f"number of unique values in the filtered df {unique_ids_filtered}")

In [None]:
# track_id is only an identifier, so drop it before modelling.
# Reassign rather than mutate in place; errors='ignore' keeps the cell
# re-runnable after the column has already been removed.
df_filtered = df_filtered.drop(columns=['track_id'], errors='ignore')
df_filtered.head()

In [None]:
# Exclude time_signature from the feature set.
# Reassign rather than mutate in place; errors='ignore' keeps the cell
# re-runnable after the column has already been removed.
df_filtered = df_filtered.drop(columns=['time_signature'], errors='ignore')
df_filtered.head()

In [None]:
# Exclude key and mode from the feature set.
# Reassign rather than mutate in place; errors='ignore' keeps the cell
# re-runnable after the columns have already been removed.
df_filtered = df_filtered.drop(columns=['key', 'mode'], errors='ignore')
df_filtered.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder , StandardScaler

# One-hot encode the target genre. The columns are labelled with the genre
# names (in the encoder's category order) so that a positional class index,
# e.g. the argmax over these columns, can be mapped back to a genre via
# y.columns[i]. The original row index is kept so rows stay traceable.
encoder = OneHotEncoder(sparse_output=False)
encoded_df = pd.DataFrame(
    encoder.fit_transform(df_filtered[['track_genre']]),
    columns=encoder.categories_[0],
    index=df_filtered.index,
)

# Standardise every remaining column. Feature names and the row index are
# preserved so coefficients / feature importances remain interpretable.
feature_df = df_filtered.drop(columns=['track_genre'])
scalar = StandardScaler()
scaled_df = pd.DataFrame(
    scalar.fit_transform(feature_df),
    columns=feature_df.columns,
    index=feature_df.index,
)
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features. Consider fitting it
# on the training rows only (e.g. inside a Pipeline).

y = encoded_df
X = scaled_df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Multiclass logistic regression baseline.
# With the 'lbfgs' solver scikit-learn already fits a multinomial (softmax)
# model for multiclass targets, so the deprecated `multi_class` argument is
# not passed: the fitted model is the same and recent scikit-learn releases
# no longer warn about (or reject) the argument.
# max_iter is raised above the default to give the solver room to converge.
model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Convert the one-hot encoded targets back to integer class labels
# (the position of the 1 in each row).
y_train_labels = np.argmax(y_train.values, axis=1)
y_test_labels = np.argmax(y_test.values, axis=1)

# Train the model
model.fit(X_train, y_train_labels)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test_labels, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest: 200 trees, all available cores, fixed seed for repeatability.
rf_params = dict(n_estimators=200, random_state=42, n_jobs=-1)
model_rf = RandomForestClassifier(**rf_params)

# Fit on the integer class labels derived from the one-hot targets.
model_rf.fit(X_train, y_train_labels)

# Predict the held-out rows and report plain accuracy.
y_pred_rf = model_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test_labels, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf:.4f}")

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize the XGBoost classifier for multiclass prediction.
# `use_label_encoder` is no longer passed: it is deprecated in recent XGBoost
# releases and only produces an "unused parameter" warning. The targets are
# already integer class labels, so no label encoding is needed.
model_xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train_labels)),
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1,
)

# Train the model using the label-encoded target variable
model_xgb.fit(X_train, y_train_labels)

# Make predictions on the test set
y_pred_xgb = model_xgb.predict(X_test)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test_labels, y_pred_xgb)
print(f"XGBoost Model Accuracy: {accuracy_xgb:.4f}")