In [None]:
import pandas as pd

# Read the raw track dataset from disk into a DataFrame.
file_path = '../dataset.csv'
data = pd.read_csv(file_path)

# Preview the first rows, then summarise the columns (dtypes, non-null counts).
print("Initial Data:", data.head(), sep="\n")
print("\nData Information:")
data.info()

# Step 1: Drop the redundant 'Unnamed: 0' column (presumably an index that was
# written into the CSV -- verify against the raw file). errors='ignore' keeps
# this cell re-runnable on a CSV that no longer contains the column instead of
# failing with KeyError.
data_cleaned = data.drop(columns=['Unnamed: 0'], errors='ignore')
print("\nData after dropping 'Unnamed: 0':")
# The header above previously had nothing printed under it.
print(data_cleaned.head())

# Step 2: Handle missing values by dropping rows with missing values in critical columns
data_cleaned = data_cleaned.dropna(subset=['artists', 'album_name', 'track_name'])
print("\nMissing values after cleaning:")
print(data_cleaned.isnull().sum())

# Step 3: Remove duplicates based on 'track_id' (the first occurrence is kept)
data_cleaned = data_cleaned.drop_duplicates(subset=['track_id'])
print("\nShape of the data after removing duplicates:", data_cleaned.shape)

# Final Cleaned Data Overview (summary statistics of the numeric columns)
print("\nData Overview:")
print(data_cleaned.describe())

# Persist the cleaned frame; index=False so no extra index column is written
# (which would otherwise come back as 'Unnamed: 0' on the next read).
output_path = '../cleaned_dataset.csv'
data_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data saved as {output_path}")


In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset
data = pd.read_csv('../cleaned_dataset.csv')

# Define features for clustering
features_for_clustering = ['danceability', 'energy', 'valence', 'tempo', 'key', 'loudness', 'mode',
                           'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'time_signature']

# k-means groups points by Euclidean distance, so columns on a larger numeric
# scale dominate the result when the raw values are used (presumably tempo and
# loudness here -- confirm with data[features_for_clustering].describe()).
# Standardise every feature first so each one contributes comparably.
# A dedicated scaler is used so the scaler fitted later for the classifier is
# unaffected.
cluster_scaler = StandardScaler()
clustering_matrix = cluster_scaler.fit_transform(data[features_for_clustering])

# Fit k-means clustering for all features.
# n_init is set explicitly: its default differs between scikit-learn releases,
# which would otherwise change the clusters (and every derived label).
kmeans_all = KMeans(n_clusters=4, n_init=10, random_state=42).fit(clustering_matrix)
data['cluster'] = kmeans_all.labels_

# Define labels based on clusters (one-hot: each row gets exactly one label).
# NOTE(review): k-means cluster ids are arbitrary, so this id -> mood mapping
# is only an example; inspect kmeans_all.cluster_centers_ before trusting it.
cluster_to_label = {0: 'is_happy', 1: 'is_sad', 2: 'is_fast', 3: 'is_slow'}
for cluster_id, label_column in cluster_to_label.items():
    data[label_column] = (data['cluster'] == cluster_id).astype(int)

# Plot the distribution of labels.
# Drawing four countplots on the same axes stacks their 0/1 bars on top of each
# other, hiding most of them and making the legend meaningless. Count the
# positive rows per label instead and draw one bar per label.
label_names = {'is_happy': 'Happy', 'is_sad': 'Sad', 'is_fast': 'Fast', 'is_slow': 'Slow'}
label_counts = data[list(label_names)].sum().rename(index=label_names)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=label_counts.index.tolist(), y=label_counts.values, ax=ax)
ax.set_title('Distribution of Labels')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

# Plot feature distributions for all features (one histogram per column)
data[features_for_clustering].hist(figsize=(14, 12))
plt.suptitle('Feature Distributions')
plt.show()

# Assemble the feature matrix and the four binary target columns
target_columns = ['is_happy', 'is_sad', 'is_fast', 'is_slow']
X, y = data[features_for_clustering], data[target_columns]

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardize features: statistics come from the training split only and are
# then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Multi-Output Classifier: one random forest per target column
classifier = RandomForestClassifier(random_state=42)
multi_target_classifier = MultiOutputClassifier(classifier, n_jobs=-1).fit(X_train_scaled, y_train)

# Predict on the held-out rows
y_pred = multi_target_classifier.predict(X_test_scaled)

# Evaluate the model, reporting each target under its display name
report = classification_report(y_test, y_pred, target_names=['Happy', 'Sad', 'Fast', 'Slow'])
print("Classification Report:\n", report, sep="\n")

# Plot the distribution of new labels: two figures, each holding a pair of
# side-by-side count plots (Happy/Sad, then Fast/Slow).
label_panels = [
    (('is_happy', 'Distribution of Happy Labels'), ('is_sad', 'Distribution of Sad Labels')),
    (('is_fast', 'Distribution of Fast Labels'), ('is_slow', 'Distribution of Slow Labels')),
]

for panel_pair in label_panels:
    plt.figure(figsize=(12, 6))

    for position, (column, title) in enumerate(panel_pair, start=1):
        plt.subplot(1, 2, position)
        sns.countplot(data=data, x=column)
        plt.title(title)

    plt.tight_layout()
    plt.show()

# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
os.makedirs('../pkl', exist_ok=True)

# Save the scaler
joblib.dump(scaler, '../pkl/scaler.pkl')

# Save the multi-output classifier
joblib.dump(multi_target_classifier, '../pkl/multi_target_classifier.pkl')


In [None]:
import os
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import joblib

# Load variables from a local .env file (if present) into the environment
load_dotenv()

# Spotify application credentials; os.getenv yields None when a variable is unset
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

# Build the OAuth manager first, then hand it to the API client
auth_manager = SpotifyOAuth(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri='https://google.com',
)
sp = spotipy.Spotify(auth_manager=auth_manager)

playlist_id = '2uORYX3pVmRBUJe8uXrK8H'

# playlist_tracks returns a single page (at most 100 items); follow the `next`
# links so longer playlists are not silently truncated.
results = sp.playlist_tracks(playlist_id)
playlist_items = list(results['items'])
page = results
while page and page.get('next'):
    page = sp.next(page)
    if page:
        playlist_items.extend(page['items'])

# Entries whose track object is missing, or that have no Spotify id (e.g. local
# files), cannot be looked up; skip them instead of crashing.
candidates = [
    item for item in playlist_items
    if item.get('track') and item['track'].get('id')
]

# Audio feature columns, in the order the model was trained on
feature_names = ['danceability', 'energy', 'valence', 'tempo', 'key', 'loudness', 'mode',
                 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'time_signature']

# The audio-features endpoint accepts up to 100 ids per request, so fetch in
# batches rather than issuing one HTTP call per track.
# NOTE(review): Spotify has restricted this endpoint for newly registered apps
# -- confirm the credentials in use still have access.
AUDIO_FEATURES_BATCH_SIZE = 100

data = []
urls = []
# `tracks` keeps only the playlist entries that produced a feature row, so that
# tracks[i], urls[i] and row i of features_df always describe the same song.
tracks = []

for start in range(0, len(candidates), AUDIO_FEATURES_BATCH_SIZE):
    batch = candidates[start:start + AUDIO_FEATURES_BATCH_SIZE]
    batch_features = sp.audio_features([item['track']['id'] for item in batch]) or []
    for item, features in zip(batch, batch_features):
        # The API reports None for tracks it has no analysis for
        if features is None:
            continue
        data.append({name: features[name] for name in feature_names})
        urls.append(item['track']['external_urls']['spotify'])
        tracks.append(item)

# Passing columns keeps the expected schema even when no track had features
features_df = pd.DataFrame(data, columns=feature_names)

# Columns (and column order) the saved scaler and classifier were fitted on
required_features = [
    'danceability', 'energy', 'valence', 'tempo', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'time_signature',
]

# Restore the fitted scaler and classifier from disk.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
scaler = joblib.load('../pkl/scaler.pkl')
multi_target_classifier = joblib.load('../pkl/multi_target_classifier.pkl')

# Keep exactly the training columns, in training order
features_df = features_df.loc[:, required_features]

# Apply the training-time scaling, then predict the four binary labels per track
features_scaled = scaler.transform(features_df)
predicted_categories = multi_target_classifier.predict(features_scaled)

# Display names, in the same order as the classifier's target columns
categories = ['Happy', 'Sad', 'Fast', 'Slow']

# tracks[i], urls[i] and row i of features_scaled are combined by position
# below; fail loudly if they are not the same length rather than labelling the
# wrong song or dying with an IndexError halfway through.
if not (len(tracks) == len(urls) == len(features_scaled)):
    raise ValueError(
        f"Misaligned inputs: {len(tracks)} tracks, {len(urls)} urls, "
        f"{len(features_scaled)} feature rows"
    )


def _positive_class_scores(estimator, X):
    """Return the predicted probability of label 1 for every row of X.

    Args:
        estimator: a fitted single-output classifier exposing ``classes_`` and
            ``predict_proba``.
        X: 2-D array of scaled features, one row per track.

    Returns:
        A list with one float per row; all zeros when the estimator never saw
        the positive class during training.
    """
    classes = list(estimator.classes_)
    if 1 not in classes:
        return [0.0] * len(X)
    return estimator.predict_proba(X)[:, classes.index(1)].tolist()


def _pick_label(label_indices, row, scores):
    """Return the category name with the highest score among label_indices.

    Ties go to the first index, i.e. 'Happy' over 'Sad' and 'Fast' over 'Slow'.
    """
    best = max(label_indices, key=lambda j: scores[j][row])
    return categories[best]


# label_scores[j][i] is P(categories[j]) for track i. MultiOutputClassifier
# keeps one fitted estimator per target column in `estimators_`.
label_scores = [
    _positive_class_scores(estimator, features_scaled)
    for estimator in multi_target_classifier.estimators_
]

results = []

for i, track in enumerate(tracks):
    # Ensure exactly one category from each group by taking the more probable
    # label of each pair. The previous fallback conditions were always true
    # (every missing pair became 'Happy' / 'Fast'), and both labels of a pair
    # could be listed at once.
    combined_labels = [
        _pick_label((0, 1), i, label_scores),
        _pick_label((2, 3), i, label_scores),
    ]

    results.append({
        'track': track['track']['name'],
        'artist': track['track']['artists'][0]['name'],
        'categories': ', '.join(combined_labels),
        'url': urls[i]
    })

for result in results:
    print(f"Track: {result['track']}\nArtist: {result['artist']}\nCategories: {result['categories']}\nURL: {result['url']}\n")
