# Import required Libraries

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import difflib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Load the Dataset

In [None]:
# Read the Spotify features CSV into the working DataFrame
dataset_path = "resources/SpotifyFeatures.csv"
df = pd.read_csv(dataset_path)

# Quick inspection: column dtypes / non-null counts, then the leading rows
df.info()
df.head()


# Data Preprocessing

In [None]:
# Preprocess Data
# Convert Major/Minor to numeric. Guarded so that re-running this cell does not
# map the already-numeric column a second time (which would turn it into NaN).
if not pd.api.types.is_numeric_dtype(df["mode"]):
    df["mode"] = df["mode"].map({"Major": 1, "Minor": 0})

# Encode artist names into numeric values
# NOTE(review): LabelEncoder codes are arbitrary integers; using them as a
# numeric feature makes scaling/PCA/cosine similarity treat artists with
# nearby codes as "close". Confirm this is intended.
artist_encoder = LabelEncoder()
df["artist_encoded"] = artist_encoder.fit_transform(df["artist_name"])

# Select numeric features, including artist encoding
numeric_features = ["danceability", "energy", "tempo", "acousticness",
                    "instrumentalness", "valence", "loudness", "mode", "artist_encoded"]

# Sample subset for efficiency; cap at the number of rows available so a
# smaller dataset does not make DataFrame.sample raise.
sample_size = min(5000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)


# Feature Scaling

In [None]:
# Standardise the selected feature columns of the sample
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_sample[numeric_features])

# Wrap the scaled array back into a DataFrame under the same column names
X_scaled_df = pd.DataFrame(scaled_values, columns=numeric_features)

# Apply PCA for Dimensionality Reduction

In [None]:
# Apply PCA for Dimensionality Reduction
pca = PCA(n_components=7)
X_pca = pca.fit_transform(X_scaled_df)
# Sum of the per-component ratios = share of variance kept by the projection
print(f"Explained Variance Ratio: {sum(pca.explained_variance_ratio_):.2f}")

# Create DataFrame for PCA results.
# Column labels are derived from the projection's actual width, so changing
# n_components above does not require editing a hard-coded label list.
pc_columns = [f"PC{i}" for i in range(1, X_pca.shape[1] + 1)]
pca_df = pd.DataFrame(X_pca, columns=pc_columns)
pca_df["artist_name"] = df_sample["artist_name"].values
pca_df["original_index"] = df_sample.index  # Row label in df_sample, used to map clusters back
pca_df.head()  # Preview PCA DataFrame

# Apply K-Means Clustering

In [None]:
# Fit K-Means (5 clusters, fixed seed) on the PCA projection
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_pca)
df_sample["Cluster"] = cluster_labels

# Copy each row's cluster label into pca_df, looked up through original_index
cluster_by_row = df_sample["Cluster"]
pca_df["Cluster"] = cluster_by_row.loc[pca_df["original_index"]].to_numpy()


In [None]:
# Scatter the first two principal components, coloured by cluster label
cluster_as_text = pca_df["Cluster"].astype(str)  # string labels for the hue
cluster_axes = sns.scatterplot(
    x=pca_df["PC1"],
    y=pca_df["PC2"],
    hue=cluster_as_text,
    palette="viridis",
    alpha=0.7,
)

# Label the axes, title the figure and legend, then render
cluster_axes.set_xlabel("Principal Component 1")
cluster_axes.set_ylabel("Principal Component 2")
cluster_axes.set_title("PCA Clusters Visualization")
cluster_axes.legend(title="Cluster")
plt.show()

# Song Recommendation System

In [24]:
import pandas as pd
import difflib
from sklearn.metrics.pairwise import cosine_similarity

# Combine scaled features with original song info.
# The scaled columns carry the same names as the raw feature columns already
# present in df_sample, so they are suffixed with "_scaled" first. Without the
# suffix, the duplicate-column filter below keeps only the first (raw)
# occurrence and silently discards every scaled column.
scaled_feature_columns = X_scaled_df.add_suffix("_scaled").reset_index(drop=True)
df_scaled = pd.concat([df_sample.reset_index(drop=True), scaled_feature_columns], axis=1)

# Remove duplicate columns that could cause errors (keeps the first occurrence)
df_scaled = df_scaled.loc[:, ~df_scaled.columns.duplicated()]

# Define function to find closest matches for song names **with artist**
def find_closest_match(input_song, cutoff=0.6):
    """Return the best-matching track title in ``df_scaled`` and its artist.

    An exact title match is used when available; otherwise ``difflib`` picks
    the single closest title whose similarity ratio is at least ``cutoff``.
    A message describing the outcome is printed whenever a match is found.

    Args:
        input_song: Track title to look up.
        cutoff: Minimum ``difflib`` similarity ratio (0-1) for a fuzzy match.

    Returns:
        ``(track_name, artist_name)`` taken from the first row carrying the
        matched title, or ``(None, None)`` when nothing matches or when
        ``input_song`` is not a non-blank string.
    """
    # difflib raises TypeError on non-string input, and a blank query cannot
    # identify a song, so both are reported as "no match".
    if not isinstance(input_song, str) or not input_song.strip():
        return None, None

    # Unique titles, original order kept: gives O(1) exact lookup and stops
    # the fuzzy search from scoring the same title repeatedly.
    titles = dict.fromkeys(df_scaled["track_name"].dropna())

    # Exact match check before running the similarity search
    is_exact = input_song in titles
    if is_exact:
        matched_song = input_song
    else:
        close_matches = difflib.get_close_matches(input_song, list(titles), n=1, cutoff=cutoff)
        if not close_matches:
            return None, None  # No match found
        matched_song = close_matches[0]

    # Artist of the first row that carries the matched title
    matched_artist = df_scaled.loc[df_scaled["track_name"] == matched_song, "artist_name"].iloc[0]

    if is_exact:
        print(f"Match found: '{matched_song}' by {matched_artist}")
    else:
        print(f"Please check the spellings as there is no song named '{input_song}'. \nClosest match found: '{matched_song}' by {matched_artist}")
    return matched_song, matched_artist

# Define song recommendation function using cosine similarity
def recommend_song(input_song, cutoff=0.6, similarity_threshold=0.5):
    """Recommend up to ten songs by other artists that resemble ``input_song``.

    The title is resolved with ``find_closest_match``; the matched song's
    scaled feature vector is then compared against every sampled song by
    cosine similarity.

    Args:
        input_song: Track title to base the recommendations on.
        cutoff: Fuzzy-match cutoff forwarded to ``find_closest_match``.
        similarity_threshold: Minimum (normalised) similarity a song needs to
            be recommended.

    Returns:
        A DataFrame with ``track_name`` and ``artist_name`` columns for the
        top matches, or a plain message string when the song cannot be
        resolved, cannot be compared, or nothing passes the filters.
    """
    # Resolve the title (exact match first, fuzzy match otherwise)
    matched_song, _ = find_closest_match(input_song, cutoff)
    if not matched_song:
        return f"Song '{input_song}' not found."

    # Row label of the matched song in df_sample (same labels as X_scaled_df)
    matched_idx = df_sample.index[df_sample["track_name"] == matched_song]
    if len(matched_idx) == 0:
        return f"Matched song '{matched_song}' not found in sample."
    seed_idx = matched_idx[0]

    # Only rows with a complete feature vector can be compared
    valid_rows = X_scaled_df.dropna(subset=numeric_features).index
    if seed_idx not in valid_rows:
        # Without this guard, NaN features would make cosine_similarity raise
        # and the later artist lookup would fail on an empty selection.
        return f"Matched song '{matched_song}' has missing feature values and cannot be compared."
    X_scaled_valid = X_scaled_df.loc[valid_rows, numeric_features]

    # Scaled features of the matched song as a single-row 2-D array
    song_features = X_scaled_valid.loc[seed_idx].to_numpy().reshape(1, -1)

    # Compute similarity of the matched song against every valid row
    similarity_scores = cosine_similarity(song_features, X_scaled_valid)[0]

    # Normalise by the peak score; skipped for a non-positive peak (e.g. an
    # all-zero feature vector) to avoid a 0/0 division producing NaN.
    peak_similarity = similarity_scores.max()
    if peak_similarity > 0:
        similarity_scores = similarity_scores / peak_similarity

    df_valid = df_sample.loc[valid_rows].copy()
    df_valid["Similarity"] = similarity_scores

    # Exclude the matched song's own artist and weak matches
    input_artist = df_sample.loc[seed_idx, "artist_name"]
    is_other_artist = df_valid["artist_name"] != input_artist
    is_similar_enough = df_valid["Similarity"] >= similarity_threshold
    recommendations = df_valid[is_other_artist & is_similar_enough]

    # Sort by similarity first, then danceability & energy. Collapse repeated
    # track/artist pairs (best-ranked kept) so one song cannot fill several
    # of the ten slots.
    recommendations = (
        recommendations
        .sort_values(by=["Similarity", "danceability", "energy"], ascending=[False, False, False])
        .drop_duplicates(subset=["track_name", "artist_name"])
        .head(10)
    )

    if recommendations.empty:
        return f"No recommendations found for '{matched_song}'. Try adjusting filters or checking dataset."

    # Return only track name and artist name
    return recommendations[["track_name", "artist_name"]]

# Test Case
test_song = "Symphony No.4 In E Minor Op.98 : IV. Allegro Energico E Passionato"

# Run recommendation
recommended_songs = recommend_song(test_song)
print("\n*Recommended song*\n")
# recommend_song returns a plain message string when it has nothing to
# recommend; a str has no .head(), so show the message itself in that case.
recommended_songs if isinstance(recommended_songs, str) else recommended_songs.head()



Please check the spellings as there is no song named 'Symphony No.4 In E Minor Op.98 : IV. Allegro Energico E Passionato'. 
Closest match found: 'Symphony No.4 In E Minor Op.98 : I. Allegro Non Troppo' by Leopold Stokowski

*Recommended song*



Unnamed: 0,track_name,artist_name
3765,Le Petit Poucet - Main Theme,Joe Hisaishi
4065,When She Came Back,Max Richter
4626,Harvest Dawn,Jeremy Soule
2002,Rue's Farewell,James Newton Howard
2439,"Symphony No. 9, Op. 125 in D Minor -1895 Gusta...",Ludwig van Beethoven


In [None]:
# Write each result table to CSV (row index omitted) for further analysis:
# the PCA projection, the combined df_scaled table, and the sample with
# its cluster labels.
csv_exports = (
    ("resources/pca_results.csv", pca_df),
    ("resources/scaled_features.csv", df_scaled),
    ("resources/original_with_clusters.csv", df_sample),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)