<a href="https://colab.research.google.com/github/Shresth-Agarwal/FMML_Project_And_Labs/blob/main/SongRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Project Title:** Song Recommendation System

**Goal:** To build a system that suggests songs to users based on their taste, potentially using natural language processing (NLP) to understand text-based information related to songs or user preferences.

**Approach:** Use machine learning, possibly incorporating NLP techniques, to analyze song data and user behavior to recommend personalized song suggestions.

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from sklearn import metrics
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Fetch the NLTK data packages this notebook relies on (no-ops when already present).
# 'punkt' is the tokenizer model; presumably intended for word_tokenize, which is
# imported above but not called in the cells shown here -- TODO confirm it is needed.
nltk.download('punkt')
# WordNet is the lexical database behind WordNetLemmatizer.
nltk.download('wordnet')
# POS-tagger models for pos_tag. The '_eng' variant is presumably the name newer
# NLTK releases look up -- both are fetched so either NLTK version works; verify.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

## Loading Datasets

In [4]:
# Colab-only interactive upload; in the recorded run this cell received songdata.csv.
from google.colab import files
uploaded = files.upload()

Saving songdata.csv to songdata.csv


In [44]:
# Colab-only interactive upload; in the recorded run this cell received popular_songs.csv.
# Note: this rebinds `uploaded`, so the result of the earlier upload cell is replaced.
from google.colab import files
uploaded = files.upload()

Saving popular_songs.csv to popular_songs (1).csv


## Data Exploration and Preprocessing

In [5]:
# Lyrics dataset: one row per song with artist, song, link and text (lyrics) columns.
df = pd.read_csv('songdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [45]:
# Popular-songs dataset with per-track stream counts and audio features.
# The file is decoded as latin-1 rather than the pandas default.
# NOTE(review): the recorded upload output says the file was saved as
# 'popular_songs (1).csv', so this presumably reads an earlier copy named
# 'popular_songs.csv' -- confirm the intended file is being loaded.
df2 = pd.read_csv('popular_songs.csv', encoding = 'latin-1')
df2.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [6]:
# Sanity check on the lyrics table: dimensions first, then missing values per column.
print(df.shape)
print(df.isna().sum())

(57650, 4)
artist    0
song      0
link      0
text      0
dtype: int64


In [46]:
# (rows, columns) of the popular-songs table before any columns are dropped.
df2.shape

(953, 24)

In [7]:
# The 'link' column only holds the relative URL of the lyrics page and is not used
# for recommendations. errors='ignore' keeps this cell re-runnable: without it a
# second execution raises KeyError because 'link' is already gone (the df2 drop
# further down uses the same option).
df = df.drop(columns='link', errors='ignore').reset_index(drop=True)
print(df.shape)
df.head()

(57650, 3)


Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [47]:
# Columns removed from the popular-songs table; the remaining columns are the track
# name, artist name(s), streams, bpm and the percentage-valued audio features.
columns_to_drop = [
    # release metadata
    'artist_count',
    'released_year',
    'released_month',
    'released_day',
    # playlist / chart placement on the streaming platforms
    'in_spotify_playlists',
    'in_spotify_charts',
    'in_apple_playlists',
    'in_apple_charts',
    'in_deezer_playlists',
    'in_deezer_charts',
    'in_shazam_charts',
    # musical key information
    'mode',
    'key',
]
# errors='ignore' skips names that are absent, so re-running the cell is harmless.
df2 = df2.drop(columns_to_drop, axis=1, errors='ignore')
print("df2 shape after dropping columns:", df2.shape)
df2.head()

df2 shape after dropping columns: (953, 11)


Unnamed: 0,track_name,artist(s)_name,streams,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",141381703,125,80,89,83,31,0,8,4
1,LALA,Myke Towers,133716286,92,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,140003974,138,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,800840817,170,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,303236322,144,65,23,80,14,63,11,6


## Text Cleaning Function

In [11]:
def cleanup(text, lemmatize=True):
    """Normalize raw text into a space-separated string of lowercase word tokens.

    The text is stripped of HTML markup, every character outside ``A-Z``/``a-z``
    is replaced by a space, the result is lowercased and split on whitespace.
    When ``lemmatize`` is true each token is POS-tagged and reduced to its
    WordNet lemma.

    Parameters
    ----------
    text : str or number
        Raw text. Numeric scalars (Python ``int``/``float`` and any NumPy
        number) are converted with ``str`` first so that numeric cells coming
        out of a DataFrame do not crash the HTML parser. A NaN therefore ends
        up as the literal token ``nan``.
    lemmatize : bool, default True
        If false, the cleaned tokens are returned without tagging/lemmatizing.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string if none).
    """
    # Accept every numeric scalar, not only float / np.int64.
    if isinstance(text, (int, float, np.number)):
        text = str(text)

    # Drop HTML tags, keeping only the text content.
    text = BeautifulSoup(text, "lxml").get_text()

    # Keep letters only (digits and punctuation become separators), then lowercase.
    text = re.sub(r"[^A-Za-z]", " ", text)
    tokens = text.lower().split()

    if not lemmatize:
        return " ".join(tokens)

    # First letter of the Penn Treebank tag -> WordNet POS constant.
    # Any other tag is lemmatized as a noun.
    pos_by_tag_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, pos_by_tag_initial.get(tag[:1], wordnet.NOUN))
        for word, tag in pos_tag(tokens)
    ]
    return " ".join(lemmas)

# Example usage: HTML tags, digits and punctuation are removed, and words are
# lowercased and lemmatized (e.g. "is" -> "be", "tags" -> "tag").
sample_text = "<p>This is an example sentence with HTML tags, numbers (123) and special characters!</p>"
cleaned_text = cleanup(sample_text)
print(cleaned_text)

this be an example sentence with html tag number and special character


In [12]:
# Clean and lemmatize every lyric; the result is stored next to the raw text so
# the original 'text' column stays available.
print("Starting text cleaning and lemmatization...")
df['cleaned_text'] = df['text'].apply(cleanup, lemmatize=True)
print("Text cleaning and lemmatization complete.")


Starting text cleaning and lemmatization...
Text cleaning and lemmatization complete.


In [81]:
def remove_special_chars_from_columns(df, columns):
    """Strip everything except letters, digits and whitespace from the given columns.

    Each listed column that exists in ``df`` is cast to ``str`` and cleaned;
    names that are not columns of ``df`` are skipped. The frame is modified in
    place and also returned.
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s]+')

    def strip_disallowed(value):
        return disallowed.sub('', value)

    for name in columns:
        if name not in df.columns:
            continue
        df[name] = df[name].astype(str).apply(strip_disallowed)
    return df

# Text columns of the popular-songs table that hold titles / artist names.
columns_to_clean_df2 = ['track_name', 'artist(s)_name']

# Punctuation is stripped from the titles, so later lookups by track name must
# use the cleaned spelling (e.g. "Seven feat Latto Explicit Ver").
df2 = remove_special_chars_from_columns(df2, columns_to_clean_df2)

print("df2 after cleaning special characters:")
print(df2.head())

df2 after cleaning special characters:
                      track_name   artist(s)_name      streams  bpm  \
0  Seven feat Latto Explicit Ver  Latto Jung Kook  141381703.0  125   
1                           LALA      Myke Towers  133716286.0   92   
2                        vampire   Olivia Rodrigo  140003974.0  138   
3                   Cruel Summer     Taylor Swift  800840817.0  170   
4                 WHERE SHE GOES        Bad Bunny  303236322.0  144   

   danceability_%  valence_%  energy_%  acousticness_%  instrumentalness_%  \
0              80         89        83              31                   0   
1              71         61        74               7                   0   
2              51         32        53              17                   0   
3              55         58        72              11                   0   
4              65         23        80              14                  63   

   liveness_%  speechiness_%  
0           8              4  
1  

## TF-IDF Vectorization

In [13]:
# Turn the cleaned lyrics into TF-IDF vectors, ignoring scikit-learn's built-in
# English stop words. Row i of tfidf_matrix corresponds to row i of df.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
print("TF-IDF matrix created with shape:", tfidf_matrix.shape)
# Show the first 200 characters of the first cleaned lyric as a spot check.
print("Sample cleaned text after re-processing:", df['cleaned_text'].iloc[0][:200])
print("Vocabulary size:", len(tfidf_vectorizer.vocabulary_))

TF-IDF matrix created with shape: (57650, 71338)
Sample cleaned text after re-processing: look at her face it s a wonderful face and it mean something special to me look at the way that she smile when she see me how lucky can one fellow be she s just my kind of girl she make me feel fine w
Vocabulary size: 71338


## Nearest Neighbors Model (Text-based)

In [14]:
# Initialize NearestNeighbors model
# One extra neighbor is requested because the recommendation function discards
# the first hit, which is expected to be the query song itself.
n_neighbors = 5
nn_model = NearestNeighbors(n_neighbors=n_neighbors + 1, algorithm='brute', metric='cosine')

# Fit the model to the TF-IDF matrix.
# The model learns how to find the nearest neighbors within this matrix.
# Distances are cosine distances, so similarity is later computed as 1 - distance.
print("Fitting NearestNeighbors model...")
nn_model.fit(tfidf_matrix)
print("NearestNeighbors model fitted.")

Fitting NearestNeighbors model...
NearestNeighbors model fitted.


## Scaling Numerical Features and Nearest Neighbors Model (Feature-based)

In [82]:
# Numeric columns that define similarity between tracks in the feature-based model.
features_for_similarity_df2 = [
    'streams', 'danceability_%', 'energy_%', 'acousticness_%',
    'speechiness_%', 'instrumentalness_%', 'liveness_%', 'valence_%'
]

# Convert percentage columns from object (likely string) to numeric.
# Only columns stored as strings are touched: the '%' sign is removed, the value is
# parsed (unparseable entries become NaN) and divided by 100. Columns that are
# already numeric are left as they are.
for col in ['danceability_%', 'energy_%', 'acousticness_%', 'speechiness_%',
            'instrumentalness_%', 'liveness_%', 'valence_%']:
    if col in df2.columns and df2[col].dtype == 'object':
        df2[col] = df2[col].astype(str).str.replace('%', '', regex=False)
        df2[col] = pd.to_numeric(df2[col], errors='coerce')
        df2[col] = df2[col] / 100.0

# Convert 'streams' column to numeric, handling potential non-numeric values
if 'streams' in df2.columns:
    df2['streams'] = pd.to_numeric(df2['streams'], errors='coerce')

# Drop rows with NaNs created by coercing errors, then renumber the index.
# dropna keeps the original labels, which leaves gaps; the feature matrix below is
# purely positional, so without reset_index a label looked up in df2 would point
# at the wrong (or a non-existent) row of the matrix.
df2 = df2.dropna(subset=features_for_similarity_df2).reset_index(drop=True)
print("df2 shape after handling NaNs:", df2.shape)

# Select the feature data; row i corresponds to df2.iloc[i].
features_matrix_df2 = df2[features_for_similarity_df2].values

df2 shape after handling NaNs: (952, 11)


In [83]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column so that large-valued columns such as 'streams'
# do not dominate the distance computation.
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(features_matrix_df2)
print("Scaled numerical features shape:", scaled_numerical_features.shape)

# Train a NearestNeighbors model on the features
# One extra neighbor is requested because the recommendation function discards
# the first hit, which is expected to be the query track itself.
n_neighbors_df2 = 5
nn_model_df2 = NearestNeighbors(n_neighbors=n_neighbors_df2 + 1, algorithm='brute', metric='cosine')
nn_model_df2.fit(scaled_numerical_features)

Scaled numerical features shape: (952, 8)


## Recommendation Functions

In [84]:
# Function to get song recommendations with similarity scores for evaluation (Text-based)
def get_recommendations_for_evaluation(song_title, df, nn_model, tfidf_matrix):
    """Return lyric-based recommendations for ``song_title``.

    Parameters
    ----------
    song_title : str
        Title to look up in ``df['song']`` (exact match; first match is used).
    df : pandas.DataFrame
        Song table with ``'song'`` and ``'artist'`` columns whose row order
        matches the rows of ``tfidf_matrix``.
    nn_model : object
        Fitted neighbor model exposing ``kneighbors(X) -> (distances, indices)``
        with cosine distances.
    tfidf_matrix : matrix
        Row-indexable matrix of lyric vectors, one row per row of ``df``.

    Returns
    -------
    tuple
        ``(recommendations, song_title)`` where ``recommendations`` is a list of
        dicts with keys ``'title'``, ``'artist'`` and ``'similarity'``
        (``1 - cosine distance``), ordered as returned by the model.
        ``(None, None)`` if the title is not present.
    """
    # Look the song up by position, not by index label: both the matrix rows and
    # df.iloc below are positional, so a non-default index must not leak in here.
    matches = np.flatnonzero((df['song'] == song_title).to_numpy())
    if matches.size == 0:
        return None, None
    song_pos = int(matches[0])

    distances, indices = nn_model.kneighbors(tfidf_matrix[song_pos])
    distances = np.asarray(distances).ravel()
    indices = np.asarray(indices).ravel()

    # Remove the query song itself. It is normally the first hit, but with
    # duplicate lyrics (distance ties) it can appear later in the list, so it is
    # filtered by position rather than by blindly dropping element 0.
    keep = indices != song_pos
    if keep.size and keep.all():
        # Query not among the hits at all: drop the farthest one instead so the
        # number of recommendations stays the same.
        keep[-1] = False

    recommended_songs_info = []
    for row_pos, distance in zip(indices[keep], distances[keep]):
        row = df.iloc[row_pos]
        recommended_songs_info.append({
            'title': row['song'],
            'artist': row['artist'],
            'similarity': 1 - distance
        })
    return recommended_songs_info, song_title

In [85]:
# Function to get song recommendations with similarity scores for evaluation (Feature-based)
def get_feature_based_recommendations_for_evaluation(song_title, df2, nn_model_df2, scaled_numerical_features):
    """Return audio-feature-based recommendations for ``song_title``.

    Parameters
    ----------
    song_title : str
        Title to look up in ``df2['track_name']`` (exact match; first match is used).
    df2 : pandas.DataFrame
        Track table with ``'track_name'`` and ``'artist(s)_name'`` columns whose
        row order matches the rows of ``scaled_numerical_features``.
    nn_model_df2 : object
        Fitted neighbor model exposing ``kneighbors(X) -> (distances, indices)``
        with cosine distances.
    scaled_numerical_features : numpy.ndarray
        2-D array of scaled features, one row per row of ``df2``.

    Returns
    -------
    tuple
        ``(recommendations, song_title)`` where ``recommendations`` is a list of
        dicts with keys ``'title'``, ``'artist'`` and ``'similarity'``
        (``1 - cosine distance``), ordered as returned by the model.
        ``(None, None)`` (after printing a notice) if the title is not present.
    """
    # Look the track up by position, not by index label. df2 may carry gapped
    # labels (rows removed with dropna), while the feature array and df2.iloc are
    # positional; using the label would select the wrong row or go out of bounds.
    matches = np.flatnonzero((df2['track_name'] == song_title).to_numpy())
    if matches.size == 0:
        print(f"'{song_title}' not found in the feature-based dataset.")
        return None, None
    song_pos = int(matches[0])

    # kneighbors expects a 2-D array, so the single row is reshaped to (1, n_features).
    song_features = scaled_numerical_features[song_pos].reshape(1, -1)

    distances, indices = nn_model_df2.kneighbors(song_features)
    distances = np.asarray(distances).ravel()
    indices = np.asarray(indices).ravel()

    # Remove the query track itself. It is normally the first hit, but with
    # identical feature vectors (distance ties) it can appear later in the list.
    keep = indices != song_pos
    if keep.size and keep.all():
        # Query not among the hits at all: drop the farthest one instead so the
        # number of recommendations stays the same.
        keep[-1] = False

    recommended_songs_info = []
    for row_pos, distance in zip(indices[keep], distances[keep]):
        row = df2.iloc[row_pos]
        recommended_songs_info.append({
            'title': row['track_name'],
            'artist': row['artist(s)_name'],
            'similarity': 1 - distance
        })

    return recommended_songs_info, song_title

In [86]:
def recommender(song_title):
    """Print recommendations for ``song_title``.

    The lyric (text-based) model is consulted first. If it yields nothing, the
    audio-feature model is tried; if that also yields nothing, a "not found"
    message is printed. Uses the notebook-level datasets and fitted models and
    returns ``None``.
    """
    def print_recommendation_lines(recs):
        for rec in recs:
            print(f"- '{rec['title']}' by {rec['artist']} (Similarity: {rec['similarity']:.4f})")

    # Text-based attempt.
    recommendations, query_song = get_recommendations_for_evaluation(
        song_title, df, nn_model, tfidf_matrix
    )
    if recommendations:
        print(f"\nRecommendations for '{query_song}' (Text-based):")
        print_recommendation_lines(recommendations)
        return

    # Fallback: feature-based attempt.
    print(f"\n'{song_title}' not found in text-based dataset. Trying feature-based recommendations.")
    feature_based_recommendations, query_song_feature = get_feature_based_recommendations_for_evaluation(
        song_title, df2, nn_model_df2, scaled_numerical_features
    )
    if not feature_based_recommendations:
        print(f"\n'{song_title}' not found in either dataset.")
        return

    print(f"\nRecommendations for '{query_song_feature}' (Feature-based):")
    print_recommendation_lines(feature_based_recommendations)

### Testing

In [87]:
# "Bang" exists in the lyrics dataset, so it exercises the text-based path.
recommender("Bang")
# "LET GO" is not in the lyrics dataset, so it exercises the feature-based fallback.
recommender("LET GO")


Recommendations for 'Bang' (Text-based):
- 'Happy Song' by Otis Redding (Similarity: 0.7021)
- 'Sea Of Dreams' by Electric Light Orchestra (Similarity: 0.6987)
- 'Let's Go Steady Again' by Neil Sedaka (Similarity: 0.5969)
- 'The Prime Of Your Love' by Billy Joel (Similarity: 0.5856)
- 'Bang-A-Boomerang' by ABBA (Similarity: 0.5491)

'LET GO' not found in text-based dataset. Trying feature-based recommendations.

Recommendations for 'LET GO' (Feature-based):
- 'Flowers' by Lauren Spencer Smith (Similarity: 0.9491)
- 'I Tried to Tell Yall' by Ugly Dray Tesla Jnr (Similarity: 0.9314)
- 'Andrea' by Buscabulla Bad Bunny (Similarity: 0.9271)
- 'Die Young feat 347aidan' by Sleepy hallow 347aidan (Similarity: 0.9252)
- 'MIENTRAS ME CURO DEL CORA' by Karol G (Similarity: 0.9170)
