In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Load the dataset
DATA_PATH = 'spotify_millsongdata.csv'

# Fail early with the fully resolved location: a relative path is looked up in
# the kernel's current working directory, so reporting the absolute path makes
# a misplaced file obvious.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {os.path.abspath(DATA_PATH)!r}. "
        f"Place {DATA_PATH!r} in the working directory or update DATA_PATH."
    )

df = pd.read_csv(DATA_PATH)

# View the first few rows of the data
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'spotify_millsongdata.csv'

In [None]:
# Print the kernel's current working directory; relative file paths used in
# this notebook are resolved against it. (`os` is imported in the top cell.)
print(os.getcwd())


In [None]:
# Count the missing entries in each column
df.isna().sum()


In [None]:
# Keep only the rows whose lyrics column ('text') is present
df = df[df['text'].notna()]


In [None]:
# Restrict the frame to the three columns used below and discard rows that are
# exact duplicates across those columns (chained, so no in-place mutation)
df = df[['artist', 'song', 'text']].drop_duplicates()

# Preview the cleaned frame
df.head()


In [None]:
# Work on a reproducible random sample of at most 5000 songs to reduce memory
# usage: the similarity matrix below has one row AND one column per sampled
# song. min() keeps this cell working when fewer than 5000 rows are available
# (DataFrame.sample raises if asked for more rows than exist).
df_small = df.sample(n=min(5000, len(df)), random_state=42)

# Fit TF-IDF on the sampled lyrics, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_small['text'])

# Pairwise cosine similarity between all sampled songs; row/column i
# corresponds to the i-th row of df_small (positional order, not index label).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [None]:
# Positional lookup table for the songs the similarity matrix was built from.
# cosine_sim has one row/column per row of df_small, in df_small's order, so
# titles must be resolved against df_small (re-indexed 0..n-1) -- not against
# the full df, whose row positions do not correspond to the matrix.
# drop=True keeps this cell idempotent: re-running it adds no extra columns.
songs_indexed = df_small.reset_index(drop=True)


# Function to recommend songs
def recommend_song(title, cosine_sim=cosine_sim, top_n=5):
    """Return the songs whose lyrics are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact song title. It must be one of the songs in ``songs_indexed``;
        if several songs share the title, the first match is used.
    cosine_sim : array-like, shape (n_songs, n_songs)
        Pairwise similarity matrix whose row order matches ``songs_indexed``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``artist`` and ``song`` columns of the ``top_n`` most similar
        songs, most similar first, never including the queried row itself.

    Raises
    ------
    IndexError
        If ``title`` is not among the songs the matrix was built from.
    """
    # Positions (== matrix row numbers) of every song carrying this title
    matches = songs_indexed.index[songs_indexed['song'] == title]
    if len(matches) == 0:
        raise IndexError(
            f"Song {title!r} is not among the {len(songs_indexed)} songs "
            "used to build the similarity matrix."
        )
    idx = matches[0]

    # Similarity of the queried song to every song, ranked highest first.
    # A stable sort keeps tied scores in their original row order.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried row explicitly instead of assuming it sorts first:
    # songs with identical lyrics tie with it at the maximum score.
    ranked = ranked[ranked != idx][:top_n]

    return songs_indexed[['artist', 'song']].iloc[ranked]


In [None]:
# Example query: swap 'Imagine' for any other song title from your dataset
recommend_song(title='Imagine')


In [None]:
import joblib

# Persist the fitted TF-IDF vectorizer and the similarity matrix to disk.
# NOTE: the matrix rows follow df_small's order; the song table itself is not
# written by this cell.
joblib.dump(value=tfidf, filename='tfidf_vectorizer.pkl')
joblib.dump(value=cosine_sim, filename='cosine_similarity.pkl')


In [None]:
# Reload the full dataset from disk (pandas is imported in the top cell)
df = pd.read_csv("spotify_millsongdata.csv")

# Randomly sample 20% of the rows, with a fixed seed for reproducibility.
# Alternative: df.head(10000) would keep only the first 10,000 rows instead;
# the earlier version assigned that first and then immediately overwrote it.
df_reduced = df.sample(frac=0.2, random_state=42)

# Save the reduced dataset
df_reduced.to_csv("spotify_millsongdata_reduced.csv", index=False)

# Confirmation message (Hindi: "The dataset size has been reduced!")
print("Dataset ka size kam ho gaya!")