In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("songdata.csv")

In [5]:
df =df.head(500).drop('link', axis=1).reset_index(drop=True)

In [6]:
df.head(10)

Unnamed: 0,artist,song,text
0,ABBA,She's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,I'm waitin' for you baby \r\nI'm sitting all ...


In [7]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [9]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [12]:
similarity[0]

array([1.00000000e+00, 4.46464231e-02, 1.26675834e-02, 1.66822739e-02,
       1.70624128e-02, 8.54819087e-02, 5.29403834e-02, 1.48107674e-02,
       2.11856314e-01, 9.13381402e-02, 5.97336336e-02, 9.45592537e-02,
       4.84350364e-02, 1.47932401e-01, 1.20017970e-01, 2.90544277e-02,
       2.28297920e-02, 5.70764772e-02, 1.47668967e-02, 0.00000000e+00,
       3.91865345e-02, 3.12969847e-02, 3.61595456e-03, 9.40982726e-03,
       4.63697341e-02, 1.15983839e-01, 6.61183434e-02, 1.70039519e-02,
       1.79008422e-02, 1.09067683e-01, 2.51724939e-02, 7.11475363e-03,
       2.40180135e-02, 4.86517773e-02, 3.77124474e-01, 6.04983334e-02,
       1.37538063e-01, 8.98538693e-02, 2.47099194e-02, 4.88097301e-02,
       1.40951197e-01, 1.36935234e-01, 4.94587385e-02, 5.81889981e-02,
       1.15765660e-01, 4.87611978e-02, 1.28910063e-01, 1.31701453e-02,
       7.34316585e-03, 4.12729719e-02, 4.67155843e-02, 2.55287145e-02,
       3.33869544e-02, 3.34451681e-02, 2.15143866e-02, 2.08529507e-02,
      

In [13]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text
9,ABBA,Crying Over You,i 'm waitin ' for you babi i 'm sit all alon i...


In [14]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    similarity_scores = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        similarity_scores.append(m_id[1])
        
    return songs, similarity_scores

In [17]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))