In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('songdata.csv')

In [3]:
df.head(4)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...


In [4]:
df = df.sample(n=2000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df['text'][0]

"O Christmas Tree,  \nO Christmas Tree,  \nHow steadfast are  \nYour branches!  \nYour boughs are green  \nIn summer's clime  \nAnd through the snows  \nOf wintertime.  \nO Christmas Tree,  \nO Christmas Tree,  \nHow steadfast are  \nYour branches!  \n  \nO Christmas Tree,  \nO Christmas Tree,  \nWhat happiness befalls me  \nWhen oft at  \nJoyous Christmas-time  \nYour form inspires  \nMy song and rhyme.  \nO Christmas Tree,  \nO Christmas Tree,  \nWhat happiness befalls me  \n  \nO Christmas Tree,  \nO Christmas Tree,  \nYour boughs can  \nTeach a lesson  \nThat constant faith  \nAnd hope sublime  \nLend strength and  \nComfort through all time.  \nO Christmas Tree,  \nO Christmas Tree,  \nYour boughs can  \nTeach a lesson\n\n"

In [6]:
df['text'] = df['text'].str.lower().replace('\n','',regex=True)
df['text']

0       o christmas tree,  o christmas tree,  how stea...
1       you day breaks, your mind aches  you find that...
2       in your eyes i find the answers to my question...
3       wall's bricked with books  pages bricked with ...
4       in the heat of the fight i walked away  ignori...
                              ...                        
1995    when i was younger, so much younger than today...
1996    well i clung to you,  like cat hair clings to ...
1997    [chorus]  did you ever think that you would be...
1998    you held my hand and then you slipped away  an...
1999    (dolly parton)  busy signal  that's what i get...
Name: text, Length: 2000, dtype: object

In [7]:
import nltk 
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\satwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
ps = PorterStemmer()

def tokenization(txt):
    tokens = word_tokenize(text=txt, language='english', preserve_line=True)

    for w in tokens:
        stemming = [ps.stem(w) for w in tokens]

    return " ".join(stemming)

In [9]:
df['text'] =  df['text'].apply(lambda x: tokenization(x))
df['text'][0]

"o christma tree , o christma tree , how steadfast are your branch ! your bough are green in summer 's clime and through the snow of wintertime. o christma tree , o christma tree , how steadfast are your branch ! o christma tree , o christma tree , what happi befal me when oft at joyou christmas-tim your form inspir my song and rhyme. o christma tree , o christma tree , what happi befal me o christma tree , o christma tree , your bough can teach a lesson that constant faith and hope sublim lend strength and comfort through all time. o christma tree , o christma tree , your bough can teach a lesson"

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
tfid = TfidfVectorizer(stop_words='english')
matrix = tfid.fit_transform(df['text'])

In [12]:
print(matrix)
matrix.shape

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 106844 stored elements and shape (2000, 11126)>
  Coords	Values
  (0, 1736)	0.6529965266567099
  (0, 10054)	0.5622993792259545
  (0, 9217)	0.16362125149877965
  (0, 1203)	0.15734746528854876
  (0, 1162)	0.21771500083412204
  (0, 4113)	0.053034273192133954
  (0, 9411)	0.05157824327247138
  (0, 1834)	0.086231828263504
  (0, 8904)	0.05228197305391171
  (0, 10857)	0.086231828263504
  (0, 4277)	0.08781430308580364
  (0, 853)	0.172463656527008
  (0, 6708)	0.08181062574938983
  (0, 5068)	0.08181062574938983
  (0, 1737)	0.07867373264427438
  (0, 9860)	0.07425253013016021
  (0, 3660)	0.060085113301657035
  (0, 4848)	0.06582164665982816
  (0, 8967)	0.037105371538672806
  (0, 7977)	0.057455475706248114
  (0, 9636)	0.11920082015869073
  (0, 5475)	0.13528641784177328
  (0, 2053)	0.07111563702504477
  (0, 3345)	0.048105815168313255
  (0, 4557)	0.0375354452525858
  :	:
  (1999, 2618)	0.1163683715854317
  (1999, 4651)	0.05062000278957626
  

(2000, 11126)

In [13]:
similarity = cosine_similarity(matrix)
similarity[4]

array([0.00123845, 0.15545086, 0.0858009 , ..., 0.05662949, 0.07414431,
       0.08656492])

In [51]:
distance = sorted(list(enumerate(similarity[5])), reverse=True, key=lambda x:x[1])
print(distance)

[(5, np.float64(1.0)), (1327, np.float64(0.3069831611273429)), (600, np.float64(0.27094989006399295)), (858, np.float64(0.2645382573846127)), (838, np.float64(0.24895208865579754)), (1213, np.float64(0.2351133563102676)), (326, np.float64(0.23133549599360423)), (574, np.float64(0.22336256105466748)), (846, np.float64(0.21055188483385562)), (645, np.float64(0.20936507504473942)), (337, np.float64(0.20923679911522922)), (1459, np.float64(0.1968322668395518)), (1067, np.float64(0.19624098661946698)), (1559, np.float64(0.19458327553354982)), (1958, np.float64(0.19236400307996976)), (224, np.float64(0.18385282336602365)), (608, np.float64(0.18343748613428906)), (1728, np.float64(0.18323460904993796)), (454, np.float64(0.18040797023025915)), (1142, np.float64(0.17993565374974735)), (1431, np.float64(0.17899278617707712)), (1273, np.float64(0.17719491916143276)), (1416, np.float64(0.17650255370064719)), (465, np.float64(0.17597078595026083)), (1835, np.float64(0.1739876248871271)), (1279, np.

In [44]:
def Recommendation(song):
    idx = df[df['song']==song].index[0]
    distance = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x:x[1])
    songs = []
    for i in distance[1:21]:
        songs.append(df.iloc[i[0]].song)

    return songs


In [50]:
df.head(8)

Unnamed: 0,artist,song,text
0,Christmas Songs,O Christmas Tree,"o christma tree , o christma tree , how steadf..."
1,Paul McCartney,For No One,"you day break , your mind ach you find that al..."
2,Leann Rimes,Feels Like Home,in your eye i find the answer to my question i...
3,Widespread Panic,Mercy,wall 's brick with book page brick with word e...
4,Taylor Swift,The Other Side Of The Door,in the heat of the fight i walk away ignor wor...
5,Widespread Panic,Sometimes,like a locomot wheel feel real as steel heart ...
6,Status Quo,Cross That Bridge,i wa n't born in a lap delux i did n't have no...
7,Adele,Many Shades Of Black,go ahead go ahead and smash it on the floor ta...


In [49]:
Recommendation('Sometimes')

['If I Could Only Be Like You',
 "That's How Love Moves",
 "Ramblin' In My Shoes",
 'A New Machine',
 'Maybe',
 'Mama',
 'Hard 2 Face Reality',
 'Key To The Highway',
 'We Are',
 'A Message To The Wind',
 'Heart To Heart',
 'Out Of Sight',
 'Key To The Highway',
 "Rockin' Down The Highway",
 'I Know You Better Than You Think',
 'Taken By Sleep',
 "Foxy's Folk Faced",
 'How',
 'Lost Highway',
 "Let's Talk About Love"]