In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('C:\deskone\Desktop\musicapp\wargame\songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [4]:
df.shape

(57650, 4)

In [5]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [6]:
df.shape

(5000, 3)

In [7]:
df['text'][0]

"This world that I live in is empty and cold  \nThis loneliness cuts me and tears at my soul  \nI'm no child of destiny, no fortune's son  \nI've just chased you so long now, I'm too weak to run  \nSo here I return to a back street of thrills  \nBack to any warm shoulder till she's got her fill  \nAnd then I treat shame like an old friend from home  \nThat I can lean on till the misery is gone  \nA new day is here yet nothing is new  \nYou're still gone and I tremble for you  \nI cry out at bedtime no, please not tonight  \nBut again there's your footsteps and I turn on the light  \nOf course you're not there, no, you never are  \nThen I try to forget that there's always a bar  \nAnd well, I win that battle, yes, sometimes I do  \nBut sleep doesn't come when I tremble for you  \n\n"

In [8]:
 df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

0       this world that i live in is empty and cold   ...
1       from the very start   it came apart   it broke...
2       got to be there, got to be there in the mornin...
3       there was smoke in the air   when he came arou...
4       well, things were spinning round me   and all ...
                              ...                        
4995    i'm uneducated   my clothes outdated   i'm not...
4996    uh uh   i friend of mine was talking   i was e...
4997    there's no method in my madness, no craft, no ...
4998    oh, i'm so alone.   hey daddy, daddy, daddy, d...
4999    you want to know why you feel so hollow?   bec...
Name: text, Length: 5000, dtype: object

In [9]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [10]:
df['text'][0]

"this world that i live in is empty and cold   this loneliness cuts me and tears at my soul   i'm no child of destiny, no fortune's son   i've just chased you so long now, i'm too weak to run   so here i return to a back street of thrills   back to any warm shoulder till she's got her fill   and then i treat shame like an old friend from home   that i can lean on till the misery is gone   a new day is here yet nothing is new   you're still gone and i tremble for you   i cry out at bedtime no, please not tonight   but again there's your footsteps and i turn on the light   of course you're not there, no, you never are   then i try to forget that there's always a bar   and well, i win that battle, yes, sometimes i do   but sleep doesn't come when i tremble for you    "

In [11]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [ps.stem(w) for w in tokens]
    return " ".join(stemming)

In [12]:
tokenization('this is my loving loved')

'thi is my love love'

In [13]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
tfid = TfidfVectorizer(stop_words='english')
matrix = tfid.fit_transform(df['text'])

In [16]:
matrix

<5000x16841 sparse matrix of type '<class 'numpy.float64'>'
	with 277232 stored elements in Compressed Sparse Row format>

In [17]:
matrix.shape

(5000, 16841)

In [18]:
similarity = cosine_similarity(matrix)

In [19]:
similarity

array([[1.        , 0.02501336, 0.07377327, ..., 0.0374211 , 0.05765081,
        0.04467158],
       [0.02501336, 1.        , 0.01472039, ..., 0.01326892, 0.02827992,
        0.03556619],
       [0.07377327, 0.01472039, 1.        , ..., 0.16846549, 0.2477248 ,
        0.06102839],
       ...,
       [0.0374211 , 0.01326892, 0.16846549, ..., 1.        , 0.07801444,
        0.02237985],
       [0.05765081, 0.02827992, 0.2477248 , ..., 0.07801444, 1.        ,
        0.09077228],
       [0.04467158, 0.03556619, 0.06102839, ..., 0.02237985, 0.09077228,
        1.        ]])

In [20]:
df['song'][0]

'I Tremble For You'

In [21]:
df[df['song']=='Desire']

Unnamed: 0,artist,song,text
2425,Zwan,Desire,northern star am i frighten where can i go to ...
2897,Bee Gees,Desire,"we may be big or small or black or white , ete..."


In [22]:
def recommendation(song):
    idx = df[df['song'] == song].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=False,key=lambda x:x[1])
    
    songs = []
    for i in distances[1:21]:
        songs.append(df.iloc[i[0]].song)
        
    return songs

In [24]:
recommendation('Desire')

["Now I'm A Farmer",
 'Judge Not',
 'I Gotta Right To Sing The Blues',
 'Dancing In The Streets',
 'Lady Stardust',
 'NO ONE IN THE WORLD',
 'Under The Moon Of Love',
 'It Only Happens When I Dance With You',
 'Merry Christmas Baby',
 'Brown Girl In The Ring',
 "The Swingin' Shepherd Blues",
 "Tell Me I'm Not Dreamin' (Too Good To Be True)",
 "El Meod Na'ala",
 "You're Losing Me",
 'Proud',
 'Coming Soon',
 "Don't Break This Rhythm",
 'Face Pollution',
 'Broadway Song',
 'Just Be A Woman']

In [25]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))