In [2]:
import pandas as pd
#pandas works with datasets
#numpy works with arrays
import numpy as np

In [3]:
df = pd.read_csv('songdata.csv')
df.head(3) #head returns first n rows

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [4]:
df.shape #rows and columns

(57650, 4)

In [5]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)
#remove link column cuz its not required , and reset indexing

In [6]:
df.shape

(5000, 3)

In [7]:
df['song'][0]

'Our Day Will Come'

In [8]:
#cleaning for content based recommendation system

In [9]:
#here remove special characters and unncessary snippets to improve code quality
#this snippet replaces every thing which is not a word or a space to empty string, and \n also to an empty string , with regex=true stating that it is a regular expression
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [10]:
df['text'][0]

"ooh, ohh yeah yeah, ohh   mmmm ohh, yeah      our day will come and we'll have everything   and ooh, we'll share the joy   falling in love can bring   and no one can tell me   that i'm too young to know   'cause i love you so ('cause i love you so)   and you love me yeah, ohh      our day will come if we just wait awhile   and ooh, no tears for us   make loving with a smile   and our dreams are meant to be   'cause we'll always stay   in love this way   our day, will come      our day will come   will come   our day will come (our day will come)   will come   our day will come   will come (ooh)   our day will come (yeah yeah yeah yeah)   will come      ooh, ohh yeah yeah, ohh   i said one day, our day, will come   one day, our day, will come   ooh, and nothing can stand in our way, oh   ohh, said nothing, nothing can, stand in our way      our day will come (ohh, ooh...)   will come   our day will come (our day will come, yeah)   will come      no one can change your mind   baby you a

In [11]:
#perform word tokenization(take individual words out of every sentence)
# eg love , loving , loved === love ( this is steming)

In [12]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [13]:
import nltk
nltk.download('punkt') #unsupervised training model , which trains unlabeled data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91971\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#cosine similarity finds distance. (x.y/|x|.|y|  )

In [16]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [17]:
matrix.shape

(5000, 17266)

In [18]:
similarity[0]

array([1.        , 0.04412955, 0.00322536, ..., 0.0637196 , 0.05993302,
       0.01614202])

In [19]:
df['song'][0]

'Our Day Will Come'

In [31]:
df[df['song']=='Our Day Will Come'].index[0]

0

In [21]:
list(enumerate(similarity[0]))
#this gives similarity of song[0] with all the other songs , which with itself gives 1

[(0, 1.0000000000000002),
 (1, 0.04412955179263321),
 (2, 0.0032253586733963077),
 (3, 0.12800467430026793),
 (4, 0.0687723785571569),
 (5, 0.041796081165579604),
 (6, 0.05161138615821893),
 (7, 0.0582419347653201),
 (8, 0.03024308545435708),
 (9, 0.023958007116568883),
 (10, 0.017767647623138073),
 (11, 0.05762566394627159),
 (12, 0.018628926657497595),
 (13, 0.07702532368320048),
 (14, 0.027548478031371917),
 (15, 0.0023526142069379137),
 (16, 0.0),
 (17, 0.02379169360972023),
 (18, 0.031101500020908935),
 (19, 0.017394792763944886),
 (20, 0.01278430685784655),
 (21, 0.016727284012442962),
 (22, 0.03100399797377902),
 (23, 0.1875848001577764),
 (24, 0.0327763993466907),
 (25, 0.004555759411074981),
 (26, 0.013762823289541428),
 (27, 0.050898369069297136),
 (28, 0.028682214548374566),
 (29, 0.042658404421243926),
 (30, 0.009313367160348878),
 (31, 0.023822604349244672),
 (32, 0.0825962508804324),
 (33, 0.08424242296625314),
 (34, 0.016087185871659675),
 (35, 0.0),
 (36, 0.023093439835

In [22]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    #example provided above
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:11]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [33]:
recommendation('Like I Never Left')

['All Shook Up',
 'Come Back',
 'Alphabet Street',
 'Miss Independent',
 'Nothing Left To Give',
 'What I Like About You',
 'Fly Away',
 'Get In Line',
 'Die In Your Arms',
 'Marianne']