# Information Retrieval - Spotify Song Recommender


### IMPORTS

In [3]:
# pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting joblib
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp39-cp39-macosx_11_0_arm64.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.2/287.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: regex, joblib, nltk
[0mSuccessfully installed joblib-1.2.0 nltk-3.8.1 regex-2022.10.31
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pa
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
import operator
import numpy as np
from nltk.stem import PorterStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### PREPROCESSING

In [2]:
query = ''

In [3]:
# Get the Spotify song text list as Pandas DataFrame.
songs = pa.read_csv('spotify_millsongdata.csv')

# Add a new column for the filtered song texts.
songs.insert(loc=len(songs.columns), column='corpus', value='')

# Show the DataFrame
songs.head()

Unnamed: 0,artist,song,link,text,corpus
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,


In [4]:
#Read Corpus (only for Output)
def readCorpus(filename):
    """Build the list of (label, lyrics) pairs used when printing results.

    Despite its name, ``filename`` is the songs table itself: it is indexed as
    ``filename['artist'][x]``, ``filename['song'][x]`` and
    ``filename['text'][x]`` for every ``x`` in ``range(len(filename['text']))``.

    Returns:
        list: One ``(label, text)`` tuple per row, where ``label`` is
        ``"<row number> <artist>, <song>"``.
    """
    artists, titles, texts = filename['artist'], filename['song'], filename['text']

    entries = {}
    for row in range(len(texts)):
        label = str(row) + ' ' + artists[row] + ', ' + titles[row]
        entries[label] = texts[row]

    return list(entries.items())
    
corpus = readCorpus(songs)

In [5]:
# Predefinition 
# Read a file as stopword list

def ReadStopwords(filename):
    """Load a stopword list from a text file with one stopword per line.

    Every line is stripped of surrounding whitespace before it is stored, and
    the number of distinct entries is printed.

    Args:
        filename: Path of the stopword file to read.

    Returns:
        set: The stripped lines of the file.
    """
    with open(filename) as handle:
        stopwords = {line.strip() for line in handle.readlines()}

    print("ReadStopwords: Return {} stop words.".format(len(stopwords)))
    return stopwords

In [6]:
# Predefinition 
# Filter with a stopword list

def FilterStopwords(input_text, stopwords=None, no_stopwords=None, stopword_feedback=False):
    """Normalise a text and remove stopwords from it.

    The text is cleaned (every run of non-word characters becomes a single
    space), lower-cased and split into terms. Terms contained in ``stopwords``
    are dropped; every kept term is appended to the result followed by one
    space.

    When ``stopword_feedback`` is true, each kept term shorter than four
    characters that is not yet in ``no_stopwords`` is offered to the user via
    ``input()``:

    * ``y`` - the term is added to ``stopwords`` and dropped,
    * ``q`` - feedback is switched off for the rest of this call (and, through
      the returned flag, for subsequent calls); the term is kept,
    * anything else - the term is remembered in ``no_stopwords`` and kept.

    Args:
        input_text (str): Raw text to filter.
        stopwords (set, optional): Terms to remove. Mutated in place when the
            user accepts a suggestion. Defaults to a fresh empty set.
        no_stopwords (set, optional): Short terms the user already declined.
            Mutated in place. Defaults to a fresh empty set.
        stopword_feedback (bool): Whether to ask the user about short terms.

    Returns:
        tuple: ``(output_text, stopword_feedback)`` - the filtered text and the
        possibly updated feedback flag.
    """
    # Create fresh sets per call. The previous ``set()`` defaults were created
    # once at definition time and mutated below, so accepted stopwords leaked
    # into every later call that relied on the defaults.
    if stopwords is None:
        stopwords = set()
    if no_stopwords is None:
        no_stopwords = set()

    # Cleans the text from control & punctuation characters.
    input_text = re.sub(r'\W+', ' ', input_text)

    # Transform upper case to lower case letters.
    input_text = input_text.lower()

    kept_terms = []

    # split() without an argument drops the empty strings that split(' ')
    # produced for leading/trailing separators. Those empty "terms" were
    # emitted as extra spaces and, in feedback mode, suggested as stopwords.
    for term in input_text.split():
        if term in stopwords:
            continue

        # Extended stopword check: ask the user about short, undecided terms.
        if stopword_feedback and len(term) < 4 and term not in no_stopwords:
            print("FilterStopwords: Recommend to add <{}> to the stopword list.".format(term))
            answer = input(">> Would you like to add (y/n/q)?  >>")

            if answer == 'y':
                stopwords.add(term)
                continue
            if answer == 'q':
                # Abort feedback, but keep the current term.
                stopword_feedback = False
            else:
                no_stopwords.add(term)

        kept_terms.append(term)

    # Same layout as before: every kept term is followed by a single space.
    output_text = ''.join(term + ' ' for term in kept_terms)

    return output_text, stopword_feedback

In [7]:
# Filter the song texts with the stopword list.
# Optionally set stopword_feedback = True so that FilterStopwords() asks
# interactively whether short terms should be added to the stopword list.
stopword_feedback = False

no_stopwords    = set()

# First get the stopword list
stopwords = ReadStopwords('stopwords.txt')

# Filter every song text. The results are collected in a list and assigned to
# the column in one step: the former `songs.loc[i]['corpus'] = tmp` was chained
# assignment, which may write to a temporary copy of the row instead of `songs`.
filtered_texts = []
for text in songs['text']:
    tmp, stopword_feedback = FilterStopwords(text, stopwords, no_stopwords, stopword_feedback)
    filtered_texts.append(tmp)
songs['corpus'] = filtered_texts

# Show the DataFrame
songs.head()

ReadStopwords: Return 134 stop words.


Unnamed: 0,artist,song,link,text,corpus
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",look face wonderful face means something speci...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",take easy please touch gently like summer even...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,never know put lousy rotten show boy tough pac...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,making somebody happy question give take learn...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,making somebody happy question give take learn...


In [8]:
# Tokenizing

def tokenize(column):
    """Split one text into word tokens and keep only the alphabetic ones.

    Args:
        column (str): The text to tokenize; the notebook passes one row's
            ``corpus`` value at a time.

    Returns:
        list: The tokens produced by ``nltk.word_tokenize`` for which
        ``isalpha()`` is true, in their original order.
    """
    alphabetic = []
    for token in nltk.word_tokenize(column):
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

# Tokenize every filtered song text and preview the result.
songs['tokenized'] = songs['corpus'].apply(tokenize)
songs[['corpus', 'tokenized']].head()

Unnamed: 0,corpus,tokenized
0,look face wonderful face means something speci...,"[look, face, wonderful, face, means, something..."
1,take easy please touch gently like summer even...,"[take, easy, please, touch, gently, like, summ..."
2,never know put lousy rotten show boy tough pac...,"[never, know, put, lousy, rotten, show, boy, t..."
3,making somebody happy question give take learn...,"[making, somebody, happy, question, give, take..."
4,making somebody happy question give take learn...,"[making, somebody, happy, question, give, take..."


In [9]:
# Stemming

def stemming(tokenized_column):
    """Apply Porter stemming to every token of one tokenized text.

    Args:
        tokenized_column (list): Tokens of a single song text; the notebook
            passes one row's ``tokenized`` value at a time.

    Returns:
        list: The stemmed tokens, in the same order as the input.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokenized_column))

# Stem every token list and preview the three processing stages side by side.
songs['stemmed'] = songs['tokenized'].apply(stemming)
songs[['corpus','tokenized', 'stemmed']].head()

Unnamed: 0,corpus,tokenized,stemmed
0,look face wonderful face means something speci...,"[look, face, wonderful, face, means, something...","[look, face, wonder, face, mean, someth, speci..."
1,take easy please touch gently like summer even...,"[take, easy, please, touch, gently, like, summ...","[take, easi, pleas, touch, gentli, like, summe..."
2,never know put lousy rotten show boy tough pac...,"[never, know, put, lousy, rotten, show, boy, t...","[never, know, put, lousi, rotten, show, boy, t..."
3,making somebody happy question give take learn...,"[making, somebody, happy, question, give, take...","[make, somebodi, happi, question, give, take, ..."
4,making somebody happy question give take learn...,"[making, somebody, happy, question, give, take...","[make, somebodi, happi, question, give, take, ..."


### SEARCH TERM INPUT

In [13]:
#stemming the query

def stemthequery(tmp):
    """Turn a raw search string into a list of Porter-stemmed query terms.

    The string gets the same cleaning as the song texts: every run of non-word
    characters becomes a single space and the text is lower-cased. It is then
    split into terms and each term is stemmed.

    Args:
        tmp (str): The search string as typed by the user.

    Returns:
        list: The stemmed query terms; an empty list when the input contains
        no word characters.
    """
    stemmer = PorterStemmer()

    # Filter like the song texts
    tmp = re.sub(r'\W+', ' ', tmp).lower()

    # split() without an argument discards the empty strings that split(' ')
    # produced for leading/trailing separators (e.g. "love!" -> ['love', ''])
    # and for an empty input, so no empty term ends up in the query.
    return [stemmer.stem(term) for term in tmp.split()]

### RANKING MECHANISM

In [14]:
# Defines
SCORE_MAX = 10

In [15]:
# Transform the DataFrame Column "corpus" to a list of lists
#index = list()

#for i in range(songs.shape[0]):
#    tmp = songs.loc[i]['corpus']
#    terms = tmp.split(' ')
#    tmp_lst = list()
#    for term in terms:
#        tmp_lst.append(term)
#    index.append(tmp_lst.copy())

#print("Index Size = {}".format(len(index)))

In [16]:
# Set stemmed song texts as index & make a list of lists
index=songs['stemmed'].tolist()
print("Index Size = {}".format(len(index)))

Index Size = 57650


#### R1 : Terms

In [35]:
# Calculation and return of the ranking score no. 1
# The more search terms appear in a song text, the more relevant the song is.
def calculateScore1(index, query):
    """Rank documents by how many of the query terms they contain.

    Each query term that occurs at least once in a document adds one point to
    that document (a term repeated in the query counts once per repetition).
    The raw scores are then scaled so that the best document gets 50.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        list: One score per document, in the order of ``index``. The scores
        are left as unscaled zeros when no document matches; the list is
        empty when ``index`` is empty.
    """
    scores = [sum(1 for t in query if t in ind) for ind in index]

    # Nothing to rank - max() below would raise on an empty list.
    if not scores:
        return scores

    #Normalise
    SCORE_MAX = 50
    factor = max(scores) / SCORE_MAX
    if factor > 0:
        scores = [score / factor for score in scores]

    return scores

r1_score = calculateScore1(index, query)

# Summary of the R1 ranking for the current query.
r1_best = max(r1_score)
r1_mean = sum(r1_score)/len(r1_score)
print("Maximum Score     = {}".format(r1_best))
print("Average Score     = {:.2}".format(r1_mean))

# Show the first top-ranked song and its filtered text.
max_r1_index = r1_score.index(r1_best)
print(songs.iloc[max_r1_index])
print(songs.loc[max_r1_index,'corpus'])

Maximum Score     = 50.0
Average Score     = 1.2
artist                                                    ABBA
song                                            Happy New Year
link                      /a/abba/happy+new+year_20002925.html
text         No more champagne  \r\nAnd the fireworks are t...
corpus       champagne fireworks feeling lost feeling blue ...
tokenized    [champagne, fireworks, feeling, lost, feeling,...
stemmed      [champagn, firework, feel, lost, feel, blue, e...
Name: 27, dtype: object
champagne fireworks feeling lost feeling blue end party morning seems grey unlike yesterday time us say happy new year happy new year may vision world every neighbor friend happy new year happy new year may hopes try might well lay die sometimes see brave new world arrives see thrives ashes lives yes man fool thinks okay dragging feet clay never knowing astray keeps going anyway happy new year happy new year may vision world every neighbor friend happy new year happy new year may h

#### R2 : Frequency

In [41]:
# Calculation and return of the ranking score no. 2
# The more often the search terms appear in a document, the more relevant the document is.

def calculateScore2(index, query):
    """Rank documents by how often the query terms occur in them.

    Every occurrence of every query term in a document adds one point to that
    document. The raw scores are then scaled so that the best document gets 30.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        list: One score per document, in the order of ``index``. The scores
        are left as unscaled zeros when no document matches; the list is
        empty when ``index`` is empty.
    """
    scores = [sum(1 for t in query for dt in ind if t == dt) for ind in index]

    # Nothing to rank - max() below would raise on an empty list.
    if not scores:
        return scores

    #Normalise
    SCORE_MAX = 30
    factor = max(scores) / SCORE_MAX
    if factor > 0:
        scores = [score / factor for score in scores]
    return scores

r2_score = calculateScore2(index, query)

# Summary of the R2 ranking for the current query.
r2_best = max(r2_score)
r2_mean = sum(r2_score)/len(r2_score)
print("Maximum Score     = {}".format(r2_best))
print("Average Score     = {:.2}".format(r2_mean))

# Show the first top-ranked song and its filtered text.
max_r2_index = r2_score.index(r2_best)
print(songs.iloc[max_r2_index])
print(songs.loc[max_r2_index,'corpus'])

Maximum Score     = 30.000000000000004
Average Score     = 0.44
artist                                                 Madonna
song                                              It's So Cool
link                      /m/madonna/its+so+cool_20853254.html
text         It's so hard to be someone,  \r\nIt's so hard ...
corpus       hard someone hard unkwown heart blame heart pl...
tokenized    [hard, someone, hard, unkwown, heart, blame, h...
stemmed      [hard, someon, hard, unkwown, heart, blame, he...
Name: 12310, dtype: object
hard someone hard unkwown heart blame heart play game realize pay price know cost vice save soul test save soul best need love love love love love love need love love love love love love need love love love love love love need love love love love love love save soul little sister save soul dear little brother need hope says preacher learn cope says teachers really know god intention ever ask need love love love need need love love love love love love need love lov

#### R3 : Rareness

In [44]:
# - Calculation and return of the ranking score no. 3
# - The rarer a term is in the whole corpus, the more it contributes to ranking score no. 3
# - For each query term found in a document a weight is calculated and the weights are
#   added up. The rarer a single term, the higher its weight.
# - The score is then normalized, so that the document with the highest total value receives a 10


def calculateScore3(index, query):
    """Rank documents by the rareness of the query terms they contain.

    Each query term gets a weight that shrinks with the number of times the
    term occurs in the whole corpus. A document's raw score is the sum of the
    weights of the query terms it contains (each query term counted at most
    once per document). The raw scores are scaled so that the best document
    gets 10.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        list: One score per document, in the order of ``index``. All zeros
        when the corpus contains no terms or no document matches; empty when
        ``index`` is empty.
    """
    qterms = query

    # Corpus-wide number of occurrences of every query term.
    freq = [sum(1 for doc in index for dt in doc if t == dt) for t in qterms]

    # Total number of terms in the corpus. (The previous loop added len(x) once
    # per term of x and therefore summed the squared document lengths.)
    tot = sum(len(doc) for doc in index)

    # No terms at all: nothing can match, and 1/tot below would divide by zero.
    if tot == 0:
        return [0] * len(index)

    # Weight of each query term: the more frequent, the smaller.
    # NOTE(review): by operator precedence this is 1 / (f + (1/tot) + 1), i.e.
    # roughly 1/(f+1); the expression is kept as written because the intended
    # formula cannot be determined from this notebook - confirm.
    rel = [1 / (f + 1 / tot + 1) for f in freq]

    # Score of a document = sum of the weights of the query terms it contains.
    scores = []
    for doc in index:
        temp = 0
        for e, t in enumerate(qterms):
            if t in doc:
                temp += rel[e]
        scores.append(temp)

    #Normalise
    SCORE_MAX = 10
    factor = max(scores) / SCORE_MAX
    if factor > 0:
        scores = [score / factor for score in scores]

    return scores

r3_score = calculateScore3(index, query)

# Summary of the R3 ranking for the current query.
r3_best = max(r3_score)
r3_mean = sum(r3_score)/len(r3_score)
print("Maximum Score     = {}".format(r3_best))
print("Average Score     = {:.2}".format(r3_mean))

# Show the first top-ranked song and its filtered text.
max_r3_index = r3_score.index(r3_best)
print(songs.iloc[max_r3_index])
print(songs.loc[max_r3_index,'corpus'])

Maximum Score     = 10.0
Average Score     = 0.0025
artist                                              Beach Boys
song                                            Chapel Of Love
link                /b/beach+boys/chapel+of+love_20013595.html
text         Going to the chapel  \r\nAnd we're gonna get m...
corpus       going chapel gonna get married going chapel go...
tokenized    [going, chapel, gon, na, get, married, going, ...
stemmed      [go, chapel, gon, na, get, marri, go, chapel, ...
Name: 24624, dtype: object
going chapel gonna get married going chapel gonna get married gee really love gonna get married going chapel love bow bow bow bow bow bow bow bow bow bow spring ooo ooo ooo sky blue sky blue birds sing birds sing like yeah yeah yeah yeah yeah today day wah hooo oooo say ooo oooo never lonely anymore bow bow bow bow bow bow bow bow bow bow going chapel gonna get married going chapel gonna get married gee really love gonna get married going chapel love bow bow bow bow bow bow b

In [45]:
query

['i', 'love', 'lamp']

#### R4 : Distance

In [43]:
def calculateScore4(index, query):
    """Rank documents by how close together the query terms occur in them.

    For every pair of query terms (in query order) that both occur in a
    document, the smallest distance between any occurrence of the first and
    any occurrence of the second is determined and ``1 / distance`` is added
    to the document's raw score. Pairs whose first occurrences coincide - the
    same term repeated in the query - contribute nothing. The raw scores are
    scaled so that the best document gets 10.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        list: One score per document, in the order of ``index``. The scores
        are left as unscaled zeros when no document contains two different
        query terms; the list is empty when ``index`` is empty.
    """
    scores = []
    for ind in index:
        # 1-based positions of every query term in this document; an empty
        # list means the term does not occur.
        positions = [
            [pos for pos, term in enumerate(ind, start=1) if term == wanted]
            for wanted in query
        ]

        if len(positions) == 0:
            # no positions, no distances
            scores.append(0)
            continue

        score = 0.0
        for i, first in enumerate(positions):
            if not first:
                continue
            for second in positions[i + 1:]:
                if not second:
                    continue
                # All pairwise distances between the two terms' occurrences.
                gaps = [abs(p - q) for p in first for q in second]
                # A zero first distance means both query terms are the same
                # token; such pairs are ignored (and 1/0 is avoided).
                if gaps[0] == 0:
                    continue
                score += 1 / min(gaps)

        scores.append(score)

    # Nothing to rank - max() below would raise on an empty list.
    if not scores:
        return scores

    #Normalise
    SCORE_MAX = 10
    factor = max(scores) / SCORE_MAX
    if factor > 0:
        scores = [score / factor for score in scores]

    return scores

r4_score = calculateScore4(index, query)

# Summary of the R4 ranking for the current query.
r4_best = max(r4_score)
r4_mean = sum(r4_score)/len(r4_score)
print("Maximum Score     = {}".format(r4_best))
print("Average Score     = {:.2}".format(r4_mean))

# Show the first top-ranked song and its filtered text.
max_r4_index = r4_score.index(r4_best)
print(songs.iloc[max_r4_index])
print(songs.loc[max_r4_index,'corpus'])

Maximum Score     = 10.0
Average Score     = 0.0016
artist                                          Paul McCartney
song                                    Don't Be Careless Love
link         /p/paul+mccartney/dont+be+careless+love_201056...
text         Burn the midnight lamp  \r\nDown until the daw...
corpus       burn midnight lamp dawn keep watch sure coming...
tokenized    [burn, midnight, lamp, dawn, keep, watch, sure...
stemmed      [burn, midnight, lamp, dawn, keep, watch, sure...
Name: 47587, dtype: object
burn midnight lamp dawn keep watch sure coming home shadows play flicker bedroom wall turn bad dream overnight something could terribly wrong careless love careless love careless dream running nowhere every step ve taken turns glue walking spiral staircase failing failing careless love careless love lamp burns getting pretty tired feel bad something might going amiss won look getting deep whatever back sleep careless love careless love careless saw face morning paper saw body

#### R5 : Chrono

In [38]:
#   - relative instead of absolute position
#   - all query terms that are found are taken into account and their results are averaged

def calculateScore5(index, query):
    """Rank documents by how early the query terms first appear in them.

    For each query term found in a document, the relative position of its
    first occurrence (0 = first term of the document) is turned into a value
    between 10 (very first term) and just above 0 (last term). A document's
    raw score is the average of these values over the query terms found.
    The raw scores are scaled so that the best document gets 5.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        list: One score per document, in the order of ``index``. The scores
        are left as unscaled zeros when no document matches; the list is
        empty when ``index`` is empty.
    """
    scores = []
    qterms = query

    for ind in index:
        nterms = 0
        score = 0
        for t in qterms:
            for pos, dt in enumerate(ind):
                if t == dt:
                    # Only the first occurrence counts; the relative position
                    # is needed only once a match has been found.
                    rel_pos = pos / len(ind)
                    score += 10 - (rel_pos * 10)
                    nterms += 1
                    break
        # Average over the query terms that were found. (The former
        # np.mean(score) was applied to a scalar and changed nothing.)
        scores.append(score / nterms if nterms != 0 else 0)

    # Nothing to rank - max() below would raise on an empty list.
    if not scores:
        return scores

    #Normalise
    SCORE_MAX = 5
    factor = max(scores) / SCORE_MAX
    if factor > 0:
        scores = [score / factor for score in scores]
    return scores

r5_score = calculateScore5(index, query)

# Summary of the R5 ranking for the current query.
r5_best = max(r5_score)
r5_mean = sum(r5_score)/len(r5_score)
print("Maximum Score     = {}".format(r5_best))
print("Average Score     = {:.2}".format(r5_mean))

# Show the first top-ranked song and its filtered text.
max_r5_index = r5_score.index(r5_best)
print(songs.iloc[max_r5_index])
print(songs.loc[max_r5_index,'corpus'])

Maximum Score     = 10.0
Average Score     = 0.14
artist                                             Cat Stevens
song                                             School Is Out
link                /c/cat+stevens/school+is+out_20028140.html
text         School is out now we're gonna have some fun.  ...
corpus       school gonna fun gonna make like ones gonna bu...
tokenized    [school, gon, na, fun, gon, na, make, like, on...
stemmed      [school, gon, na, fun, gon, na, make, like, on...
Name: 2280, dtype: object
school gonna fun gonna make like ones gonna buy piece sky sell passin clouds gonna mayor small town tax crowds gonna local surveyor study foreman girl gonna line finest girls world comin school today happy yes alive gonna best day best day lives gonna history maker call richard third gonna buy amplifier shout heard comin school today happy yes alive gonna best day best day lives comin school today baby comin school today hey comin school today school school 


### RATING MECHANISM

In [22]:
# Calculate all 5 rating scores. The final score of a document is the sum of its 5 scores.
def calculateScores(index, query):
    """Combine the five ranking scores into one final score per document.

    Runs ``calculateScore1`` ... ``calculateScore5`` (in that order) on the
    same ``index`` and ``query`` and adds up their results document by
    document.

    Args:
        index (list): One list of terms per document.
        query (list): The query terms.

    Returns:
        dict: Maps each document's position in ``index`` to its summed score.
    """
    rankers = (calculateScore1, calculateScore2, calculateScore3, calculateScore4, calculateScore5)
    partial = [ranker(index, query) for ranker in rankers]

    final_scores = {}
    for p, (s1, s2, s3, s4, s5) in enumerate(zip(*partial)):
        final_scores[p] = s1 + s2 + s3 + s4 + s5
    return final_scores

In [23]:
# Create the output
def evaluateScores(scores, corpus, topn):
    """Format the ``topn`` best-scoring documents as a printable report.

    Documents are sorted by descending final score; of the first ``topn``
    entries only those with a score above zero are reported. Each entry shows
    the document label, its text, the final score and the five partial scores.

    Note: the partial scores are read from the module-level lists
    ``r1_score`` ... ``r5_score``, which must have been computed for the same
    query before this function is called.

    Args:
        scores (dict): Maps a document's position to its final score.
        corpus (list): ``(label, text)`` pairs, indexed by document position.
        topn (int): Maximum number of documents to report.

    Returns:
        str: The concatenated report; empty if no reported score is positive.
    """
    ranked = sorted(scores.items(), key=lambda entry: entry[1], reverse=True)

    parts = []
    for doc_id, total in ranked[:topn]:
        if not total > 0.0:
            continue
        label, lyrics = corpus[doc_id][0], corpus[doc_id][1]
        parts.append(
            '\n**********' + label + '**********\n\n'
            + lyrics.strip()
            + ' --> Score: ' + str(total) + '\n'
            + 'R1: ' + str(r1_score[doc_id])
            + ' , R2: ' + str(r2_score[doc_id])
            + ' , R3: ' + str(r3_score[doc_id])
            + ' , R4: ' + str(r4_score[doc_id])
            + ' , R5: ' + str(r5_score[doc_id])
        )
    return ''.join(parts)

### RESULTS

In [48]:
#Inputs
# Read the raw search string from the user.
q = input()
# Clean, lower-case and stem the search string into a list of query terms.
query = stemthequery(q)
# Space-joined form of the stemmed terms, used in the "no results" message.
query_str = ' '.join(query)
# Number of top-ranked songs to show.
number_of_results = 3

In [49]:
#Search Result
# The partial scores are kept in module-level names because evaluateScores()
# reads r1_score ... r5_score when it formats the report.
r1_score = calculateScore1(index, query)
r2_score = calculateScore2(index, query)
r3_score = calculateScore3(index, query)
r4_score = calculateScore4(index, query)
r5_score = calculateScore5(index, query)

# Compute the combined scores once and reuse them for counting and reporting;
# previously calculateScores() was evaluated twice, re-running all five rankers each time.
final_scores = calculateScores(index, query)

results_found = sum(1 for score in final_scores.values() if score > 0)
if results_found>0:
    print('Found results: {}\n{} Top Songs when looking for "{}" are:\n {} '.format(results_found, number_of_results, q, evaluateScores(final_scores, corpus, number_of_results)))
else:
    print('No results found for {}'.format(query_str))


Found results: 13204
3 Top Songs when looking for "forever hand summer fish" are:
 
**********38435 Johnny Cash, As Long As The Grass Shall Grow**********

As long as the moon shall rise as long as the rivers flow  
As long as the sun will shine as long as the grass shall grow  
The Senecas are an Indian tribe of the Iroquios nation  
Down on the New York Pennsylvania Line you'll find their reservation  
After the US revolution corn planter was a chief  
He told the tribe these men they could trust that was his true belief  
He went down to Independence Hall and there was a treaty signed  
That promised peace with the USA and Indian rights combined  
George Washington gave his signature the Government gave its hand  
They said that now and forever more that this was Indian land  
As long as the moon shall rise...  
On the Seneca reservation there is much sadness now  
Washington's treaty has been broken and there is no hope no how  
Across the Allegheny River they're throwing up a dam 