In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv


### First, import all the required libraries

In [2]:
import re
import pandas as pd 
import numpy as np 
from pprint import pprint 

from nltk.corpus import stopwords
stop_words=stopwords.words("english")

import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim
%matplotlib inline
import matplotlib.pyplot as plt 

import logging
# Only ERROR and above are shown. Every placeholder needs its leading '%':
# the message field was previously written as '(message)s', which made each
# record print that literal text instead of the actual log message.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  """


### Load and clean the data

In [3]:
# Read the Spotify lyrics CSV from the Kaggle input directory and preview it.
df = pd.read_csv(
    "/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv"
)
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
# User-defined function to clean the raw lyric text

def clean_text(text):
    """Normalise one raw lyric string for tokenisation.

    Steps, in order:
      1. drop the standalone retweet marker ``RT`` (whole word only, so
         words such as "HEART" are left intact);
      2. drop hashtags (``#`` followed by ASCII letters/digits);
      3. replace each run of line breaks (``\\r`` / ``\\n``) with one space so
         the last word of a line cannot fuse with the first word of the next;
      4. drop every character that is neither a word character nor whitespace;
      5. strip leading/trailing whitespace and lower-case the result.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    text = re.sub(r'\bRT\b', '', text)              # remove standalone 'RT' marker
    text = re.sub(r'#[A-Za-z0-9]+', '', text)       # remove hashtags
    text = re.sub(r'[\r\n]+', ' ', text)            # line breaks become a single space
    text = re.sub(r'[^\w\s]', '', text)             # remove punctuation
    return text.strip().lower()                     # trim whitespace, lower-case

In [5]:
# Run the cleaning function over every raw lyric and keep the result
# in a new 'lyrics' column next to the original 'text' column.

df["lyrics"] = df["text"].map(clean_text)
df.head()

Unnamed: 0,artist,song,link,text,lyrics
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA...",look at her face its a wonderful face and it ...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen...",take it easy with me please touch me gently l...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...,ill never know why i had to go why i had to p...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,making somebody happy is a question of give an...


In [6]:
# Collect all cleaned lyrics into a plain Python list and show the first one.

lyrics = list(df['lyrics'])
lyrics[0]

'look at her face its a wonderful face  and it means something special to me  look at the way that she smiles when she sees me  how lucky can one fellow be    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do    and when we go for a walk in the park  and she holds me and squeezes my hand  well go on walking for hours and talking  about all the things that we plan    shes just my kind of girl she makes me feel fine  who could ever believe that she could be mine  shes just my kind of girl without her im blue  and if she ever leaves me what could i do what could i do'

### Tokenize words

In [7]:
# User-defined generator that tokenises each sentence

def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``; the results are yielded in input order.
    """
    yield from (gensim.utils.simple_preprocess(str(raw)) for raw in sentences)

In [8]:
# Tokenise every lyric; the generator is materialised so it can be indexed.

lyrics_words = [tokens for tokens in sent_to_words(lyrics)]
lyrics_words[0]

['look',
 'at',
 'her',
 'face',
 'its',
 'wonderful',
 'face',
 'and',
 'it',
 'means',
 'something',
 'special',
 'to',
 'me',
 'look',
 'at',
 'the',
 'way',
 'that',
 'she',
 'smiles',
 'when',
 'she',
 'sees',
 'me',
 'how',
 'lucky',
 'can',
 'one',
 'fellow',
 'be',
 'shes',
 'just',
 'my',
 'kind',
 'of',
 'girl',
 'she',
 'makes',
 'me',
 'feel',
 'fine',
 'who',
 'could',
 'ever',
 'believe',
 'that',
 'she',
 'could',
 'be',
 'mine',
 'shes',
 'just',
 'my',
 'kind',
 'of',
 'girl',
 'without',
 'her',
 'im',
 'blue',
 'and',
 'if',
 'she',
 'ever',
 'leaves',
 'me',
 'what',
 'could',
 'do',
 'what',
 'could',
 'do',
 'and',
 'when',
 'we',
 'go',
 'for',
 'walk',
 'in',
 'the',
 'park',
 'and',
 'she',
 'holds',
 'me',
 'and',
 'squeezes',
 'my',
 'hand',
 'well',
 'go',
 'on',
 'walking',
 'for',
 'hours',
 'and',
 'talking',
 'about',
 'all',
 'the',
 'things',
 'that',
 'we',
 'plan',
 'shes',
 'just',
 'my',
 'kind',
 'of',
 'girl',
 'she',
 'makes',
 'me',
 'feel',
 '

### Making bigrams and trigrams, removing stopwords & lemmatizing the lyrics

In [9]:
# Build the phrase models, then wrap each one in a Phraser.
# A higher threshold yields fewer detected phrases.
bigram = gensim.models.Phrases(lyrics_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram model is trained on the token lists after the bigram model has been applied.
trigram = gensim.models.Phrases(bigram[lyrics_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Function to remove stopwords
def remove_stopwords(text):
    """Return the documents in ``text`` with stop words filtered out.

    Each document is converted with ``str()`` and re-tokenised by
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    collection are dropped.

    Parameters
    ----------
    text : iterable
        Documents to filter (each is passed through ``str()``).

    Returns
    -------
    list[list[str]]
        One list of remaining tokens per input document, in input order.
    """
    # Build the lookup set once per call: `stop_words` is a list, so testing
    # membership against it directly is linear in its length for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in text
    ]

# Function to apply the bigram model to each document
def make_bigrams(text):
    """Apply the module-level ``bigram_mod`` to every document in ``text``.

    Parameters
    ----------
    text : iterable
        Tokenised documents.

    Returns
    -------
    list
        ``bigram_mod[doc]`` for each document, in input order.
    """
    # Phraser models are applied by indexing; calling one raises TypeError.
    return [bigram_mod[doc] for doc in text]

# Function to apply the bigram and then the trigram model to each document
def make_trigrams(text):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``text``.

    Parameters
    ----------
    text : iterable
        Tokenised documents.

    Returns
    -------
    list
        ``trigram_mod[bigram_mod[doc]]`` for each document, in input order.
    """
    # Both Phraser models are applied by indexing; calling one raises TypeError.
    return [trigram_mod[bigram_mod[doc]] for doc in text]


def lemmatization(text,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """Lemmatise each document, keeping only tokens with an allowed POS tag.

    Every document (a sequence of string tokens) is joined with single spaces
    and passed to the module-level ``nlp`` callable, which is not defined in
    this part of the notebook (presumably a loaded spaCy pipeline; confirm).
    For each token it returns, ``lemma_`` is kept when ``pos_`` is in
    ``allowed_postags``.

    Returns a list with one list of lemmas per input document, in input order.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in text]