In [16]:

import pandas as pd
import spacy


In [17]:
#%% load data
# Read clean_file.csv from the working directory, then wrap the result in the
# DataFrame that the following cells refer to as `df`.
song_data = pd.read_csv(filepath_or_buffer="clean_file.csv")
df = pd.DataFrame(data=song_data)


In [18]:
# %% test
# Display the 'processed_text' value of the first row (by position)
df.loc[:, 'processed_text'].iloc[0]

'i could feel time there way know fallen leav night who say theyr blow a free wind hope learn whi sea tide ha way turn more thi you know there noth more thi tell one thing more thi you know there noth it fun while there way know like dream night who say were go no care world mayb im learn whi sea tide ha way turn more thi you know there noth more thi tell one thing more thi you know there noth more thi you know there noth more thi tell one thing more thi there noth'

In [19]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel 

In [20]:
#%%
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (which strips
    accent marks from the tokens).

    Yields:
        One token list per input item, in input order.
    """
    for raw in sentences:
        text = str(raw)
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens

# Tokenise only the first 10000 rows of 'processed_text'
data_words = [tokens for tokens in sent_to_words(df['processed_text'].iloc[:10000])]

#data_words

In [21]:
# Fit the phrase detectors. The second model is trained on documents that have
# already been passed through the first one. A higher threshold yields fewer
# phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers around the fitted models, used to transform documents
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Show the first document after both phrase passes
print(
    trigram_mod[bigram_mod[data_words[0]]]
)

['could', 'feel', 'time', 'there', 'way', 'know', 'fallen', 'leav', 'night', 'who', 'say', 'theyr', 'blow', 'free', 'wind', 'hope', 'learn', 'whi', 'sea', 'tide', 'ha', 'way', 'turn', 'more', 'thi', 'you', 'know', 'there', 'noth', 'more', 'thi', 'tell', 'one', 'thing', 'more', 'thi', 'you', 'know', 'there', 'noth', 'it', 'fun', 'while', 'there', 'way', 'know', 'like', 'dream', 'night', 'who', 'say', 'were', 'go', 'no', 'care', 'world', 'mayb', 'im', 'learn', 'whi', 'sea', 'tide', 'ha', 'way', 'turn', 'more', 'thi', 'you', 'know', 'there', 'noth', 'more', 'thi', 'tell', 'one', 'thing', 'more', 'thi', 'you', 'know', 'there', 'noth', 'more', 'thi', 'you', 'know', 'there', 'noth', 'more', 'thi', 'tell', 'one', 'thing', 'more', 'thi', 'there', 'noth']


In [22]:
# NLTK English stop words plus a handful of extra tokens
# NOTE(review): the extras ('from', 'subject', 're', 'edu', 'use') look like
# they were carried over from an e-mail style corpus — confirm they are wanted.
from nltk.corpus import stopwords
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

# spaCy English pipeline used for lemmatisation
nlp = spacy.load("en_core_web_sm")

In [23]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenise each document and drop every token listed in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and run through
    ``simple_preprocess`` before filtering, so an element may be a raw string
    or an already-tokenised list (whose string form is simply re-tokenised).

    Args:
        texts: Iterable of documents.

    Returns:
        A list holding one list of surviving tokens per input document, in
        input order.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the module-level list rescans it for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document in ``texts`` through the fitted ``bigram_mod``.

    Returns:
        A list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Returns:
        A list with one transformed document per input document.
    """
    docs_out = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        docs_out.append(trigram_mod[with_bigrams])
    return docs_out

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    The tokens of each document are joined with single spaces and processed
    by the module-level spaCy pipeline ``nlp``. For every resulting token
    whose ``pos_`` is in ``allowed_postags`` the ``lemma_`` is kept.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Container of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a batched stream instead of invoking the
    # pipeline once per document; documents come back in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]


In [24]:
 
# Load the spaCy pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Drop stop words from every tokenised document
data_words_nostops = remove_stopwords(data_words)

# Merge detected bigram phrases into single tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Show the first processed document
print(data_lemmatized[:1])

[['feel', 'time', 'way', 'know', 'fall', 'leav', 'night', 'say', 'theyr', 'blow', 'free', 'wind', 'hope', 'learn', 'whi', 'tide', 'turn', 'thi', 'know', 'tell', 'thing', 'thi', 'know', 'way', 'know', 'dream', 'night', 'go', 'care', 'world', 'm', 'learn', 'whi', 'tide', 'turn', 'thi', 'know', 'tell', 'thing', 'thi', 'know', 'know', 'tell', 'thing', 'thi']]


In [25]:
# The lemmatised documents are the texts used from here on (alias, no copy)
texts = data_lemmatized

# Build the gensim Dictionary over the tokens of those texts
id2word = corpora.Dictionary(texts)



In [26]:
# Convert each document to bag-of-words form: a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, texts))

# Show the bag of words of the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 7), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 3), (15, 1), (16, 5), (17, 3), (18, 2), (19, 1), (20, 2), (21, 2), (22, 2), (23, 1), (24, 1)]]


In [27]:
# %% look up the word that the dictionary assigned to id 0
id2word[0]

'blow'

In [28]:
# %% show words and their frequencies for the first document
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('blow', 1),
  ('care', 1),
  ('dream', 1),
  ('fall', 1),
  ('feel', 1),
  ('free', 1),
  ('go', 1),
  ('hope', 1),
  ('know', 7),
  ('learn', 2),
  ('leav', 1),
  ('m', 1),
  ('night', 2),
  ('say', 1),
  ('tell', 3),
  ('theyr', 1),
  ('thi', 5),
  ('thing', 3),
  ('tide', 2),
  ('time', 1),
  ('turn', 2),
  ('way', 2),
  ('whi', 2),
  ('wind', 1),
  ('world', 1)]]

In [29]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,         # number of topics to extract
    random_state=100,      # fixed seed so the run is repeatable
    update_every=1,
    chunksize=100,         # documents per training chunk
    passes=10,             # passes over the whole corpus
    alpha='auto',          # learn the document-topic prior from the corpus
    per_word_topics=True,
)

In [30]:
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

# Print the top keywords of the 20 topics
print(lda_model.print_topics())

[(0, '0.489*"get" + 0.168*"gon" + 0.071*"ai" + 0.036*"work" + 0.022*"well" + 0.021*"move" + 0.018*"everybodi" + 0.016*"fool" + 0.012*"floor" + 0.012*"comin"'), (1, '0.381*"time" + 0.148*"long" + 0.129*"wait" + 0.127*"run" + 0.085*"last" + 0.040*"goodby" + 0.016*"lover" + 0.010*"past" + 0.010*"piti" + 0.009*"shot"'), (2, '0.144*"soul" + 0.089*"noth" + 0.067*"nt" + 0.059*"blood" + 0.058*"follow" + 0.054*"beat" + 0.036*"babe" + 0.034*"bodi" + 0.033*"stone" + 0.033*"whatev"'), (3, '0.226*"blue" + 0.150*"burn" + 0.057*"flame" + 0.050*"wide" + 0.048*"summer" + 0.038*"window" + 0.033*"lock" + 0.032*"dig" + 0.031*"bind" + 0.031*"imagin"'), (4, '0.362*"come" + 0.099*"home" + 0.072*"call" + 0.064*"talk" + 0.048*"sing" + 0.039*"song" + 0.038*"bring" + 0.031*"town" + 0.021*"write" + 0.019*"bit"'), (5, '0.106*"night" + 0.078*"girl" + 0.054*"light" + 0.046*"rock" + 0.035*"woman" + 0.031*"kiss" + 0.030*"sweet" + 0.030*"roll" + 0.029*"side" + 0.025*"name"'), (6, '0.169*"watch" + 0.154*"rain" + 0.125*"