In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim.models import LsiModel
from gensim import corpora
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.decomposition import TruncatedSVD
#from sklearn.pipeline import Pipeline
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
#from dataclean import importing_Lyrics_df, pre_processing
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dharm\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
def importing_Lyrics_df():
    df = pd.read_csv('C:/Users/dharm/OneDrive - IMC/FH Krems/4th sem/datascience capstone/NLP_ML_PROJECT/data/raw/azlyrics_lyrics_19.csv',on_bad_lines='skip')
    return df

In [4]:
def pre_processing(text):
    stop_list = []
    stop_list.extend(stopwords.words(['hungarian','swedish','kazakh','norwegian','finnish','arabic','indonesian','portuguese','turkish','azerbaijani', 'slovene', 'spanish', 'danish', 'nepali', 'romanian', 'greek', 'dutch', 'README', 'tajik', 'german', 'english', 'russian', 'french', 'italian']))
    stop_list.extend(
        ['yo','dont','nigga','uh', 'got', 'oh', 'im', 'na', 'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be',
         'know', 'good', 'go', 'get', 'ah', 'bout','yeah','le','ayy','u','bitch','eh','wa',
         'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot',
         'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()

    text = text.rstrip()
    normalized = " ".join(lemma.lemmatize(i) for i in text.split())
    punc_free = ''.join(i for i in normalized if i not in exclude)
    stop_free = " ".join([i for i in punc_free.lower().split() if((i not in stop_list) and (not i.isdigit()))])
    return stop_free

In [5]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

In [6]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [7]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [8]:
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()


In [9]:
df = importing_Lyrics_df()

In [10]:
text_clean=[]
for text in df['LYRICS']:
    text_clean.append(pre_processing(text).split())

In [11]:
dictionary,corpus = prepare_corpus(text_clean)

In [12]:
number_of_topics=7
words=10
lsamodel=create_gensim_lsa_model(text_clean,number_of_topics,words)

[(0, '0.491*"love" + 0.452*"like" + 0.235*"cant" + 0.173*"touch" + 0.165*"stop" + 0.148*"falling" + 0.134*"youre" + 0.133*"baby" + 0.120*"aint" + 0.108*"wanna"'), (1, '-0.579*"already" + -0.579*"gone" + 0.345*"love" + -0.196*"neo" + 0.172*"touch" + -0.157*"away" + 0.133*"falling" + -0.091*"ijen" + -0.091*"youre" + 0.079*"cuz"'), (2, '-0.520*"love" + 0.463*"like" + -0.273*"touch" + -0.250*"already" + -0.248*"gone" + -0.224*"falling" + 0.177*"cant" + 0.129*"stop" + -0.114*"cuz" + 0.112*"aint"'), (3, '0.460*"gonna" + 0.344*"wanna" + -0.300*"like" + -0.217*"stop" + -0.215*"cant" + -0.195*"touch" + 0.195*"baby" + -0.182*"falling" + 0.145*"youre" + 0.135*"hot"'), (4, '0.371*"cant" + 0.344*"stop" + -0.275*"touch" + -0.258*"issa" + -0.244*"falling" + 0.218*"hey" + 0.173*"hot" + -0.160*"aint" + 0.157*"cuz" + -0.151*"ooh"'), (5, '-0.858*"issa" + 0.193*"touch" + 0.186*"falling" + -0.162*"cuz" + -0.135*"hey" + -0.133*"love" + -0.129*"hot" + 0.094*"youre" + 0.080*"gonna" + -0.080*"real"'), (6, '0.4

In [13]:
topics = lsamodel.print_topics(num_words= 10)

In [14]:

def get_topic_details(lsamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(lsamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lsamodel.show_topic(topic_num)
                topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]),
                ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score']
    return topic_details_df

In [15]:
contents = pd.DataFrame({'Original Text': text_clean})

In [16]:
topic_details = pd.concat([get_topic_details(lsamodel, corpus), contents], axis=1)

  topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]),
  topic_details_df = topic_details_df.append(pd.Series([topic_num, prop_topic]),


In [17]:
dom_tp = topic_details['Dominant_Topic']
topic_details['flag'] = np.where((dom_tp == 2.0) | (dom_tp == 3.0) | (dom_tp == 4.0), 1, 0)
print(topic_details.head())

   Dominant_Topic    % Score  \
0             0.0  24.770249   
1             0.0   5.763429   
2             0.0   9.829807   
3             0.0   7.947094   
4             0.0   5.992174   

                                       Original Text  flag  
0  [grape, street, gslide, sweet, shay, shay, lik...     0  
1  [gonna, put, suit, tie, steady, speakin, work,...     0  
2  [kill, kill, radio, killer, music, dealer, son...     0  
3  [lil, steady, speaking, low, project, wall, ta...     0  
4  [x, loaded, bro, never, church, prayer, lost, ...     0  
