In [1]:
!pip install nltk==3.4
!pip install yellowbrick -U

In [21]:
import base64
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Plotly imports
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

# Other imports
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import json

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans



import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

References :
1. https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial
2. https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
3. https://github.com/ElizaLo/NLP-Natural-Language-Processing

In [3]:
df = pd.read_json('../input/careerdb/database.json')

In [4]:
df.rename(columns={'pregrado':'undergraduate',
                   'Universidad':'university',
                   'detalles':'details',
                   'Descripcion':'description',
                  'Registro calificado':'Qualified record',
                  'Nivel de formación':'Level of Education',
                  'Tipo de formación':'Type of training',
                  'Título otorgado':'Title awarded',
                  'Modalidad':'Modality',
                  'Duración':'Duration',
                  'Créditos':'Credits',
                  'Ciudad':'Town'},inplace=True)
df.head()

In [5]:
df.info()

In [6]:
df.undergraduate.value_counts()

In [7]:
# Delete .....
df1 = df[(df['undergraduate'].str.find('especializacion')==-1) &
    (df['undergraduate'].str.find('maestria')==-1) & 
    (df['undergraduate'].str.find('doctorado')==-1) & 
    (df['undergraduate'].str.find('tecnologia')==-1)]

df1.info()

In [8]:
df1.undergraduate.value_counts()

In [9]:
def replace_str(x):
    if (str(x).find('administracion') != -1) or (str(x).find('contaduria') != -1) \
    or (str(x).find('economia') != -1) or (str(x).find('negocios') != -1) \
    or (str(x).find('mercadeo') != -1) or (str(x).find('finanzas') != -1) \
    or (str(x).find('publicidad') != -1) or (str(x).find('comercio') != -1):
        return 'BUSINESS'
    elif str(x).find('ingenieria') != -1:
        return 'ENGINEERING'
    elif (str(x).find('derecho') != -1)   or (str(x).find('psico') != -1) \
    or (str(x).find('comunicacion') != -1) or (str(x).find('social') != -1) \
    or (str(x).find('pedagogia') != -1) or (str(x).find('filosofia') != -1) \
    or (str(x).find('educacion infantil') != -1) or (str(x).find('teologia') != -1) \
    or (str(x).find('antropologia') != -1) or (str(x).find('ciencia politica') != -1) \
    or (str(x).find('historia') != -1) or (str(x).find('sociologia') != -1) \
    or (str(x).find('literatura') != -1) or (str(x).find('ciencias politicas') != -1) \
    or (str(x).find('relaciones internacionales') != -1):
        return 'HUMANITIES AND SOCIAL SCIENCE'
    elif (str(x).find('medicina') != -1) or (str(x).find('enfermeria') != -1) \
    or (str(x).find('odontologia') != -1) or (str(x).find('salud en el trabajo') != -1) \
    or (str(x).find('quirurgica') != -1) :
        return 'HEALTH & MEDICINE'
    elif (str(x).find('educacion fisica') != -1) or (str(x).find('fisioterapia') != -1):
        return 'SPORTS AND PHYSICAL TRAIN'
    elif (str(x).find('arquitectura') != -1) or (str(x).find('music') != -1) \
    or (str(x).find('diseño') != -1) or (str(x).find('artes') != -1) \
    or (str(x).find('fotografia') != -1):
        return 'ARTS AND DESIGN'
    elif (str(x).find('matematicas') != -1) or (str(x).find('fisica') != -1) \
    or (str(x).find('estadistica') != -1) or (str(x).find('biologia') != -1) \
    or (str(x).find('ciencias naturales') != -1) or (str(x).find('quimica') != -1) :
        return 'MATH AND PHYSICAL SCIENCES'
    else:
        return 'OTHER'

In [10]:
df1['final_cat'] = df1['undergraduate'].apply(replace_str)
df1['final_cat'].value_counts(normalize = True)

In [11]:
# Droping OTHER Undergraduate
df1 = df1[df1['final_cat'] != 'OTHER']
# Dropping void Description
df1= df1[df1['description']!='']
# replacing big space
df1['description'] = df1['description'].str.strip('').replace('  ', ' ')

In [12]:
df1.head(2)

In [13]:
data = [go.Bar(
            x = df1.final_cat.unique(),
            y = df1.final_cat.value_counts().values,
            marker= dict(colorscale='Jet',
                         color = df1.final_cat.value_counts().values
                        ),
            text='Text entries attributed to Final Category'
    )]

layout = go.Layout(
    title='Target variable distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [14]:
all_words = df1['undergraduate'].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = all_words.index.values[2:50],
            y = all_words.values[2:50],
            marker= dict(colorscale='Jet',
                         color = all_words.values[2:100]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the Undergraduate'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [15]:
all_words = df1['description'].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = all_words.index.values[2:50],
            y = all_words.values[2:50],
            marker= dict(colorscale='Jet',
                         color = all_words.values[2:100]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the Description'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [16]:
all_words = df1['university'].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = all_words.index.values[2:50],
            y = all_words.values[2:50],
            marker= dict(colorscale='Jet',
                         color = all_words.values[2:100]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 (Uncleaned) Word frequencies in the University'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [17]:
stopwords_new = ['universidad','programa',
                 'formacion','desarrollo','profesionales',
                 'colombia', 'estudiante', 'quindio', 'cooperativa', 'santo', 'tomas',
                 'sergio','arboleda','pontificia','bolivariana']

In [18]:
nltk.download('punkt')
nltk.download('stopwords')
spanish_stopwords = stopwords.words('spanish')
spanish_stopwords.extend(stopwords_new)

def tokenize(sentence):
    return [token for token in nltk.word_tokenize(sentence)]
def remove_stopwords(sentence):
    return [token for token in nltk.word_tokenize(sentence) if (token.lower() not in spanish_stopwords) and (token.lower() !=' ') and (token not in string.punctuation)]

In [19]:
df1['token'] = df1['description'].apply(lambda x: tokenize(x))
df1['token_no_stopwords'] = df1['description'].apply(lambda x: remove_stopwords(x))
df1['bigram'] = df1['token_no_stopwords'].apply(lambda x: list(ngrams(x, 2)))

In [20]:
df1.head()

In [33]:
vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)
X = vectorizer.fit_transform(df1['description'])

model = KMeans( init='k-means++', max_iter=400, random_state=2021,)

In [34]:
from yellowbrick.cluster import KElbowVisualizer

# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,10), timings= True,)
visualizer.fit(X)        # Fit data to visualizer
visualizer.show()        # Finalize and render figure
plt.show()

In [35]:
visualizer = KElbowVisualizer(model, k=(2,10), metric='silhouette', timings= True)
visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
plt.show()

In [36]:
n_clusters = 7

def get_clusters_top_words(n_clusters):
    model = KMeans(n_clusters, init='k-means++', max_iter=400, random_state=2021)
    model.fit(X)
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    for i in range(n_clusters):
        print('Cluster %d:' % i),
        for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind])

get_clusters_top_words(6)

In [37]:
get_clusters_top_words(3)

In [38]:
from collections import Counter

bigram_list = [pair for row in df1['token_no_stopwords'] for pair in ngrams(row, 2)]
bigram = Counter(bigram_list).most_common()
bigram = pd.DataFrame.from_records(bigram, columns=['gram', 'count'])
bigram[:20]

In [39]:
words = (df1['token_no_stopwords'].apply(lambda x: ' '.join(x))).str.cat(sep=' ').split()
Counter(words).most_common(50)

In [40]:
import re
data = df1['token_no_stopwords'].str.join(' ').values.tolist()
data = [re.sub('\s+', ' ', sent) for sent in data] # Remove new line characters
data = [re.sub("\'", "", sent) for sent in data] # Remove distracting single quotes

In [50]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

In [42]:
def sentence_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sentence_to_words(data))
print(data_words[:1])

# Creating Bigram and Trigram Models

In [43]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

# Remove Stopwords, Make Bigrams and Lemmatize

The bigrams model is ready. Let’s define the functions to remove the stopwords, make bigrams and lemmatization and call them sequentially.

In [44]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in spanish_stopwords] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [45]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
!python -m spacy download es_core_news_sm

In [47]:
import es_core_news_sm

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = es_core_news_sm.load()

# Do lemmatization
data_lemmatized = lemmatization(data_words_bigrams)

print(data_lemmatized[:1])

# Create the Dictionary and Corpus needed for Topic Modeling

In [48]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [49]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:5]]

# Building the Topic Model

* We have everything required to train the LDA model. In addition to the corpus and dictionary, you need to provide the number of topics as well.

* Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior.

* chunksize is the number of documents to be used in each training chunk. update_every determines how often the model parameters should be updated and passes is the total number of training passes.

In [51]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, # corpus
                                           id2word=id2word, # index to word
                                           num_topics=7,  # 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# View the topics in LDA model

In [55]:
best_model = None
top_score = 0
for x in range(1,16):
    print(f'Number of topics:{x}')
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=x, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    
    # Compute Perplexity
    print(f'Perplexity for {x} topics: {lda_model.log_perplexity(corpus)}')  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f'Coherence Score for {x} topics: {coherence_lda} \n')
    if coherence_lda > top_score:
        best_model = x
        top_score = coherence_lda
print(f'\nBest Results with {best_model} topics with a Coherence of {top_score}')

* Perplexity:  -8.868750111308447

* Coherence Score:  0.4369763457091124

**There we have a coherence score of 0.43.**

# Compute Model Perplexity and Coherence Score

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. In my experience, topic coherence score, in particular, has been more helpful.

In [53]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

# Visualize the topics-keywords using pyLDAvis

In [54]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

# Building LDA Mallet Model

In [None]:
# download the Mallet Model
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip ./mallet-2.0.8.zip

# LDA MALLET & GENSIM WRAPPER is Removed in VERSION 4.X of GENSIM

# How to find the optimal number of topics for LDA?

In [74]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet('/content/mallet-2.0.8/bin/mallet', corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
%%time
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=20, step=1)

In [None]:
# Show graph
limit=20; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.xticks(np.arange(start, limit, step=step))
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
optimal_model = model_list[3]

# Finding the dominant topic in each sentence

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Print the Keyword in the 5 topics
pprint(optimal_model.print_topics())
doc_lda = optimal_model[corpus]

# Find the most representative document for each topic

In [None]:
df_final = df.merge(df_dominant_topic, how='inner', left_index=True, right_index=True)

In [None]:
df_final.pivot_table(index='final_cat', columns='Dominant_Topic', values='Document_No', aggfunc='count')

In [None]:
df_final.pivot_table(index='undergraduate', columns='Dominant_Topic', values='Document_No', aggfunc='count').head(10)