In [None]:
import pandas as pd
import spacy
from gensim.corpora import Dictionary
import numpy as np
from gensim.models import LdaMulticore
from ast import literal_eval
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Load the pre-processed corpus. The "Words" column is parsed with
# ast.literal_eval so each cell becomes a Python object again
# (presumably a list of tokens per document -- it is fed to Phrases below).
df = pd.read_csv(
    "Bi-Tri.csv",
    converters={"Words": literal_eval},
)

In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# One token sequence per document.
token_docs = df.Words

# Learn bigram collocations first, then learn trigrams on top of the
# bigram-merged documents.
bigram_phrases = Phrases(token_docs, min_count=10)
trigram_phrases = Phrases(bigram_phrases[token_docs], min_count=5)

# Wrap each trained Phrases object in a Phraser, which is the lighter
# object used for applying the learned phrases.
bigram_model = Phraser(bigram_phrases)
trigram_model = Phraser(trigram_phrases)

# Apply bigram merging, then trigram merging, to every document.
trigrams = [
    trigram_model[bigram_model[doc_tokens]]
    for doc_tokens in token_docs
]



In [None]:
# Preview only the first few documents; displaying the whole corpus would
# flood the notebook output and bloat the saved file.
trigrams[:5]

In [None]:
# Load the small English pipeline without the parser and NER components;
# only token.pos_ and token.lemma_ are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Keep content words only.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

# Re-join each token list into a single string and stream the texts through
# nlp.pipe, which processes documents in batches instead of making one
# nlp() call per document.
lemmatized_words = [
    [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    for doc in nlp.pipe(" ".join(sent) for sent in trigrams)
]

In [None]:
# Sanity check: lemmas kept for the first document.
first_doc_lemmas = lemmatized_words[0]
print(first_doc_lemmas)

In [None]:
# Build the token <-> id mapping from the lemmatised documents.
id2word = Dictionary(lemmatized_words)

# Prune the vocabulary: drop tokens that appear in fewer than 10 documents
# or in more than 40% of all documents.
id2word.filter_extremes(no_below=10, no_above=0.4)

# Reassign ids so they are contiguous after pruning.
id2word.compactify()

# Bag-of-words representation, one entry per document (same order as df).
corpus = [id2word.doc2bow(doc_lemmas) for doc_lemmas in lemmatized_words]

In [None]:
# Number of latent topics; later cells size the per-document topic vectors
# with this value.
num_topics = 4

# Hyper-parameters collected in one place so they are easy to scan and tune.
lda_params = dict(
    num_topics=num_topics,
    random_state=1,       # fixed seed for reproducible topics
    chunksize=30,
    passes=20,
    alpha=0.31,
    eta=0.91,
    eval_every=1,
    per_word_topics=True,
    workers=1,
)

lda_model = LdaMulticore(corpus=corpus, id2word=id2word, **lda_params)

In [None]:
# Top 15 weighted terms for every topic.
top_terms_per_topic = lda_model.print_topics(num_words=15)
pprint(top_terms_per_topic)


Visualization of the LDA topics (pyLDAvis)

In [None]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the interactive view of the trained model and display it as the
# cell's result.
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
lda_vis

In [None]:
from gensim.models import CoherenceModel

# Score the model with the c_v coherence measure, computed from the
# lemmatised texts and the pruned dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=lemmatized_words,
    dictionary=id2word,
    coherence='c_v',
)
c_v_score = coherence_model_lda.get_coherence()
c_v_score

In [None]:
# Build one dense topic-probability vector per document, in corpus order
# (which matches the row order of df).
topic_vecs = []
for bow in corpus:
    # get_document_topics returns (topic_id, probability) pairs. Index them
    # by topic id rather than by list position, so a topic omitted from the
    # result cannot shift the remaining probabilities into the wrong
    # column or raise an IndexError.
    doc_topics = dict(lda_model.get_document_topics(bow, minimum_probability=0.0))
    topic_vecs.append([doc_topics.get(topic_id, 0.0) for topic_id in range(num_topics)])

topic_vecs[0]

In [None]:
# Human-readable labels for the topics, in topic-id order. These must be
# revisited whenever the model is retrained.
topic_columns = ['Stories', 'Minority_Issues', 'UK', 'Not_Clean']

# One row per document, aligned to df by index.
LDA_probs = pd.DataFrame(data=topic_vecs, columns=topic_columns, index=df.index)

# Drop topic columns left over from a previous execution of this cell before
# concatenating; otherwise re-running would duplicate the columns and
# df[topics] would return ambiguous, repeated columns downstream.
df = pd.concat([df.drop(columns=topic_columns, errors='ignore'), LDA_probs], axis=1)

df.head()

Make sure to update the contents of the `topics` list below (and `topic_columns` above) whenever you re-run the code, as the topics found by the model may change.

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
%config InlineBackend.figure_formats = ['retina']

topics = ['Stories', 'Minority_Issues', 'UK', 'Not_Clean']
ax = sns.barplot(x=df[topics].mean().index, y=df[topics].mean())
ax.set_xticklabels(topics, rotation=40, ha='right')
ax.set_title('Mean Topic Probabilities Across The Entire Dataset')
ax.set(xlabel='Topics', ylabel='Mean Percentage per Transcript', ylim=(0, .7))



In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardise the topic-probability columns before clustering; X is the
# resulting array with one row per document and one column per topic.
scaler = StandardScaler()
X = scaler.fit_transform(df[topics])
X.shape

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm

# Sweep the number of clusters and record two diagnostics for each value:
#   temp_dict : n_clusters -> [mean silhouette score]
#   inertias  : within-cluster SSE, in sweep order (n_clusters = 2, 3, ...)
temp_dict = {}
inertias = []
for n_clusters in range(2, 15):
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility. This is the same seed the final model
    # uses, so the diagnostics describe the clustering that is actually kept.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    temp_dict[n_clusters] = [silhouette_avg]

    # Sum of squared distances of samples to their closest cluster centre.
    inertia = clusterer.inertia_
    print("\tThe inertia is :", inertia)
    inertias.append(inertia)

In [None]:
sns.set(font_scale=1.2)
sns.set_style('ticks')

# One row per n_clusters value, column 0 holds the mean silhouette score.
s_scores = pd.DataFrame(temp_dict).T
ax = sns.lineplot(x=s_scores.index, y=s_scores[0], color='teal')
# ax.set_yticks([])  # Clear the yaxis labels / ticks
# Take the ticks from the data so every evaluated cluster count is labelled
# (a hard-coded range(2, 14) would omit the last value, 14).
ax.set_xticks(list(s_scores.index))
ax.set_ylabel('Silhouette score')
ax.set_xlabel('Clusters')

In [None]:
# Elbow plot: within-cluster SSE against the number of clusters.
# x and y are passed as keywords because recent seaborn releases no longer
# accept them positionally. The x values are derived from the length of
# `inertias` (the sweep starts at 2 clusters) so both vectors stay aligned.
cluster_counts = list(range(2, 2 + len(inertias)))
ax = sns.lineplot(x=cluster_counts, y=inertias, color='teal')
ax.set_ylabel('SSE (inertia)')
ax.set_xlabel('Clusters')
# ax.figure.tight_layout()
# ax.figure.savefig('./images/LDAelbow.png', dpi=200)

In [None]:
# Final clustering: fit a 4-cluster model with a fixed seed on the scaled
# topic features and store each document's cluster label on df.
clusterer = KMeans(n_clusters=4, random_state=10)
clusterer.fit(X)
df['cluster_LDA'] = clusterer.labels_

In [None]:
# Persist the documents together with their topic probabilities and cluster
# labels; the DataFrame index is not written.
df.to_csv("Clustered.csv", index=False)

In [None]:
# Plot the mean topic profile of every cluster, one small figure per cluster.
# df['cluster_LDA'] already holds the labels from the fitted `clusterer`, so
# the model is not re-fitted here; that keeps the plots consistent with the
# labels that were saved to disk.
for cluster in range(clusterer.n_clusters):
    fig, ax = plt.subplots(1, 1, figsize=(4, 4))

    # Mean topic probabilities over the documents assigned to this cluster.
    cluster_means = df.loc[df.cluster_LDA == cluster, topics].mean()

    # Draw on the axes created above explicitly instead of relying on
    # matplotlib's implicit "current axes".
    sns.barplot(x=cluster_means.index, y=cluster_means, ax=ax)
    ax.set_xticklabels(topics, rotation=40, ha='right')
    ax.set_title(f'cluster: {cluster}')



In [None]:
# Number of documents assigned to each cluster.
df['cluster_LDA'].value_counts()