## 5 Actions required

#### 1 Mallet_path 
Download the file http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip <br>
Unzip it <br>
Copy the path to the `bin/mallet` file inside the unzipped folder and paste it into the cell below

In [1]:
# Path to the `bin/mallet` file inside the unzipped MALLET folder;
# passed to LdaMallet in ideal_number_topics().
mallet_path = '/Documents/mallet-2.0.8/bin/mallet'

#### 2 Data import
Import the CSV file that contains the text to analyse

In [3]:
import pandas as pd
# Load the raw dataset; replace the path with the location of your CSV file.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv('/Documents/Data/dataset.csv', low_memory = False)

Rename the column containing the text to analyse to 'content' (replace "column" in the example below with that column's name)

In [4]:
# Replace "column" with the name of the column that holds the text;
# the rest of the notebook reads it as 'content'.
df = df.rename(columns={"column": "content"})
# Keep only the rows whose 'content' is not null.
df = df[df['content'].notnull()]

#### 3 Define language of the content to analyse and download the requirements
Follow this link, https://spacy.io/usage/models go to quickstart,<br>
select the language and loading style (suggested 'import as module'). <br>

Run the download command in a terminal, then paste the import and `nlp` lines into the cell below and execute it. The example below is for German.

In [5]:
# spaCy model for the language of the text (de_core_news_sm in this example).
# `nlp` is the pipeline called by lemmatization().
import de_core_news_sm
nlp = de_core_news_sm.load()

Set the variable `language` to the stop-word language name (e.g. 'german'); it is passed to NLTK's `stopwords.words(language)`. See also https://pypi.org/project/stop-words/

In [6]:
# Stop-word language name; passed to stopwords.words(language).
language = 'german'

#### 4 Output data 

Define the location and name of the analysis output

In [47]:
# Folder for the result file. It is concatenated directly with output_name
# when saving, so keep the trailing slash.
output_path = '/Documents/Data/'

In [48]:
# Result file name without extension; '.csv' is appended when saving.
output_name = 'topic_analysis'

#### 5 Missing imports

Some libraries are most likely missing from your machine; you can install them directly from the notebook with the command `!pip install <library name>`, for example:

!pip install gensim

!python -m spacy download de_core_news_sm

## Imports

In [7]:
# Download the NLTK 'stopwords' corpus that stopwords.words(language) reads.
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giacomofederle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import re
import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy


# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [9]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words(language)
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Functions

In [10]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (which strips
    accent marks from the tokens). One token list is yielded per item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

In [11]:
def remove_stopwords(texts):
    """Re-tokenise every document and drop the tokens listed in ``stop_words``.

    Each document is converted with ``str()`` and run through
    ``simple_preprocess`` again, so a document that is already a list of
    tokens is re-tokenised from its string representation.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Hash lookups instead of scanning the whole stop-word list for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

In [12]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

In [13]:
def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    NOTE: the only place this notebook mentions building ``trigram_mod`` is a
    commented-out line, so it must be defined before this function is called.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

In [14]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise every document, keeping only tokens with an allowed POS tag.

    Each document (a sequence of tokens) is joined with single spaces and
    parsed with the module-level ``nlp`` pipeline. The lemma of every parsed
    token whose ``pos_`` is in ``allowed_postags`` is kept.

    Returns a list with one list of lemmas per document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

In [15]:
def ideal_number_topics(id2word, corpus, texts, limit, start=2, step=3):
    """Find the topic count(s) with the highest c_v coherence.

    One ``LdaMallet`` model (using the module-level ``mallet_path``) is trained
    for every topic count in ``range(start, limit, step)`` and scored with
    ``CoherenceModel(..., coherence='c_v')``.

    Args:
        id2word: dictionary passed to LdaMallet and CoherenceModel.
        corpus: corpus passed to LdaMallet.
        texts: tokenised documents passed to CoherenceModel.
        limit: exclusive upper bound of the topic counts to try.
        start: first topic count to try.
        step: increment between topic counts.

    Returns:
        A list of the topic counts that reach the maximum coherence, in the
        order they were evaluated (more than one entry only on an exact tie).

    Raises:
        ValueError: if ``range(start, limit, step)`` contains no topic count.
    """
    # NOTE(review): gensim.models.wrappers is not available in gensim >= 4.0;
    # confirm the installed gensim version provides it.
    candidates = range(start, limit, step)
    if len(candidates) == 0:
        # Fail with a clear message instead of max() complaining about an empty sequence.
        raise ValueError(
            "no topic counts to evaluate: range(start=%r, limit=%r, step=%r) is empty"
            % (start, limit, step))

    coherence_by_num_topics = {}
    for num_topics in candidates:
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        coherence_by_num_topics[num_topics] = CoherenceModel(
            model=model, texts=texts, dictionary=id2word, coherence='c_v').get_coherence()

    best_coherence = max(coherence_by_num_topics.values())
    return [k for k, score in coherence_by_num_topics.items() if score == best_coherence]

In [16]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    For each item produced by ``ldamodel[corpus]`` the first element is read as
    a list of ``(topic_num, proportion)`` pairs. The pair with the highest
    proportion is kept, together with the words of
    ``ldamodel.show_topic(topic_num)`` joined by ", ".

    Args:
        ldamodel: model that can be indexed with ``corpus`` and offers ``show_topic``.
        corpus: corpus handed to ``ldamodel[...]``.
        texts: original documents, attached as the last column.

    Returns:
        DataFrame with the columns ``Dominant_Topic`` (int), ``Perc_Contribution``
        (proportion rounded to 4 decimals), ``Topic_Keywords`` and an unnamed
        column (label ``0``) holding ``texts``.
    """
    # Collect plain records and build the frame once: DataFrame.append is gone in
    # pandas 2.0, and appending row by row copies the whole frame on every document.
    records = []
    keywords_by_topic = {}  # show_topic() result is the same for every document of a topic
    for row in ldamodel[corpus]:
        ranked = sorted(row[0], key=lambda pair: pair[1], reverse=True)
        if not ranked:
            # No topic reported for this document: nothing to record.
            continue
        topic_num, prop_topic = ranked[0]
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

## Topic modelling pipeline

In [17]:
# Raw documents from the 'content' column, coerced to str so that non-string
# cells (e.g. numbers) do not make re.sub() raise TypeError.
data = [str(sent) for sent in df.content.values.tolist()]
# Collapse every run of whitespace into a single space.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove single quotes / apostrophes.
data = [re.sub("'", "", sent) for sent in data]

In [18]:
# Tokenise every document; sent_to_words() is a generator, so materialise it.
data_words = list(sent_to_words(data))

In [20]:
# Learn two-word phrases from the tokenised documents.
# min_count and threshold are gensim's phrase-detection settings
# (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
# Phraser object that make_bigrams() indexes to apply the learned phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
# NOTE: the trigram line below is disabled and `trigram` is never built, so
# `trigram_mod` is undefined and make_trigrams() would raise NameError if called.
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [29]:
# Preprocessing chain: stop-word removal -> bigram merging -> lemmatisation.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
# allowed_postags repeats the default of lemmatization(); only these POS tags are kept.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [30]:
# Dictionary built from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
# Bag-of-words representation of every document, as produced by doc2bow().
corpus = [id2word.doc2bow(text) for text in texts]

In [31]:
# LDA model with a fixed number of 20 topics; random_state pins the seed.
# NOTE(review): `lda_model` is not referenced by any later cell visible in this
# notebook (the analysis uses `lda_model_ideal`) — confirm whether this
# training run is still needed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [32]:
# Evaluate topic counts 2, 4, ..., 14 (range(2, 15, 2)) and keep the list of
# counts with the best coherence. One LdaMallet model is trained per count.
n_topics = ideal_number_topics(id2word = id2word,
                               corpus=corpus,
                               texts=data_lemmatized, start=2,
                               limit=15,
                               step=2)

In [33]:
# ideal_number_topics() returns a list; take its first entry as the topic count.
# NOTE: this rebinds `n_topics` from a list to an int, so the cell cannot be
# re-run without first re-running the cell that computes the list.
n_topics = int(n_topics[0])

In [34]:
# LDA model trained with the selected number of topics; this is the model the
# rest of the notebook analyses. per_word_topics=True is relied on by
# format_topics_sentences(), which reads the first element of each result.
lda_model_ideal = gensim.models.ldamodel.LdaModel(corpus = corpus,
                                              id2word = id2word,
                                              num_topics = n_topics,
                                              random_state = 100,
                                              update_every = 1,
                                              chunksize = 100,
                                              passes = 10,
                                              alpha = 'auto',
                                              per_word_topics = True)

In [35]:
# One row per document: dominant topic, its proportion, the topic keywords and
# the cleaned document text (`data`).
df_topic_sents_keywords = format_topics_sentences(ldamodel = lda_model_ideal,
                                                  corpus = corpus,
                                                  texts = data)

In [36]:
# Same table with the row index turned into a 'Document_No' column and the
# remaining columns relabelled; the last column holds the document text.
# NOTE(review): `df_dominant_topic` is not used by any later cell visible in this notebook.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No',
                                 'Dominant_Topic',
                                 'Topic_Perc_Contrib', 
                                 'Keywords', 
                                 'Most_representative_comment']

In [37]:

# Per-topic summary: one row per dominant topic with its keywords, the number
# of documents it dominates and its share of all documents.
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
topic_contribution = round(topic_counts/topic_counts.sum(), 4)
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic',
                                              'Topic_Keywords']]
# Look the keywords up by topic number. Concatenating the per-document frame
# with the per-topic counts column-wise would align document indices with topic
# numbers and pair every count with an unrelated row.
keywords_by_topic = (topic_num_keywords
                     .drop_duplicates(subset='Dominant_Topic')
                     .set_index('Dominant_Topic')['Topic_Keywords'])
df_dominant_topics = pd.DataFrame({
    'Dominant_Topic': topic_counts.index,
    'Topic_Keywords': keywords_by_topic.reindex(topic_counts.index).values,
    'Num_Documents': topic_counts.values,
    'Perc_Documents': topic_contribution.values,
})

In [38]:
# Empty frame that the next cell fills with one row per topic.
sent_topics_sorteddf_mallet = pd.DataFrame()
# Documents grouped by their dominant topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

In [39]:
# For every dominant topic keep the single document with the highest
# Perc_Contribution. The result is built from the groups alone in one concat
# (instead of growing the frame inside the loop), so re-running this cell
# yields the same table.
top_rows = [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
            for _, grp in sent_topics_outdf_grpd]
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0).reset_index(drop=True)
sent_topics_sorteddf_mallet.columns = ['Topic_Num',
                                       "Topic_Perc_Contrib",
                                       "Keywords",
                                       "Most_representative_comment"]
# Topic_Perc_Contrib is the Perc_Contribution value from format_topics_sentences():
# the proportion of the dominant topic in that document, rounded to 4 decimals.

In [40]:
# NOTE(review): this assigns the column to itself and changes nothing —
# possibly a leftover of an earlier transformation; confirm it can be removed.
sent_topics_sorteddf_mallet['Topic_Num'] = sent_topics_sorteddf_mallet['Topic_Num']

In [41]:
# Store the topic number as an integer (it is the merge key used later).
sent_topics_sorteddf_mallet['Topic_Num'] = sent_topics_sorteddf_mallet['Topic_Num'].astype(int)

In [42]:
# Number of documents per dominant topic, as a one-column frame indexed by topic.
# The column is named 'Dominant_Topic' explicitly: the name value_counts() gives
# its result differs between pandas versions ('count' from pandas 2.0 on), and a
# later cell renames the 'Dominant_Topic' column to 'Count'.
topic_counts = (df_topic_sents_keywords['Dominant_Topic']
                .value_counts()
                .rename('Dominant_Topic')
                .rename_axis(None)
                .to_frame())
# Expose the topic number (the index) as a column so it can serve as merge key.
topic_counts['Topic_Num'] = topic_counts.index

In [43]:
# Attach the per-topic document count to each topic's most representative
# document, joining on the topic number.
df2 = pd.merge(sent_topics_sorteddf_mallet, 
               topic_counts,
              on = 'Topic_Num')

In [44]:
# Relabel the document-count column coming from topic_counts as 'Count'.
# NOTE(review): with pandas >= 2.0 value_counts() names its result 'count', so a
# column called 'Dominant_Topic' may not exist here — verify the rename applies.
df2 = df2.rename(columns={"Dominant_Topic": "Count"})

In [45]:
# Overwrite Topic_Num with the 1-based row position of df2.
# NOTE(review): this discards the model's topic numbers used as merge key;
# presumably the two numberings only line up when every topic has at least one
# document — confirm this renumbering is intended.
df2['Topic_Num'] = df2.index + 1

In [49]:
# Save the per-topic summary as <output_path><output_name>.csv
# (the DataFrame index is written as the first column).
df2.to_csv(output_path+output_name+'.csv')