https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os
import pickle

In [2]:
# NLTK Stop words
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
# available locally — presumably installed beforehand; confirm for fresh setups.
from nltk.corpus import stopwords
# English stop words, consulted by remove_stopwords() further down.
stop_words = stopwords.words('english')

In [3]:
def save_pickle(filename, data, gensim_files_dir):
    """Pickle ``data`` to ``<gensim_files_dir>/<filename>.pickle``.

    The target directory is created when it does not exist yet, so the very
    first run of the notebook (before any cache has been written) does not
    fail with ``FileNotFoundError``.

    Args:
        filename: Base name of the cache file, without the ``.pickle`` suffix.
        data: Any picklable object.
        gensim_files_dir: Directory that holds the cache files.
    """
    # An empty directory string means "current directory"; os.makedirs('')
    # would raise, so only create the directory when one was actually given.
    if gensim_files_dir:
        os.makedirs(gensim_files_dir, exist_ok=True)
    target_path = os.path.join(gensim_files_dir, filename + '.pickle')
    with open(target_path, 'wb') as data_file:
        pickle.dump(data, data_file)

In [4]:
def load_saved_pickle(filename, gensim_files_dir):
    """Load ``<gensim_files_dir>/<filename>.pickle`` if it exists.

    Args:
        filename: Base name of the cache file, without the ``.pickle`` suffix.
        gensim_files_dir: Directory that holds the cache files.

    Returns:
        The unpickled object, or ``False`` when the file is not present.
        Callers test the result with ``if not <value>`` to decide whether the
        data has to be rebuilt.
    """
    pickle_path = os.path.join(gensim_files_dir, filename + '.pickle')
    if os.path.exists(pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        with open(pickle_path, 'rb') as data_file:
            return pickle.load(data_file)
    return False

In [5]:
def data_status(name, val):
    """Pretty-print whether the cached value called ``name`` was loaded.

    Prints ``'<name> => Data loaded'`` for a truthy ``val``; for a falsy one
    (e.g. the ``False`` returned for a missing cache file) the string form of
    ``val`` is shown instead.
    """
    if val:
        status = 'Data loaded'
    else:
        status = str(val)
    pprint(name + ' => ' + status)

In [6]:
gensim_files_dir = 'gensim_files'  # directory that holds the pickled intermediate results

# Try to restore every cached pipeline artefact. A missing cache file yields
# False, which the later cells use as the signal to rebuild that artefact.
_cached = {}
for _name in ('data', 'id2word', 'texts', 'corpus', 'bigram_mod', 'trigram_mod',
              'data_words', 'data_words_nostops', 'data_words_bigrams'):
    _cached[_name] = load_saved_pickle(_name, gensim_files_dir)
    data_status(_name, _cached[_name])

# Expose each artefact under its own notebook-level name.
data = _cached['data']
id2word = _cached['id2word']
texts = _cached['texts']
corpus = _cached['corpus']
bigram_mod = _cached['bigram_mod']
trigram_mod = _cached['trigram_mod']
data_words = _cached['data_words']
data_words_nostops = _cached['data_words_nostops']
data_words_bigrams = _cached['data_words_bigrams']

'data => Data loaded'
'id2word => Data loaded'
'texts => Data loaded'
'corpus => Data loaded'
'bigram_mod => Data loaded'
'trigram_mod => Data loaded'
'data_words => Data loaded'
'data_words_nostops => Data loaded'
'data_words_bigrams => Data loaded'


In [7]:
# Location of the JSON export with the report texts (relative to the notebook).
json_data_path = os.path.join('output','regex_json_file_dep')
# Read it into a DataFrame; the `content` column is what the cleaning cell consumes.
df = pd.read_json(json_data_path)
# Preview the first rows.
df.head()

Unnamed: 0,content
10023,Mediterranean Oil and Gas Plc Annual Report 20...
10024,Mediterranean Oil and Gas Plc Annual Report a...
10025,Mediterranean Oil and Gas Plc Annual Report a...
10050,MEGGITT PLC REPORT AND ACCOUNTS 2013 STRATEGIC...
10051,SUPPLEMENTARY INFORMATION FINANCIAL STATEMENTS...


In [8]:
# Build the cleaned document list only when no cached copy was loaded.
if not data:
    # Per document, in this order:
    #   1. remove e-mail addresses,
    #   2. collapse newlines / whitespace runs into a single space,
    #   3. remove distracting single quotes.
    # The patterns are raw strings so that \S and \s reach the regex engine
    # untouched instead of being parsed as (invalid) Python string escapes,
    # which newer interpreters warn about.
    data = [
        re.sub(r'\s+', ' ', re.sub(r'\S*@\S*\s?', '', sent)).replace("'", "")
        for sent in df.content.values.tolist()
    ]

    save_pickle('data', data, gensim_files_dir)
# Show the first cleaned document as a sanity check.
pprint(data[:1])

['Mediterranean Oil and Gas Plc Annual Report 2011 www. medoilgas. Our '
 'strategy for growth Regional Operator Leverage our competitive advantage '
 'that lies in the breadth and depth of our Italybased team that manages the '
 'fullvalue chain of our EandP business together with our AIMlisting '
 'knowledgeable management team and strong support from our key shareholders '
 'Financial Strength Being debt free we will use the steady income from our '
 'onshore and offshore gas production to underwrite our operating costs '
 'support asset maturation and small capital programmes. Mediterranean Oil and '
 'Gas Plc Annual Report 2011 www. medoilgas. BUSINESS REVIEW CORPORATE '
 'GOVERNANCE FINANCIAL STATEMENTS Balanced Portfolio Use our Resources Factory '
 'to our advantage. Grow production and move resources to reserves by maturing '
 'the portfolio in support of our production growth targets. Balance frontier '
 'exploration with asset maturation and good reservoir management Growth 

 'Production Sharing Contract signed with the Maltese Government on date and '
 'its Addendum signed on date. Draw downs under the Loan Facility at date '
 'amounted to nil date nil. Subsequent to the reporting date on 7 February '
 '2012 the Company borrowed 2. 0m the Principal under the Loan Facility. '
 'Repayment of the entire Principal and accrued interest thereon was completed '
 'on date Refer to Notes 25 and 26 for further information. Management has '
 'prepared cash flow projections which reflect the effect of the '
 'recapitalisation and indicate that the Group can continue to meet its '
 'liabilities as they fall due and meet minimum spend commitments on its '
 'licenses for a period of not less than 12 months from the date of the '
 'financial statements. Basis of accounting and adoption of new and revised '
 'standards a Standards amendments and interpretations effective in 2011 The '
 'following new standards and amendments to standards are mandatory for the '
 'first ti

 'from the Guendalina gas field in December 2011 to ENI S. p. A. which had '
 'previously been sold to Elettrogas S. p. A. The subsequent adjustments '
 'necessary to revenue and accounts receivable have been made and are '
 'reflected in these financial statements. Grant of options to directors On '
 'date 11500000 options were granted to directors over ordinary shares of the '
 'Company. The options have an exercise price of 6 pence and are exercisable '
 'between the date of grant and the third anniversary of the date of grant On '
 'date 6000000 options were granted to William Higgs over ordinary shares of '
 'the Company. The options have an exercise price of 6 pence and are '
 'exercisable between the first and third anniversaries of date On date '
 '2000000 options were granted to employees over ordinary shares of the '
 'Company. The options have an exercise price of 6 pence and are exercisable '
 'between the date of grant and the third anniversary of the date of grant and '
 

 'Academy of Engineering. Mr. Bill Higgs Chief Executive Bill Higgs has over '
 '23 years of global exploration development and operations experience working '
 'with Chevron. Most recently he was senior vice president of Operations for '
 'Saudi Arabian Chevron SAC in Saudi ArabiaKuwait. In his role Bill was the '
 'senior operations representative for SAC that jointly with Kuwait Gulf Oil '
 'Company operates exploration development and production activities for the '
 'onshore Partitioned Zone shared by the Kingdom of Saudi Arabia and Prior to '
 'this Bill was chief strategist for Chevron Corporation and the secretary of '
 'the Corporate Strategy and Planning Committee where he facilitated strategic '
 'dialogue and strategy setting with Chevrons executive leaders. Bills '
 'previous roles at Chevron include manager of Reservoir Management for '
 'Tengizchevroil in Kazakhstan asset manager for the BBLT development in '
 'Angola business development and planning manager for Sasol C

In [9]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    stripped; punctuation is dropped by the tokenizer itself).

    Yields:
        One list of tokens per input item, in input order.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens
# Tokenize the cleaned documents only when no cached token lists were loaded,
# then cache the result for later runs.
if not data_words:
    data_words = list(sent_to_words(data))
    save_pickle('data_words', data_words, gensim_files_dir)
# Show the tokens of the first document as a sanity check.
print(data_words[:1])

[['mediterranean', 'oil', 'and', 'gas', 'plc', 'annual', 'report', 'www', 'medoilgas', 'our', 'strategy', 'for', 'growth', 'regional', 'operator', 'leverage', 'our', 'competitive', 'advantage', 'that', 'lies', 'in', 'the', 'breadth', 'and', 'depth', 'of', 'our', 'italybased', 'team', 'that', 'manages', 'the', 'fullvalue', 'chain', 'of', 'our', 'eandp', 'business', 'together', 'with', 'our', 'aimlisting', 'knowledgeable', 'management', 'team', 'and', 'strong', 'support', 'from', 'our', 'key', 'shareholders', 'financial', 'strength', 'being', 'debt', 'free', 'we', 'will', 'use', 'the', 'steady', 'income', 'from', 'our', 'onshore', 'and', 'offshore', 'gas', 'production', 'to', 'underwrite', 'our', 'operating', 'costs', 'support', 'asset', 'maturation', 'and', 'small', 'capital', 'programmes', 'mediterranean', 'oil', 'and', 'gas', 'plc', 'annual', 'report', 'www', 'medoilgas', 'business', 'review', 'corporate', 'governance', 'financial', 'statements', 'balanced', 'portfolio', 'use', 'our',

In [10]:
# Build the bigram and trigram models (each only when it was not restored from cache).
if not bigram_mod:
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    # The Phraser is the lightweight, faster form used to transform sentences.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    save_pickle('bigram_mod', bigram_mod, gensim_files_dir)

if not trigram_mod:
    # Train on the bigram-transformed corpus. `bigram_mod` is used here instead
    # of the intermediate `bigram` object, because `bigram` does not exist when
    # the bigram model came from cache but the trigram model did not.
    trigram = gensim.models.Phrases(bigram_mod[data_words], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    save_pickle('trigram_mod', trigram_mod, gensim_files_dir)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['mediterranean_oil', 'and', 'gas', 'plc', 'annual', 'report', 'www_medoilgas', 'our', 'strategy', 'for', 'growth', 'regional', 'operator', 'leverage', 'our', 'competitive_advantage', 'that', 'lies', 'in', 'the', 'breadth', 'and', 'depth', 'of', 'our', 'italybased', 'team', 'that', 'manages', 'the', 'fullvalue', 'chain', 'of', 'our', 'eandp', 'business', 'together', 'with', 'our', 'aimlisting', 'knowledgeable', 'management', 'team', 'and', 'strong', 'support', 'from', 'our', 'key', 'shareholders', 'financial', 'strength', 'being', 'debt', 'free', 'we', 'will', 'use', 'the', 'steady', 'income', 'from', 'our', 'onshore', 'and', 'offshore', 'gas', 'production', 'to', 'underwrite', 'our', 'operating', 'costs', 'support', 'asset', 'maturation', 'and', 'small', 'capital', 'programmes', 'mediterranean_oil', 'and', 'gas', 'plc', 'annual', 'report', 'www_medoilgas', 'business', 'review', 'corporate', 'governance', 'financial', 'statements', 'balanced', 'portfolio', 'use', 'our', 'resources', 'f

In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before the tokens are filtered against the
    module-level ``stop_words``.

    Args:
        texts: Iterable of documents (token lists in this notebook).

    Returns:
        A list with one filtered token list per document.
    """
    # Build the lookup set once per call: membership tests on a set are
    # constant-time, whereas the stop word list is scanned for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document in ``texts``.

    Returns:
        A list with one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Returns:
        A list with one transformed document per input document.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each token list is joined with spaces, run through the module-level spaCy
    pipeline ``nlp`` (which must have been created before this is called), and
    reduced to the lemmas of the tokens whose ``pos_`` is in
    ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists.
        allowed_postags: POS tags to keep.

    Returns:
        A list with one list of lemmas per input document.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [token.lemma_ for token in parsed if token.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

In [12]:
# Remove stop words, unless a cached result was loaded; cache the new result.
if not data_words_nostops:
    data_words_nostops = remove_stopwords(data_words)
    save_pickle('data_words_nostops', data_words_nostops, gensim_files_dir)

# Same caching pattern for the bigram-transformed documents.
if not data_words_bigrams:
    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)
    save_pickle('data_words_bigrams', data_words_bigrams, gensim_files_dir)

In [13]:
# Lemmatization is only run when the dictionary or the lemmatized texts were
# not restored from cache (both are built from `data_lemmatized` below).
if not id2word or not texts:
    # Load the spaCy model 'en_core_web_sm' with the parser and NER components
    # disabled (for efficiency).
    # The model has to be installed first, presumably via:
    #   python3 -m spacy download en_core_web_sm
    # In a conda environment, activate it first:
    #   conda activate <envName>
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # Maximum document length accepted by the pipeline.
    # NOTE(review): presumably raised so that the long reports are not rejected — confirm.
    nlp.max_length = 1500000
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Show the lemmas of the first document as a sanity check.
    print(data_lemmatized[:1])

In [14]:
# Each artefact below is rebuilt (and cached) only when it was not loaded.
if not id2word:
    # Create Dictionary (token <-> integer id mapping) from the lemmatized documents
    id2word = corpora.Dictionary(data_lemmatized)
    save_pickle('id2word', id2word, gensim_files_dir)

# The lemmatized documents themselves are kept as `texts`
if not texts:
    texts = data_lemmatized
    save_pickle('texts', texts, gensim_files_dir)

# Term Document Frequency: one bag-of-words list of (token_id, count) pairs per document
if not corpus:
    corpus = [id2word.doc2bow(text) for text in texts]
    save_pickle('corpus', corpus, gensim_files_dir)

# View the bag-of-words of the first document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 5), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 29), (12, 2), (13, 23), (14, 1), (15, 17), (16, 1), (17, 1), (18, 2), (19, 5), (20, 2), (21, 7), (22, 7), (23, 1), (24, 14), (25, 4), (26, 26), (27, 1), (28, 7), (29, 1), (30, 1), (31, 40), (32, 4), (33, 2), (34, 3), (35, 13), (36, 4), (37, 2), (38, 1), (39, 5), (40, 1), (41, 5), (42, 8), (43, 1), (44, 2), (45, 5), (46, 3), (47, 1), (48, 17), (49, 1), (50, 1), (51, 1), (52, 3), (53, 2), (54, 1), (55, 1), (56, 2), (57, 1), (58, 2), (59, 1), (60, 2), (61, 7), (62, 7), (63, 2), (64, 1), (65, 1), (66, 2), (67, 8), (68, 1), (69, 1), (70, 12), (71, 14), (72, 1), (73, 8), (74, 1), (75, 1), (76, 1), (77, 1), (78, 11), (79, 6), (80, 3), (81, 2), (82, 1), (83, 23), (84, 2), (85, 2), (86, 1), (87, 3), (88, 15), (89, 1), (90, 5), (91, 6), (92, 2), (93, 59), (94, 1), (95, 5), (96, 2), (97, 3), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 92), (106, 1), (107, 3), (108, 1), (10

In [15]:
# Look up the token stored under id 0 in the dictionary.
id2word[0]

'abandon'

In [16]:
# Set to False to reuse a previously saved model instead of retraining.
force_create = True
lda_model_path = 'lda_model_trained.model'

# Load the saved model only when it exists and retraining was not requested.
# Every other case trains a fresh model, so `lda_model` is always defined
# after this cell (the previous if/elif pair could fall through both branches
# for a non-bool `force_create`).
if os.path.exists(lda_model_path) and not force_create:
    pprint('Loading Model')
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_path)
else:
    pprint('Building Model')
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=4,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)
    # Persist the trained model so later runs can load it.
    lda_model.save(lda_model_path)

'Building Model'


In [17]:
# Print the keywords of the topics (the model above is built with num_topics=4)
pprint(lda_model.print_topics())
# Apply the model to the training corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.016*"group" + 0.012*"year" + 0.010*"share" + 0.009*"director" + '
  '0.009*"report" + 0.009*"financial" + 0.009*"date" + 0.008*"company" + '
  '0.008*"business" + 0.007*"statement"'),
 (1,
  '0.059*"thea" + 0.040*"ofa" + 0.032*"anda" + 0.022*"toa" + 0.022*"ina" + '
  '0.009*"area" + 0.008*"isa" + 0.008*"shire" + 0.007*"astrazeneca" + '
  '0.007*"ona"'),
 (2,
  '0.019*"group" + 0.016*"year" + 0.015*"date" + 0.014*"share" + '
  '0.014*"financial" + 0.013*"company" + 0.013*"director" + 0.011*"statement" '
  '+ 0.009*"asset" + 0.009*"report"'),
 (3,
  '0.011*"financial" + 0.011*"group" + 0.009*"company" + 0.009*"date" + '
  '0.008*"asset" + 0.008*"statement" + 0.008*"year" + 0.008*"share" + '
  '0.008*"cost" + 0.007*"report"')]


In [18]:
# Visualize the topics
# NOTE(review): this relies on the `pyLDAvis.gensim` module imported at the top;
# confirm that the installed pyLDAvis version still provides it under that name.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Cache the prepared visualization next to the other pickled artefacts.
save_pickle('vis', vis, gensim_files_dir)
# Last expression of the cell: renders the visualization in the notebook.
vis

## Classify sections

In [19]:
# Read one extracted report section as a list of lines (newline characters are kept).
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# it matches how the file was written.
with open('output/xmlParse/10023/10023_business_review.txt', mode='r') as testFile:
    txt = testFile.readlines()
txt

['The Company is required by the Companies Act 2006 to include a review of the business and likely future developments.This information is contained in the Chairmans Statement Chief Executives Report and the Finance Review on pages 6 to 15.\n',
 'Capital Structure The capital structure of the Company was significantly impacted by the Group Recapitalisation in May 2011.\n',
 'Details of the issued share capital together with details of the movements in the Companys issued share capital during the year are shown in Notes 20 24 and 25 to the Financial Statements.\n',
 'The Company has one class of ordinary shares which carry no right to fixed income.\n',
 'Each share carries the right to one vote at general meetings of the Company.\n',
 'In addition following the Group Recapitalisation the Company has one class of deferred shares which do not entitle the holder to receive any dividend or other distribution or to vote at general meetings of the Company.\n',
 'There are no specific restrict

In [20]:
# Remove the ".\n" sequence (final dot followed by the line break) from the sentences.
# Only that exact two-character sequence is removed; other dots and newlines stay.
txt = [sent.replace('.\n', '') for sent in txt]
txt

['The Company is required by the Companies Act 2006 to include a review of the business and likely future developments.This information is contained in the Chairmans Statement Chief Executives Report and the Finance Review on pages 6 to 15',
 'Capital Structure The capital structure of the Company was significantly impacted by the Group Recapitalisation in May 2011',
 'Details of the issued share capital together with details of the movements in the Companys issued share capital during the year are shown in Notes 20 24 and 25 to the Financial Statements',
 'The Company has one class of ordinary shares which carry no right to fixed income',
 'Each share carries the right to one vote at general meetings of the Company',
 'In addition following the Group Recapitalisation the Company has one class of deferred shares which do not entitle the holder to receive any dividend or other distribution or to vote at general meetings of the Company',
 'There are no specific restrictions on the size o

In [21]:
# Tokenize every sentence of the section with the same helper used for the training data.
local_words = list(sent_to_words(txt))
local_words

[['the',
  'company',
  'is',
  'required',
  'by',
  'the',
  'companies',
  'act',
  'to',
  'include',
  'review',
  'of',
  'the',
  'business',
  'and',
  'likely',
  'future',
  'developments',
  'this',
  'information',
  'is',
  'contained',
  'in',
  'the',
  'chairmans',
  'statement',
  'chief',
  'executives',
  'report',
  'and',
  'the',
  'finance',
  'review',
  'on',
  'pages',
  'to'],
 ['capital',
  'structure',
  'the',
  'capital',
  'structure',
  'of',
  'the',
  'company',
  'was',
  'significantly',
  'impacted',
  'by',
  'the',
  'group',
  'in',
  'may'],
 ['details',
  'of',
  'the',
  'issued',
  'share',
  'capital',
  'together',
  'with',
  'details',
  'of',
  'the',
  'movements',
  'in',
  'the',
  'companys',
  'issued',
  'share',
  'capital',
  'during',
  'the',
  'year',
  'are',
  'shown',
  'in',
  'notes',
  'and',
  'to',
  'the',
  'financial',
  'statements'],
 ['the',
  'company',
  'has',
  'one',
  'class',
  'of',
  'ordinary',
  'shar

In [22]:
# Remove Stop Words from the tokenized sentences of the section
local_words_nostops = remove_stopwords(local_words)
local_words_nostops

[['company',
  'required',
  'companies',
  'act',
  'include',
  'review',
  'business',
  'likely',
  'future',
  'developments',
  'information',
  'contained',
  'chairmans',
  'statement',
  'chief',
  'executives',
  'report',
  'finance',
  'review',
  'pages'],
 ['capital',
  'structure',
  'capital',
  'structure',
  'company',
  'significantly',
  'impacted',
  'group',
  'may'],
 ['details',
  'issued',
  'share',
  'capital',
  'together',
  'details',
  'movements',
  'companys',
  'issued',
  'share',
  'capital',
  'year',
  'shown',
  'notes',
  'financial',
  'statements'],
 ['company',
  'one',
  'class',
  'ordinary',
  'shares',
  'carry',
  'right',
  'fixed',
  'income'],
 ['share',
  'carries',
  'right',
  'one',
  'vote',
  'general',
  'meetings',
  'company'],
 ['addition',
  'following',
  'group',
  'company',
  'one',
  'class',
  'deferred',
  'shares',
  'entitle',
  'holder',
  'receive',
  'dividend',
  'distribution',
  'vote',
  'general',
  'meeting

In [23]:
# Convert each sentence to a bag-of-words using the dictionary built for the model.
other_corpus = [id2word.doc2bow(text) for text in local_words_nostops]
# Bag-of-words of the first sentence.
other_corpus[0]

[(26, 1),
 (254, 1),
 (315, 1),
 (367, 1),
 (771, 1),
 (833, 1),
 (954, 1),
 (977, 1),
 (1109, 1),
 (1652, 1),
 (1699, 2),
 (1858, 1),
 (4052, 1),
 (63651, 1),
 (91872, 1)]

In [24]:
# Run the first sentence through the model. As the output below shows, the
# result has three parts; the first one holds the (topic_id, probability)
# pairs of the sentence. The other two are per-word details, presumably
# returned because the model was built with per_word_topics=True.
res = lda_model[other_corpus[0]]
res

([(0, 0.4098484), (2, 0.5807971)],
 [(26, [2, 0]),
  (254, [0, 2]),
  (315, [2, 0]),
  (367, [2, 0]),
  (771, [2, 0]),
  (833, [2, 0]),
  (954, [2, 0]),
  (977, [2, 0]),
  (1109, [2, 0]),
  (1652, [2, 0]),
  (1699, [2, 0]),
  (1858, [2, 0]),
  (4052, [2, 0]),
  (63651, [2, 0]),
  (91872, [2])],
 [(26, [(0, 0.23059973), (2, 0.7692993)]),
  (254, [(0, 0.53518057), (2, 0.4647955)]),
  (315, [(0, 0.4453662), (2, 0.5545347)]),
  (367, [(0, 0.2971389), (2, 0.7028411)]),
  (771, [(0, 0.31935737), (2, 0.68058115)]),
  (833, [(0, 0.39826298), (2, 0.60165846)]),
  (954, [(0, 0.48054186), (2, 0.5194196)]),
  (977, [(0, 0.44776267), (2, 0.5521779)]),
  (1109, [(0, 0.41901943), (2, 0.5803123)]),
  (1652, [(0, 0.4187561), (2, 0.581221)]),
  (1699, [(0, 0.9659782), (2, 1.0339314)]),
  (1858, [(0, 0.3047906), (2, 0.69518644)]),
  (4052, [(0, 0.46915948), (2, 0.52152467)]),
  (63651, [(0, 0.012495199), (2, 0.019925948)]),
  (91872, [(2, 0.011373609)])])

In [25]:
# (topic_id, probability) pairs for the sentence.
res[0]

[(0, 0.4098484), (2, 0.5807971)]

In [26]:
# Pick the (topic_id, probability) pair with the highest probability.
max(res[0], key = lambda item:item[1])

(2, 0.5807971)

In [27]:
# Number of parts in the model output for one sentence.
len(res)

3

In [28]:
def classify_sentences(current_text, current_model):
    """Return the topic distribution of every sentence in ``current_text``.

    The sentences go through the same steps as the exploratory cells above:
    the ".\\n" sequence is removed, the text is tokenized with
    ``sent_to_words``, stop words are dropped with ``remove_stopwords`` and
    each sentence is converted to a bag-of-words with the module-level
    ``id2word`` dictionary.

    Args:
        current_text: List of sentences (strings).
        current_model: Model that is indexed with a bag-of-words; element 0 of
            its output is taken as the sentence's topic distribution.
            NOTE(review): this matches a model built with
            per_word_topics=True, as in this notebook — confirm for others.

    Returns:
        A list with one entry per sentence, each a list of
        ``(topic_id, probability)`` pairs.
    """
    # Remove the ".\n" sequence from the sentences
    stripped_sentences = [line.replace('.\n', '') for line in current_text]
    # Tokenize, then remove stop words
    filtered_tokens = remove_stopwords(list(sent_to_words(stripped_sentences)))

    bags_of_words = [id2word.doc2bow(tokens) for tokens in filtered_tokens]
    return [current_model[bag][0] for bag in bags_of_words]

In [29]:
# Returns one entry per sentence, holding the classification data
# ((topic_id, probability) pairs) for that sentence
class_data = classify_sentences(txt, lda_model)
class_data

[[(0, 0.40578693), (2, 0.5848586)],
 [(0, 0.3250929), (2, 0.66092896), (3, 0.0113860145)],
 [(0, 0.060012203), (2, 0.9296433)],
 [(0, 0.13525267), (2, 0.8493725), (3, 0.012523283)],
 [(0, 0.18271296), (2, 0.7979494), (3, 0.01575057)],
 [(0, 0.1881984), (2, 0.8029826)],
 [(0, 0.31334195), (2, 0.6712602), (3, 0.012541767)],
 [(0, 0.23344456), (2, 0.7472428), (3, 0.015731208)],
 [(0, 0.15101391), (2, 0.8259365), (3, 0.01877334)],
 [(0, 0.11518878), (2, 0.8696095), (3, 0.012381966)],
 [(0, 0.81928277), (2, 0.16810448), (3, 0.010273785)]]

In [30]:
# For each sentence keep the (topic_id, probability) pair with the highest
# probability. Iterating over the entries directly avoids the index
# bookkeeping of range(len(...)) and leaves no loop index behind in the
# notebook namespace.
final_results_list = [
    max(sentence_topics, key=lambda item: item[1])
    for sentence_topics in class_data
]

In [31]:
# Here we can see that this section can be classified as topic number 2:
# in the output below every sentence except the last one (topic 0) has topic 2
# as its most probable topic.
final_results_list

[(2, 0.5848586),
 (2, 0.66092896),
 (2, 0.9296433),
 (2, 0.8493725),
 (2, 0.7979494),
 (2, 0.8029826),
 (2, 0.6712602),
 (2, 0.7472428),
 (2, 0.8259365),
 (2, 0.8696095),
 (0, 0.81928277)]