# Topic Modelling with LSA
source: https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
# English stop-word list from NLTK, kept as a module-level list because
# pre_processing() looks this name up when filtering tokens.
stop_words = stopwords.words('english')
# Let pandas show up to 200 characters per cell so document previews stay readable.
pd.set_option("display.max_colwidth", 200)

# Import data

In [12]:
import pandas as pd

# Load the corpus: one row per document, with a 'title' and an 'article' column.
data = pd.read_csv("data/data1.csv")

# One text per document: the title, a single space, then the article body.
features = data['title'].add(" ").add(data['article'])

# Show the combined documents as this cell's output.
features

0    (EU 17 2017 establishment Union framework collection management use data fisheries sector Council Regulation (EC 1.With view (EU Regulation management biological, environmental, technical socioeco...
1    Regulation (EU) 2019/833 European Parliament Council conservation enforcement measures Regulatory Area Northwest Atlantic Fisheries Organisation Regulation (EU Council Regulations EC (EC 1.This Re...
2    Regulation (EU 1303/2013 European Parliament Council 17 December common provisions European Regional Development Fund European Social Fund Cohesion Fund European Agricultural Fund Rural Developmen...
3    Regulation (EU) 2019/473 European Parliament Council 19 March 2019 European Fisheries Control Agency Regulation provision European Fisheries Control Agency the Agency operational coordination Memb...
dtype: object

# Pre-processing

In [30]:
def pre_processing(features):
    """Clean raw documents and split each one into a list of tokens.

    Steps, applied to every document in order:

    1. Replace every character that is not an ASCII letter or ``#`` with a space.
    2. Drop words of three characters or fewer.
    3. Lower-case the remaining words.
    4. Drop stop-words: the module-level ``stop_words`` list plus a fixed set
       of filler words specific to this corpus.

    Parameters
    ----------
    features : pandas.Series
        One raw text document per element.

    Returns
    -------
    pandas.Series
        Same index as ``features``; each element is a list of cleaned tokens.
        Missing documents (``None`` / ``NaN``) yield an empty list.

    Notes
    -----
    The module-level ``stop_words`` list is only read, never modified, so
    calling this function repeatedly always gives the same result.
    """
    # Filler words that appear throughout this corpus and carry no topical signal.
    corpus_stop_words = ('programme', 'accordance', 'article', 'state',
                         'member', 'this', 'annex', 'paragraph')

    # Local set: O(1) membership tests, and the shared list is not grown on
    # every call (the previous `stop_words.extend(...)` appended duplicates
    # each time the cell was re-run).
    excluded = set(stop_words).union(corpus_stop_words)

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the text untouched.
    letters_only = features.str.replace("[^a-zA-Z#]", " ", regex=True)

    def to_tokens(doc):
        # Missing values survive .str.replace as NaN; treat them as empty documents.
        if not isinstance(doc, str):
            return []
        # The length filter runs before lower-casing, as in the original pipeline.
        long_words = (word.lower() for word in doc.split() if len(word) > 3)
        return [word for word in long_words if word not in excluded]

    return letters_only.apply(to_tokens)

# Preview the tokenised corpus; the return value is displayed as the cell
# output but not assigned to any variable.
pre_processing(features)

  new_features = features.str.replace("[^a-zA-Z#]", " ")


0    [establishment, union, framework, collection, management, data, fisheries, sector, council, regulation, view, regulation, management, biological, environmental, technical, socioeconomic, data, fis...
1    [regulation, european, parliament, council, conservation, enforcement, measures, regulatory, area, northwest, atlantic, fisheries, organisation, regulation, council, regulations, regulation, union...
2    [regulation, european, parliament, council, december, common, provisions, european, regional, development, fund, european, social, fund, cohesion, fund, european, agricultural, fund, rural, develo...
3    [regulation, european, parliament, council, march, european, fisheries, control, agency, regulation, provision, european, fisheries, control, agency, agency, operational, coordination, states, com...
dtype: object

# Document-Term Matrix

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting for the document-term matrix.
vectorizer = TfidfVectorizer(
    max_df=0.5,            # skip terms found in more than half of the documents
    max_features=100,      # keep top 100 terms
    smooth_idf=True,
    stop_words='english',  # scikit-learn's built-in English stop-word list
)

# NOTE(review): the matrix is built from the raw `features` strings, not from
# the output of pre_processing(), so the custom stop-words and short-word
# filter defined there do not affect X. Confirm this is intended.
X = vectorizer.fit_transform(features)

# (documents, terms) shape of the document-term matrix
X.shape

(4, 100)

In [38]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD represents documents and terms as vectors in a shared
# low-dimensional space; each of the 4 components is read as one topic.
svd_model = TruncatedSVD(
    n_components=4,
    algorithm='randomized',
    n_iter=100,
    random_state=122,  # fixed seed so the randomized solver is reproducible
).fit(X)

# Number of fitted components (one row of components_ per topic)
svd_model.components_.shape[0]


4

In [45]:
# Vocabulary terms in column order, so terms[j] labels column j of each component.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back for older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = list(vectorizer.get_feature_names())

# For every topic (SVD component), print the 7 terms with the largest weights.
for i, comp in enumerate(svd_model.components_):
    top_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for term, weight in top_terms:
        # Plain str/float keep the printed tuples free of NumPy scalar reprs.
        print((str(term), float(weight)))

Topic 0: 
('inspection', 0.5458921720103344)
('vessel', 0.42433304989011494)
('director', 0.32659968907951076)
('deployment', 0.2537436951796634)
('scientific', 0.19465496309628796)
('port', 0.1656213092988408)
('flag', 0.16317756469331673)
Topic 1: 
('scientific', 0.747621190790449)
('groups', 0.25908587375138703)
('funds', 0.20524221283240723)
('fund', 0.18391834656410525)
('esi', 0.13238566974904195)
('committee', 0.1113511362400589)
('examination', 0.10959282858156098)
Topic 2: 
('funds', 0.3514757707807913)
('fund', 0.31495880758278694)
('esi', 0.2267094798542766)
('subparagraph', 0.16036575994800092)
('expenditure', 0.1599840427602731)
('partnership', 0.1430247725255168)
('instruments', 0.1362274480059432)
Topic 3: 
('director', 0.4030157570075093)
('inspection', 0.3520367742535239)
('deployment', 0.30189538340494254)
('scientific', 0.22051406947245353)
('budget', 0.12391133417601838)
('groups', 0.07155751606669539)
('revenue', 0.05255205943651698)
