# Part 1 : 

# 1/ Extract data :

## Create an empty df 

## Extract corpus (web scraping on XML) and 1st article cleaning (regex)

## Add extracted data in the df and save the df

#  ____________________________________________________________________________________________

# Part 2 :

# 2 / Pre-processing :

## Open dataset :

In [1]:
#read Wiki_corpus.pkl :
import pandas as pd

df_corpus = pd.read_pickle("..//DATA//Wiki_corpus.pkl")  
df_corpus

Unnamed: 0,article_nb,corpus
0,article_2,Anarchism is a political philosophy and Polit...
1,article_66,Alabama Alabama is nicknamed the Northern fli...
2,article_68,In Greek mythology Achilles was a hero of the ...
3,article_71,collapsible list collapsible list collapsible...
4,article_72,An American in Paris is a jazz influenced orc...
...,...,...
50252,article2_83521,Thornhill is a village and civil parish in th...
50253,article2_83525,Walton is a village and civil parishes in Eng...
50254,article2_83531,Walton Hall may refer to Walton Hall Cheshire...
50255,article2_83534,In linear algebra a Toeplitz matrix or diagona...


## Select data for training (aka, training dataset)

In [2]:
# Randomly sample 80% of 73000 articles from the corpus for training :
from sklearn.model_selection import train_test_split

df_corpus_train, df_corpus_test = train_test_split(df_corpus, test_size=0.2, random_state=42)
df_corpus_train

Unnamed: 0,article_nb,corpus
23542,article2_29523,The variant spelling Elphin may refer to Sain...
7238,article_16334,Pacific Overtures is a Musical theater musica...
26840,article2_39022,The Countryside Agency was a statutory body s...
39117,article2_62265,Springport is a Administrative divisions of N...
12229,article2_165,A noise weighting is a specific amplitude vs f...
...,...,...
11284,article_25358,Automatic taxobox A frog is any member of a d...
44732,article2_71638,Geronimo is an unincorporated community and c...
38158,article2_60810,Torning Township is a township in Swift Count...
860,article_2189,In baseball statistics total bases does not in...


In [3]:
# save df_corpus_test as a pickle file :
df_corpus_test.to_pickle("..//DATA//Wiki_corpus_test.pkl") 

## Remove stop words from sentences & lemmatize words

In [4]:
# Function to remove stop words from sentences & lemmatize words. (pass the article text as string "doc")
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
#exclude = set(string.punctuation) #remove punctuation, but useless here.
lemma = WordNetLemmatizer()

def clean(doc):
    """Turn an article string into a list of lemmatized content words.

    The text is lower-cased and split on whitespace; words found in the
    module-level `stop` set are dropped, the remaining words are lemmatized
    as verbs with the module-level `lemma` object, and only lemmas longer
    than 2 characters are returned.
    """
    tokens = []
    for raw_word in doc.lower().split():
        # skip stop words before doing any lemmatization work
        if raw_word in stop:
            continue
        # 'v' asks the lemmatizer to treat the word as a verb
        base_form = lemma.lemmatize(raw_word, 'v')
        # only keep words which are greater than 2 characters
        if len(base_form) > 2:
            tokens.append(base_form)
    return tokens

In [5]:
# Cleaning all the df_corpus_train articles :
df_corpus_train["corpus"] = df_corpus_train['corpus'].apply(clean)
df_corpus_train

Unnamed: 0,article_nb,corpus
23542,article2_29523,"[variant, spell, elphin, may, refer, saint, el..."
7238,article_16334,"[pacific, overtures, musical, theater, musical..."
26840,article2_39022,"[countryside, agency, statutory, body, set, en..."
39117,article2_62265,"[springport, administrative, divisions, new, y..."
12229,article2_165,"[noise, weight, specific, amplitude, frequency..."
...,...,...
11284,article_25358,"[automatic, taxobox, frog, member, diverse, la..."
44732,article2_71638,"[geronimo, unincorporated, community, census, ..."
38158,article2_60810,"[torning, township, township, swift, county, m..."
860,article_2189,"[baseball, statistics, total, base, increase, ..."


# ________________________________________________________________________________

# 3/ Building word dictionary :

## Creating term dictionary of corpus, where each unique term is assigned an index

In [6]:
# Creating term dictionary of corpus, where each unique term is assigned an index.
from gensim import corpora

dictionary = corpora.Dictionary(df_corpus_train["corpus"])
print(dictionary)

Dictionary<90438 unique tokens: ['also', 'attribute', 'book', 'cathedral', 'ceredigion']...>


## Filter out terms which occur in fewer than 4 articles (aim: reduce overfitting) or in more than 40% of the articles (aim: reduce underfitting)

In [7]:
# Filter out terms which occur in fewer than 4 articles (aim: reduce overfitting) or in more than 40% of the articles (aim: reduce underfitting)
"""
All the tokens in the dictionary which either have occurred in less than 4 articles or have occurred in more than 40% of the 
articles are removed from the dictionary, as these words will not be contributing to the various themes or topics.
"""

dictionary.filter_extremes(no_below=4, no_above=0.4)
print(dictionary)

Dictionary<24162 unique tokens: ['also', 'attribute', 'book', 'cathedral', 'ceredigion']...>


## List of few words which are removed from dictionary as they are content neutral

In [8]:
# List of few words which are removed from dictionary as they are content neutral

"""
After printing the most frequent words of the dictionary, we found that few words which are mostly content neutral words are also 
present in the dictionary. These words may lead to modeling of “word distribution”(topic) which is neutral and do not capture any 
theme or content. We made a list of such words and filtered all such words.
"""

stoplist = set('also use make people know many call include part find become like mean often different \
               usually take wikt come give well get since type list say change see refer actually iii \
               aisne kinds pas ask would way something need things want every str'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
print(dictionary)

Dictionary<24122 unique tokens: ['attribute', 'book', 'cathedral', 'ceredigion', 'composition']...>


## Analyse The most frequent words with their respective frequencies

In [9]:
# Build the list of (token, id) pairs and the list of ids from the dictionary.
# NOTE: token2id maps each token to its integer id, NOT to a frequency, so
# despite its name `words_frequency` holds (token, id) pairs.
words_frequency = list(dictionary.token2id.items())
ids = list(dictionary.token2id.values())

# Sort the (token, id) pairs by ascending id (x[1] is the id).
# The ids are sorted by dictionary[x] -- presumably the token string for id x,
# i.e. alphabetical order of the tokens (TODO confirm against gensim docs).
words_frequency.sort(key=lambda x: x[1], reverse=False)
ids.sort(key=lambda x: dictionary[x], reverse=False)

# Print the (token, id) pairs in id order; the first pair shown is ('attribute', 0).
# NOTE(review): this is not a frequency ranking. Real per-token document counts
# are presumably available through dictionary.dfs -- confirm before relying on it.
print("Words Frequency:")
print(words_frequency) # (token, id) pairs ordered by id, e.g. ('attribute', 0)

# Each word is also given a unique id in the vocabulary (dictionary) :
print("\nIDs:")
print(ids) # List of ids, ordered by the sort key above

Words Frequency:
[('attribute', 0), ('book', 1), ('cathedral', 2), ('ceredigion', 3), ('composition', 4), ('compositions', 5), ('conquest', 6), ('county', 7), ('date', 8), ('detail', 9), ('diocese', 10), ('earliest', 11), ('example', 12), ('ireland', 13), ('late', 14), ('lord', 15), ('may', 16), ('medieval', 17), ('mythological', 18), ('mythology', 19), ('name', 20), ('norman', 21), ('occur', 22), ('poems', 23), ('predate', 24), ('probably', 25), ('roscommon', 26), ('saint', 27), ('scotland', 28), ('several', 29), ('son', 30), ('spell', 31), ('sutherland', 32), ('taliesin', 33), ('town', 34), ('uncertain', 35), ('variant', 36), ('village', 37), ('welsh', 38), ('abundance', 39), ('actors', 40), ('additional', 41), ('american', 42), ('appear', 43), ('asian', 44), ('audience', 45), ('award', 46), ('away', 47), ('black', 48), ('broadway', 49), ('cast', 50), ('catch', 51), ('century', 52), ('classic', 53), ('close', 54), ('clothe', 55), ('company', 56), ('contemplation', 57), ('contrast', 5

# __________________________________________________________________________________

# 4/ Feature extraction (Bag of Words) :

## Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

In [10]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df_corpus_train["corpus"]]
doc_term_matrix

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 3),
  (34, 2),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1)],
 [(1, 2),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 3),
  (50, 4),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 2),
  (67, 3),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 3),
  (80, 3),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 2)

### Example on article1_9788 :

# ______________________________________________________________________________________

# 5/ LDA Model Training :

## Naive modeling 

In [17]:
from gensim.models.ldamodel import LdaModel as Lda
# Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(corpus=doc_term_matrix, id2word = dictionary, num_topics=60, random_state=42, alpha="symmetric", eta=0.01) #iterations=500,
ldamodel

<gensim.models.ldamodel.LdaModel at 0x24f5a72f8d0>

In [18]:
from gensim.models import CoherenceModel

coherence_model = CoherenceModel(model=ldamodel, texts=df_corpus_train["corpus"], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
coherence_score

0.5022666562981913

# Optimize hyper-parameters

In [19]:
# Hyper-parameter optimisation :
# Bayesian optimisation (optuna) :
from gensim.models import CoherenceModel
import optuna

def objective(trial):
    """Optuna objective: train one LDA model and return its c_v coherence.

    Three hyper-parameters are sampled from `trial`:
      * 'alpha'      -- 'symmetric' or 'asymmetric', passed as the LDA `alpha`
      * 'beta'       -- float in [0.01, 1], passed as the LDA `eta`
      * 'num_topics' -- integer from 10 to 110 in steps of 10

    The model is trained on the module-level `doc_term_matrix` / `dictionary`
    and scored against the tokenised training articles.
    """
    # Dict literals are evaluated top to bottom, so the parameters are
    # suggested in the order alpha, beta, num_topics.
    sampled = {
        "alpha": trial.suggest_categorical('alpha', ['symmetric', 'asymmetric']),
        "eta": trial.suggest_float('beta', 0.01, 1),
        "num_topics": trial.suggest_int('num_topics', 10, 110, step=10),
    }

    candidate = Lda(random_state=42, corpus=doc_term_matrix, id2word=dictionary, **sampled)

    scorer = CoherenceModel(
        model=candidate,
        texts=df_corpus_train["corpus"],
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2023-07-23 21:15:55,953][0m A new study created in memory with name: no-name-4d194db1-86eb-488c-9574-fbf54ef5e0d4[0m
[32m[I 2023-07-23 21:16:22,409][0m Trial 0 finished with value: 0.5748625190664874 and parameters: {'alpha': 'symmetric', 'beta': 0.6224078425643267, 'num_topics': 10}. Best is trial 0 with value: 0.5748625190664874.[0m
[32m[I 2023-07-23 21:16:59,216][0m Trial 1 finished with value: 0.5261468593855865 and parameters: {'alpha': 'asymmetric', 'beta': 0.9965953065999386, 'num_topics': 40}. Best is trial 0 with value: 0.5748625190664874.[0m
[32m[I 2023-07-23 21:18:26,662][0m Trial 2 finished with value: 0.5125560013860007 and parameters: {'alpha': 'symmetric', 'beta': 0.8771941039125456, 'num_topics': 100}. Best is trial 0 with value: 0.5748625190664874.[0m
[32m[I 2023-07-23 21:19:40,336][0m Trial 3 finished with value: 0.49182282340976063 and parameters: {'alpha': 'symmetric', 'beta': 0.12930112009039063, 'num_topics': 80}. Best is trial 0 with value: 0

In [20]:
trial = study.best_trial
print('coherence_score: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

coherence_score: 0.6105552076920123
Best hyperparameters: {'alpha': 'asymmetric', 'beta': 0.5806203700038874, 'num_topics': 10}


## Adjust the model with best hyper parameters 

In [21]:
#Adjust the model with best hyper parameters :
best_ldamodel = Lda(random_state=42, corpus=doc_term_matrix, id2word=dictionary, num_topics=(trial.params)["num_topics"], alpha=(trial.params)["alpha"], eta=(trial.params)["beta"])

## Analyse the result of all topics 

In [22]:
# Analyse the result : print all topics with their 10 most probable words
list_topic = best_ldamodel.print_topics(num_words=10) #num_topics=50
list_topic

[(0,
  '0.013*"park" + 0.012*"south" + 0.012*"national" + 0.011*"river" + 0.010*"north" + 0.010*"city" + 0.010*"area" + 0.008*"west" + 0.008*"east" + 0.006*"town"'),
 (1,
  '0.068*"new" + 0.039*"york" + 0.035*"town" + 0.034*"county" + 0.019*"wisconsin" + 0.017*"census" + 0.017*"jersey" + 0.017*"population" + 0.013*"florida" + 0.012*"city"'),
 (2,
  '0.113*"county" + 0.046*"census" + 0.041*"township" + 0.041*"population" + 0.024*"area" + 0.020*"city" + 0.019*"pennsylvania" + 0.014*"metropolitan" + 0.012*"statistical" + 0.012*"minnesota"'),
 (3,
  '0.037*"year" + 0.021*"calendar" + 0.016*"name" + 0.014*"period" + 0.014*"species" + 0.014*"years" + 0.013*"europe" + 0.013*"era" + 0.012*"early" + 0.010*"medieval"'),
 (4,
  '0.008*"church" + 0.007*"ancient" + 0.007*"roman" + 0.007*"king" + 0.007*"century" + 0.006*"greek" + 0.005*"name" + 0.005*"empire" + 0.005*"son" + 0.005*"saw"'),
 (5,
  '0.011*"war" + 0.009*"party" + 0.009*"government" + 0.008*"president" + 0.006*"right" + 0.006*"law" + 0.

## Save the model and the list of topics

In [23]:
# Save the LDA model : dump it with pickle for future use
import pickle

# The context manager closes the file even if pickle.dump raises,
# so no handle is leaked and no half-open file is left behind.
with open('..//MODEL//lda_model_sym_wiki.pkl','wb') as ldafile:
    pickle.dump(best_ldamodel,ldafile)

In [24]:
#Save the list of topics in a text file, one "(topic_id, 'weighted words')" tuple per line :
# A single `with` block creates/truncates the file and closes it on exit.
# (The former extra open(..., "wb") call leaked an unclosed handle, and the
# f.close() after the `with` block was redundant.)
# utf-8 is set explicitly so the output does not depend on the platform default.
with open("..//REPORTS//OUTPUT//List_of_topic.txt", "w", encoding="utf-8") as f :
    for topic in list_topic :
        f.write(str(topic) + "\n")

# _____________________________________________________________________________________

# Part 3 :

## Load the LDA model

In [25]:
# Load the LDA model that was previously saved with pickle
import pickle

# NOTE: unpickling can execute arbitrary code -- only load model files you produced yourself.
# The context manager guarantees the file handle is closed after loading.
with open('..//MODEL//lda_model_sym_wiki.pkl','rb') as model_file:
    lda_model = pickle.load(model_file)

## Open test data

In [26]:
#read Wiki_corpus_test.pkl :
import pandas as pd

df_corpus_test = pd.read_pickle("..//DATA//Wiki_corpus_test.pkl")  
df_corpus_test

Unnamed: 0,article_nb,corpus
1640,article_3877,This important documentation of the early stag...
17967,article2_13473,A health system health care system or healthca...
32228,article2_49108,Mauckport is a town in Heth Township Harrison...
32078,article2_48931,Hartford City is a city in the U S state of I...
42526,article2_67980,Cussewago Township is a township township in ...
...,...,...
43611,article2_69632,US Census population Richmond Township is a t...
11416,article_25796,The Tandy is the first in a line of IBM PC co...
27292,article2_40167,Coolah Tops is a national park located in New...
31847,article2_48557,Leaf River is a village in Leaf River Townshi...


# Pre-processing on test set :

In [27]:
# 1/ Clean article with regex :
import re

def is_ascii(s):
    """Return True when every character of `s` has a code point below 128.

    An empty string is considered ASCII.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True


def clean_with_regex(article_txt) :
    """Strip markup from a raw article and return plain, space-separated text.

    Only the text located before the first "==" is kept (presumably the lead
    section that precedes the first wiki heading). Templates, file/image
    links, URLs, digits, parenthesised text, category lines, infobox/taxobox
    rows, bullet lines and angle-bracket tags are then removed, every
    punctuation character is turned into a space and runs of spaces are
    collapsed.

    Parameters
    ----------
    article_txt : str or None
        Raw article text.

    Returns
    -------
    str or None
        * None when `article_txt` is None,
        * "" when the cleaned text still contains non-ASCII characters,
        * the cleaned text otherwise.
    """
    if article_txt is None:
        return None

    # Keep only the text before the first "==".
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently drop the last character, so the text is cut only on a real match.
    heading_start = article_txt.find("==")
    if heading_start != -1:
        article_txt = article_txt[:heading_start]

    # Patterns removed in this exact order. `.` does not match a newline, so
    # the `.*` parts never run past the end of the current line.
    removal_patterns = (
        # text written between double curly braces
        r"{{.*}}",
        # file and image attachments
        r"\[\[File:.*\]\]",
        r"\[\[Image:.*\]\]",
        # unwanted lines starting with special characters
        r"\n: \'\'.*",
        r"\n!.*",
        r"^:\'\'.*",
        # non-breaking space entities
        r"&nbsp",
        # URLs
        r"http\S+",
        # digits
        r"\d+",
        # text written between parentheses
        r"\(.*\)",
        # sentence which tells the category of the article
        r"Category:.*",
        # rows inside an infobox or taxobox
        r"\| .*",
        r"\n\|.*",
        r"\n \|.*",
        r".* \|\n",
        r".*\|\n",
        # infobox or taxobox openers
        r"{{Infobox.*",
        r"{{infobox.*",
        r"{{taxobox.*",
        r"{{Taxobox.*",
        r"{{ Infobox.*",
        r"{{ infobox.*",
        r"{{ taxobox.*",
        r"{{ Taxobox.*",
        # lines starting with "* "
        r"\* .*",
        # text written between angle brackets
        r"<.*>",
        # new line characters
        r"\n",
    )
    for pattern in removal_patterns:
        article_txt = re.sub(pattern, "", article_txt)

    # replace all punctuations with space
    article_txt = re.sub(r"\!|\"|\#|\$|\%|\&|\'|\(|\)|\*|\+|\,|\-|\.|\/|\:|\;|\<|\=|\>|\?|\@|\[|\\|\]|\^|\_|\`|\{|\||\}|\~"," ",article_txt)

    # Replace non-breaking spaces with regular spaces BEFORE collapsing runs
    # of spaces, so a converted character cannot leave a double space behind.
    article_txt = article_txt.replace(u'\xa0', u' ')

    # replace consecutive multiple space with single space
    article_txt = re.sub(r" +"," ",article_txt)

    # Articles that still contain non-ASCII characters are discarded.
    if is_ascii(article_txt):
        return article_txt
    return ""
        

#df_corpus_test["corpus"] = df_corpus_test["corpus"].apply(clean_with_regex)

In [28]:
# 2/ drop empty and useless article :
#df_corpus_test =  df_corpus_test[(df_corpus_test['corpus'] != None) & (df_corpus_test['corpus'] != "") & (len(df_corpus_test['corpus']) > 150)]
#df_corpus_test

In [29]:
# 3/ Clean article with stop word and lemmatization :

# Function to remove stop words from sentences & lemmatize words. (pass the article text as string "doc")
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
#exclude = set(string.punctuation) #remove punctuation, but useless here.
lemma = WordNetLemmatizer()


def clean_with_stopword_lemmatization(doc):
    """Tokenise `doc` and return its lemmatized, stop-word-free tokens.

    Steps: lower-case and split on whitespace, drop words contained in the
    module-level `stop` set, lemmatize each remaining word as a verb with the
    module-level `lemma` object, and keep only results longer than 2 characters.
    """
    # lazily filter out the stop words ...
    kept_words = (word for word in doc.lower().split() if word not in stop)
    # ... lemmatize what is left ('v' = treat the word as a verb) ...
    lemmas = (lemma.lemmatize(word, 'v') for word in kept_words)
    # ... and only take words which are greater than 2 characters
    return [token for token in lemmas if len(token) > 2]

#df_corpus_test["corpus"] = df_corpus_test['corpus'].apply(clean_with_stopword_lemmatization)
#df_corpus_test

# _______________________________________________________________________________________

# 7/ Document clustering :

In [31]:
# Clustering articles
from operator import itemgetter

def cluster_similar_documents(doc):
    """Assign an article text to its most probable LDA topic.

    The text is cleaned with `clean_with_regex`, treated as empty when it is
    missing or not longer than 150 characters, tokenised with
    `clean_with_stopword_lemmatization`, converted to a bag of words with the
    dictionary of the module-level `lda_model`, and scored by that model.

    Parameters
    ----------
    doc : str or None
        Article text.

    Returns
    -------
    pd.Series
        [theme, theme_proba]: the label of the best topic and its probability
        rounded to 3 decimals, or ["unknown", 0] when no topic reaches the
        0.05 minimum probability.
    """
    # Human-readable labels indexed by LDA topic id
    # (label a topic "unknown" if it looks neutral).
    topics = ["Geographic Locations and Nature", "New York and Wisconsin Towns", "Census Data and Statistical Information", 
              "Historical Periods and Eras", "Ancient History and Roman Empire", "Politics and Government", "Computer Science and Programming", 
              "Mathematical Concepts and Space", "Games, Universities, and Design", "Entertainment Industry"]

    # Pre-processing :
    doc = clean_with_regex(doc)
    # `or` short-circuits, so len() is never evaluated on None. The previous
    # `(doc != None) & (len(doc) > 150)` evaluated both sides and raised
    # TypeError whenever the cleaned document was None.
    if doc is None or len(doc) <= 150:
        doc = ""
    tokens = clean_with_stopword_lemmatization(doc)

    # Bag-of-words representation using the model's own dictionary
    doc_bow = lda_model.id2word.doc2bow(tokens)

    # (topic_id, probability) pairs with probability >= 0.05
    doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.05)

    if not doc_topics:
        return pd.Series(["unknown", 0])

    # Most probable topic first
    ranked = sorted(doc_topics, key=itemgetter(1), reverse=True)
    best_id, best_proba = ranked[0]
    # If the best topic is labelled "unknown", fall back to the runner-up when there is one.
    if topics[best_id] == "unknown" and len(ranked) > 1:
        best_id, best_proba = ranked[1]

    return pd.Series([topics[best_id], round(best_proba, 3)])

df_corpus_test[["topic", "topic_proba"]] = df_corpus_test['corpus'].apply(lambda corpus : cluster_similar_documents(corpus))
df_corpus_test

Unnamed: 0,article_nb,corpus,topic,topic_proba
1640,article_3877,This important documentation of the early stag...,Computer Science and Programming,0.973
17967,article2_13473,A health system health care system or healthca...,Computer Science and Programming,0.856
32228,article2_49108,Mauckport is a town in Heth Township Harrison...,Census Data and Statistical Information,0.951
32078,article2_48931,Hartford City is a city in the U S state of I...,Geographic Locations and Nature,0.470
42526,article2_67980,Cussewago Township is a township township in ...,Geographic Locations and Nature,0.203
...,...,...,...,...
43611,article2_69632,US Census population Richmond Township is a t...,Census Data and Statistical Information,0.937
11416,article_25796,The Tandy is the first in a line of IBM PC co...,Computer Science and Programming,0.705
27292,article2_40167,Coolah Tops is a national park located in New...,Geographic Locations and Nature,0.886
31847,article2_48557,Leaf River is a village in Leaf River Townshi...,Census Data and Statistical Information,0.757


In [32]:
# Analyse the result of the clustering
pd.DataFrame(df_corpus_test['topic'].value_counts()).rename(columns={"topic": "nb_of_article"})

Unnamed: 0,nb_of_article
Census Data and Statistical Information,2970
Mathematical Concepts and Space,1156
Entertainment Industry,1076
Geographic Locations and Nature,885
New York and Wisconsin Towns,858
Ancient History and Roman Empire,784
Computer Science and Programming,780
Politics and Government,561
Historical Periods and Eras,516
"Games, Universities, and Design",466


# ______________________________________________________________________________________

# 8/ Theme Extraction :

In [36]:
article = "Mohandas Karamchand Gandhi[14] was born on 2 October 1869[1] to a Hindu Modh Baniya family[15] in Porbandar (also known as Sudamapuri), a coastal town on the Kathiawar Peninsula and then part of the small princely state of Porbandar in the Kathiawar Agency of the Indian Empire. His father, Karamchand Uttamchand Gandhi (1822–1885), served as the diwan (chief minister) of Porbandar state.[16] Although he only had an elementary education and had previously been a clerk in the state administration, Karamchand proved a capable chief minister.[17] During his tenure, Karamchand married four times. His first two wives died young, after each had given birth to a daughter, and his third marriage was childless. In 1857, Karamchand sought his third wife's permission to remarry; that year, he married Putlibai (1844–1891), who also came from Junagadh,[18] and was from a Pranami Vaishnava family.[19][20][21][22] Karamchand and Putlibai had three children over the ensuing decade, a son, Laxmidas (c. 1860 – March 1914), a daughter, Raliatbehn (1862–1960) and another son, Karsandas (c. 1866–1913)"

def Theme_extraction(article) :
    """Print the given article, its predicted topic and the topic probability.

    `cluster_similar_documents` is called once and its result reused, so the
    printed topic and probability always come from the same inference run
    (the previous version ran the whole cleaning + inference pipeline twice).
    """
    print(f"For the given article : {article}")
    result = cluster_similar_documents(article)
    print(f"Topic -> {result[0]}")
    print(f"Topic proba -> {round(result[1], 3)}")
    
Theme_extraction(article)

For the given article : Mohandas Karamchand Gandhi[14] was born on 2 October 1869[1] to a Hindu Modh Baniya family[15] in Porbandar (also known as Sudamapuri), a coastal town on the Kathiawar Peninsula and then part of the small princely state of Porbandar in the Kathiawar Agency of the Indian Empire. His father, Karamchand Uttamchand Gandhi (1822–1885), served as the diwan (chief minister) of Porbandar state.[16] Although he only had an elementary education and had previously been a clerk in the state administration, Karamchand proved a capable chief minister.[17] During his tenure, Karamchand married four times. His first two wives died young, after each had given birth to a daughter, and his third marriage was childless. In 1857, Karamchand sought his third wife's permission to remarry; that year, he married Putlibai (1844–1891), who also came from Junagadh,[18] and was from a Pranami Vaishnava family.[19][20][21][22] Karamchand and Putlibai had three children over the ensuing decad

# _____________________________________________________________________________________________________

# 9/ Document Exploration :

In [37]:
def get_text_from_topic(topic, df_corpus, top) :
    """Return the `top` rows of `df_corpus` labelled with `topic`.

    Rows are selected where the "topic" column equals `topic`, ordered by
    decreasing "topic_proba", and truncated to the first `top` rows.
    """
    in_topic = df_corpus["topic"] == topic
    ranked = df_corpus.loc[in_topic].sort_values(by=["topic_proba"], ascending=False)
    return ranked.head(top)

In [39]:
get_text_from_topic('Entertainment Industry', df_corpus_test, top=10)

Unnamed: 0,article_nb,corpus,topic,topic_proba
21391,article2_22796,Who Framed Roger Rabbit is a American fantasy...,Entertainment Industry,0.995
23409,article2_28939,Dame Judith Olivia Dench Dench has garnered L...,Entertainment Industry,0.994
20859,article2_21177,Bringing Up Baby is a American screwball come...,Entertainment Industry,0.994
3131,article_7106,California Friends With an ensemble cast star...,Entertainment Industry,0.993
11208,article_25182,Lanford Wilson was an American playwright His...,Entertainment Industry,0.993
27807,article2_41237,Stephen Glenn Martin is an American actor com...,Entertainment Industry,0.993
47809,article2_77876,Pop punk emo Mark Hoppus Tom DeLonge Adam s S...,Entertainment Industry,0.993
24939,article2_34059,Ralph Bakshi is an American animator filmmake...,Entertainment Industry,0.992
11209,article_25184,Diana Wynne Jones was a British novelist poet...,Entertainment Industry,0.992
32669,article2_49896,Julie Deborah Kavner is an American actress B...,Entertainment Industry,0.992
