In [1]:
import pandas as pd
import numpy as np
import matplotlib
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models


In [2]:
nltk.download("stopwords")
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
df = pd.read_csv("My Library sorted 2000 use.csv")

In [6]:
df

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,9RFNUFEF,journalArticle,2023,"Savić Tot, Tijana; Adžić, Slobodan; Tot, Vilmo...",The impact of time devoted to video games on s...,Education and Information Technologies,,1573-7608,10.1007/s10639-022-11418-5,https://doi.org/10.1007/s10639-022-11418-5,...,,,,,,,,,,
1,WFXADHWQ,bookSection,2023,"Ahmad, Ala’eddin M.; Abuhashesh, Mohammad; Nus...",Environmental Forces Influencing Perceived Acc...,The Effect of Information Technology on Busine...,978-3-031-12382-5,,,https://doi.org/10.1007/978-3-031-12382-5_129,...,,,,,,,,,,
2,Y3QMD4BL,journalArticle,2023,"St-Onge, Cédric; Kara, Nadjia; Edstrom, Claes",Multivariate outlier filtering for A-NFVLearn:...,The Journal of Supercomputing,,1573-0484,10.1007/s11227-023-05283-3,https://doi.org/10.1007/s11227-023-05283-3,...,,,,,,,,,,
3,MTHPMMNG,bookSection,2023,"Wong, W. Eric; Gao, Ruizhi; Li, Yihao; Abreu, ...",Software Fault Localization: an Overview of Re...,Handbook of Software Fault Localization,978-1-119-88092-9,,,https://onlinelibrary.wiley.com/doi/10.1002/97...,...,,,,,,,,,,
4,49XI7BNG,bookSection,2023,"Megdadi, Younes; Hammouri, Mohammad; Megdadi, ...",The Impact of Facebook Advertisements on Custo...,The Effect of Information Technology on Busine...,978-3-031-12382-5,,,https://doi.org/10.1007/978-3-031-12382-5_10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6050,K94RQ3BW,conferencePaper,2000,"El-Korany, Abeer; Rafea, Ahmed; Baraka, Hoda; ...",A Structured Testing Methodology for Knowledge...,Database and Expert Systems Applications,978-3-540-44469-5,,10.1007/3-540-44469-6_40,,...,,,,,,,,,,
6051,3AFEH9Q9,conferencePaper,2000,"Labiche, Y.; Thévenod-Fosse, P.; Waeselynck, H...",Testing levels for object-oriented software,Proceedings of the 22nd international conferen...,978-1-58113-206-9,,10.1145/337180.337197,https://dl.acm.org/doi/10.1145/337180.337197,...,,,,,,,,,,
6052,UC7DEIRP,conferencePaper,2000,"Mařík, Vladimír; Král, Luboš; Mařík, Radek",Software Testing & Diagnostics: Theory & Practice,SOFSEM 2000: Theory and Practice of Informatics,978-3-540-44411-4,,10.1007/3-540-44411-4_7,,...,,,,,,,,,,
6053,238I887S,conferencePaper,2000,"Chen, T. Y.; Lau, M. F.",On the Minimal Essential Subsets and Minimal R...,Reliable Software Technologies Ada-Europe 2000,978-3-540-45098-6,,10.1007/10722060_26,,...,,,,,,,,,,


In [7]:
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)  # Tokenize sentence into words
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stopwords]  # Lemmatize tokens and remove stopwords
    return lemmas


In [8]:
df['lemmas'] = df['Publication Title'].apply(tokenize_and_lemmatize)

In [9]:
df['lemmas']

0                    [education, information, technology]
1       [effect, information, technology, business, ma...
2                               [journal, supercomputing]
3               [handbook, software, fault, localization]
4       [effect, information, technology, business, ma...
                              ...                        
6050              [database, expert, system, application]
6051    [proceeding, 22nd, international, conference, ...
6052     [sofsem, 2000, :, theory, practice, informatics]
6053    [reliable, software, technology, ada-europe, 2...
6054    [proceeding, ., 34th, international, conferenc...
Name: lemmas, Length: 6055, dtype: object

In [10]:
df['lemmas_str'] = df['lemmas'].apply(lambda lemmas: ' '.join(lemmas))

In [11]:
# converting lemma to string---
df['lemmas_str']

0                        education information technology
1       effect information technology business marketi...
2                                  journal supercomputing
3                    handbook software fault localization
4       effect information technology business marketi...
                              ...                        
6050                   database expert system application
6051    proceeding 22nd international conference softw...
6052            sofsem 2000 : theory practice informatics
6053         reliable software technology ada-europe 2000
6054    proceeding . 34th international conference tec...
Name: lemmas_str, Length: 6055, dtype: object

In [17]:
dictionary = corpora.Dictionary(df['lemmas'])

In [20]:
corpus = [dictionary.doc2bow(doc) for doc in df['lemmas']]

In [21]:
# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)


In [23]:
# Print the topics
topics = lda_model.show_topics()

In [24]:
topics

[(0,
  '0.100*"software" + 0.058*"system" + 0.050*"technology" + 0.048*"information" + 0.035*":" + 0.035*"product" + 0.029*"experience" + 0.027*"process" + 0.022*"practice" + 0.020*"architecture"'),
 (1,
  '0.118*"software" + 0.110*"engineering" + 0.093*"international" + 0.081*"proceeding" + 0.067*"conference" + 0.030*"acm" + 0.028*"ieee" + 0.028*"symposium" + 0.024*"workshop" + 0.018*"("'),
 (2,
  '0.135*"," + 0.051*"2000" + 0.039*":" + 0.037*"testing" + 0.034*"system" + 0.026*"." + 0.025*"communicating" + 0.020*"conference" + 0.018*"method" + 0.017*"formal"'),
 (3,
  '0.041*"conference" + 0.041*"." + 0.032*")" + 0.032*"(" + 0.029*"system" + 0.026*"computing" + 0.023*"language" + 0.021*"2000" + 0.021*"international" + 0.019*"intelligent"'),
 (4,
  '0.125*"software" + 0.069*":" + 0.069*"," + 0.066*"testing" + 0.050*"journal" + 0.044*"reliability" + 0.038*"verification" + 0.037*"research" + 0.036*"quality" + 0.033*"practice"')]

In [25]:
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(13, 1), (14, 1), (15, 1)],
 [(13, 1), (14, 1), (15, 1)],
 [(13, 1), (14, 1), (15, 1)],
 [(16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)],
 [(13, 1), (22, 1), (23, 1), (24, 1)],
 [(25, 1), (26, 1), (27, 1)],
 [(5, 1), (28, 1)],
 [(29, 1), (30, 1)],
 [(7, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1)],
 [(39, 1), (40, 1)],
 [(1, 1), (8, 1), (41, 1), (42, 1), (43, 1), (44, 1)],
 [(29, 1), (45, 1)],
 [(0, 1), (1, 1), (2, 1)],
 [(13, 1), (22, 1), (43, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)],
 [(13, 1), (22, 1), (24, 1), (51, 1), (52, 1)],
 [(8, 1), (53, 1), (54, 1)],
 [(8, 1), (55, 1), (56, 1)],
 [(13, 1), (31, 1), (57, 1), (58, 1), (59, 1)],
 [(13, 1), (22, 1), (60, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],