In [1]:
import pandas as pd
import numpy as np
import matplotlib
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Fetch the NLTK stopword corpus (reported as up-to-date when already installed).
nltk.download("stopwords")
# Read the corpus through the fully qualified `nltk.corpus.stopwords` reader
# rather than the imported name `stopwords`: this cell rebinds `stopwords` to a
# plain set, so a second run of `stopwords.words(...)` would raise
# AttributeError. With the qualified path the cell can be re-run safely.
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Download the remaining NLTK resources in order (presumably the data behind
# word_tokenize and WordNetLemmatizer - confirm against the NLTK version used).
for resource in ('punkt', 'wordnet'):
    downloaded = nltk.download(resource)
# Keep the result of the last download as the cell's displayed value.
downloaded

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load the exported library CSV (path is relative to the notebook's working directory).
csv_path = "My Library sorted 2000 use.csv"
df = pd.read_csv(csv_path)

In [6]:
# Display the loaded DataFrame.
df

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,9RFNUFEF,journalArticle,2023,"Savić Tot, Tijana; Adžić, Slobodan; Tot, Vilmo...",The impact of time devoted to video games on s...,Education and Information Technologies,,1573-7608,10.1007/s10639-022-11418-5,https://doi.org/10.1007/s10639-022-11418-5,...,,,,,,,,,,
1,WFXADHWQ,bookSection,2023,"Ahmad, Ala’eddin M.; Abuhashesh, Mohammad; Nus...",Environmental Forces Influencing Perceived Acc...,The Effect of Information Technology on Busine...,978-3-031-12382-5,,,https://doi.org/10.1007/978-3-031-12382-5_129,...,,,,,,,,,,
2,Y3QMD4BL,journalArticle,2023,"St-Onge, Cédric; Kara, Nadjia; Edstrom, Claes",Multivariate outlier filtering for A-NFVLearn:...,The Journal of Supercomputing,,1573-0484,10.1007/s11227-023-05283-3,https://doi.org/10.1007/s11227-023-05283-3,...,,,,,,,,,,
3,MTHPMMNG,bookSection,2023,"Wong, W. Eric; Gao, Ruizhi; Li, Yihao; Abreu, ...",Software Fault Localization: an Overview of Re...,Handbook of Software Fault Localization,978-1-119-88092-9,,,https://onlinelibrary.wiley.com/doi/10.1002/97...,...,,,,,,,,,,
4,49XI7BNG,bookSection,2023,"Megdadi, Younes; Hammouri, Mohammad; Megdadi, ...",The Impact of Facebook Advertisements on Custo...,The Effect of Information Technology on Busine...,978-3-031-12382-5,,,https://doi.org/10.1007/978-3-031-12382-5_10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6050,K94RQ3BW,conferencePaper,2000,"El-Korany, Abeer; Rafea, Ahmed; Baraka, Hoda; ...",A Structured Testing Methodology for Knowledge...,Database and Expert Systems Applications,978-3-540-44469-5,,10.1007/3-540-44469-6_40,,...,,,,,,,,,,
6051,3AFEH9Q9,conferencePaper,2000,"Labiche, Y.; Thévenod-Fosse, P.; Waeselynck, H...",Testing levels for object-oriented software,Proceedings of the 22nd international conferen...,978-1-58113-206-9,,10.1145/337180.337197,https://dl.acm.org/doi/10.1145/337180.337197,...,,,,,,,,,,
6052,UC7DEIRP,conferencePaper,2000,"Mařík, Vladimír; Král, Luboš; Mařík, Radek",Software Testing & Diagnostics: Theory & Practice,SOFSEM 2000: Theory and Practice of Informatics,978-3-540-44411-4,,10.1007/3-540-44411-4_7,,...,,,,,,,,,,
6053,238I887S,conferencePaper,2000,"Chen, T. Y.; Lau, M. F.",On the Minimal Essential Subsets and Minimal R...,Reliable Software Technologies Ada-Europe 2000,978-3-540-45098-6,,10.1007/10722060_26,,...,,,,,,,,,,


In [7]:
def tokenize_and_lemmatize(text):
    """Tokenize ``text`` and return its lowercased, lemmatized non-stopword tokens.

    Parameters
    ----------
    text : str
        Raw text to process. Any non-string value (for example ``None`` or
        the float ``NaN`` of a missing cell) yields an empty list instead of
        being passed to the tokenizer.

    Returns
    -------
    list of str
        One lemma per kept token, in the original token order. A token is
        dropped when its lowercase form is in the module-level ``stopwords``
        collection; no other filtering is applied, so punctuation tokens
        are kept.
    """
    # Guard against missing values so a column-wide .apply() does not fail.
    if not isinstance(text, str):
        return []
    lemmatizer = WordNetLemmatizer()
    # Lowercase each token once and reuse it for both the stopword test and
    # the lemmatization.
    lowered = (token.lower() for token in word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token in lowered if token not in stopwords]


In [8]:
# Run the tokenizer/lemmatizer over every publication title and store the
# resulting token lists in a new 'lemmas' column.
publication_titles = df['Publication Title']
df['lemmas'] = publication_titles.apply(tokenize_and_lemmatize)

In [9]:
# Display the 'lemmas' column.
df['lemmas']

0                    [education, information, technology]
1       [effect, information, technology, business, ma...
2                               [journal, supercomputing]
3               [handbook, software, fault, localization]
4       [effect, information, technology, business, ma...
                              ...                        
6050              [database, expert, system, application]
6051    [proceeding, 22nd, international, conference, ...
6052     [sofsem, 2000, :, theory, practice, informatics]
6053    [reliable, software, technology, ada-europe, 2...
6054    [proceeding, ., 34th, international, conferenc...
Name: lemmas, Length: 6055, dtype: object

In [10]:
# Collapse each list of lemmas into one space-separated string.
df['lemmas_str'] = df['lemmas'].apply(' '.join)

In [11]:
# Display the 'lemmas_str' column (the lemmas joined into one string per row).
df['lemmas_str']

0                        education information technology
1       effect information technology business marketi...
2                                  journal supercomputing
3                    handbook software fault localization
4       effect information technology business marketi...
                              ...                        
6050                   database expert system application
6051    proceeding 22nd international conference softw...
6052            sofsem 2000 : theory practice informatics
6053         reliable software technology ada-europe 2000
6054    proceeding . 34th international conference tec...
Name: lemmas_str, Length: 6055, dtype: object

In [12]:
# Build the document-term matrix: one row per lemma string, one column per
# vocabulary term, with raw term counts as values.
corpus = df['lemmas_str']
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(corpus)
# Show the matrix summary as the cell output.
dtm

<6055x2171 sparse matrix of type '<class 'numpy.int64'>'
	with 34689 stored elements in Compressed Sparse Row format>

In [13]:
# Apply LDA
# Pin the seed: the LDA fit is randomized, so without `random_state` the topic
# numbering and per-document assignments can change on every run, making the
# outputs below irreproducible.
lda = LatentDirichletAllocation(n_components=3, random_state=42)  # Specify the desired number of topics
lda.fit(dtm)

In [14]:
# Per-document topic distribution from the fitted model (one row per document).
topic_distributions = lda.transform(dtm)
# The dominant topic of a document is the index of its largest entry.
dominant_topics = np.argmax(topic_distributions, axis=1)
# Store each document's row as a separate array in the DataFrame.
df['topic_distribution'] = [row for row in topic_distributions]


In [15]:
# Attach each document's dominant topic index to the DataFrame.
df['dominant_topic'] = dominant_topics

# Print the publication titles next to their lemmas and dominant topics.
summary_columns = ['Publication Title', 'lemmas', 'dominant_topic']
print(df[summary_columns])

                                      Publication Title  \
0                Education and Information Technologies   
1     The Effect of Information Technology on Busine...   
2                         The Journal of Supercomputing   
3               Handbook of Software Fault Localization   
4     The Effect of Information Technology on Busine...   
...                                                 ...   
6050           Database and Expert Systems Applications   
6051  Proceedings of the 22nd international conferen...   
6052    SOFSEM 2000: Theory and Practice of Informatics   
6053     Reliable Software Technologies Ada-Europe 2000   
6054  Proceedings. 34th International Conference on ...   

                                                 lemmas  dominant_topic  
0                  [education, information, technology]               1  
1     [effect, information, technology, business, ma...               2  
2                             [journal, supercomputing]              

In [16]:
# Print the publication titles next to their lemmas and full topic distributions.
distribution_columns = ['Publication Title', 'lemmas', 'topic_distribution']
print(df[distribution_columns])

                                      Publication Title  \
0                Education and Information Technologies   
1     The Effect of Information Technology on Busine...   
2                         The Journal of Supercomputing   
3               Handbook of Software Fault Localization   
4     The Effect of Information Technology on Busine...   
...                                                 ...   
6050           Database and Expert Systems Applications   
6051  Proceedings of the 22nd international conferen...   
6052    SOFSEM 2000: Theory and Practice of Informatics   
6053     Reliable Software Technologies Ada-Europe 2000   
6054  Proceedings. 34th International Conference on ...   

                                                 lemmas  \
0                  [education, information, technology]   
1     [effect, information, technology, business, ma...   
2                             [journal, supercomputing]   
3             [handbook, software, fault, localization]