In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize

In [2]:
# Load the dataset.
# Later cells read the 'Cleaned_Abstract' and 'Researcher' columns of this sheet.
# NOTE(review): '/content/...' looks like a Colab-local path — confirm before
# running elsewhere.
ABSTRACTS_WORKBOOK = '/content/cleaned_combined_Abstract.xlsx'
df = pd.read_excel(ABSTRACTS_WORKBOOK)

In [20]:
# Step 1: raw term counts (TF), one row per entry of df['Researcher']
vectorizer_tf = CountVectorizer()
tf_matrix = vectorizer_tf.fit_transform(df['Cleaned_Abstract'])

# Densify the count matrix into a labelled table: rows are researchers,
# columns are the vocabulary terms learned by the vectorizer.
tf_df = pd.DataFrame(
    data=tf_matrix.toarray(),
    index=df['Researcher'],
    columns=vectorizer_tf.get_feature_names_out(),
)

In [21]:
# Step 2: inverse document frequency, learned from the whole count matrix
transformer_idf = TfidfTransformer(sublinear_tf=False, smooth_idf=True, use_idf=True)
transformer_idf.fit(tf_matrix)  # every row of tf_matrix takes part in the fit

# One IDF weight per vocabulary term, in the vectorizer's feature order.
idf_values = transformer_idf.idf_

# term -> weight lookup, plus a single-row table of the same data for export.
idf_dict = {
    term: weight
    for term, weight in zip(vectorizer_tf.get_feature_names_out(), idf_values)
}
idf_df = pd.DataFrame([idf_dict])

In [22]:
# Step 3: Compute TF-IDF by multiplying TF × IDF
# A single vectorised multiply replaces the per-word Python loop, which issued
# one pandas column assignment per vocabulary term.
# The weights are looked up per column name (a missing term still raises
# KeyError, as before) and carried in a Series that shares tf_df's column
# index, so each column is scaled by the weight of the term it is named after.
idf_weights = pd.Series([idf_dict[word] for word in tf_df.columns],
                        index=tf_df.columns)
tf_idf_df = tf_df.mul(idf_weights, axis='columns')

In [23]:
# Rescale the TF-IDF table with sklearn's L2 normalisation, then re-attach
# the original row and column labels (normalize() returns a bare array).
unit_length_rows = normalize(tf_idf_df, norm='l2')
tf_idf_df_normalized = pd.DataFrame(
    unit_length_rows,
    index=tf_idf_df.index,
    columns=tf_idf_df.columns,
)

In [24]:
# Save outputs: one workbook per intermediate table.
# Each entry is (frame, target file, write the row index?). The IDF table is
# a single row with a positional index, so its index is not written.
excel_exports = (
    (tf_df, 'TF_by_Researcher.xlsx', True),
    (idf_df, 'IDF_Global.xlsx', False),
    (tf_idf_df, 'TF_IDF_by_Researcher.xlsx', True),
    (tf_idf_df_normalized, 'TF_IDF_Normalized.xlsx', True),
)
for frame, workbook_name, write_index in excel_exports:
    frame.to_excel(workbook_name, index=write_index)

In [25]:
# Number of highest-scoring TF-IDF keywords to keep per researcher.
# Read by the keyword-extraction loop below through .head(TOP_N).
TOP_N: int = 10

In [26]:
# Accumulator for the top-N TF-IDF keywords: researcher -> list of terms.
# Starts empty; the loop in the next cell fills it.
top_keywords_per_researcher = dict()

In [27]:
# Rank each researcher's terms by TF-IDF score and keep the TOP_N best.
#
# iterrows() yields every row as a Series. The previous `.loc[researcher]`
# lookup returns a DataFrame when a researcher label occurs more than once in
# the index, which made the subsequent sort_values() call fail. With
# duplicated labels the last matching row wins, because the result is keyed
# by researcher name.
for researcher, tfidf_scores in tf_idf_df.iterrows():
    # A zero score means the term's count for this row is zero, so it is not
    # a keyword of this researcher; drop it rather than let it pad the list
    # when fewer than TOP_N terms have a positive score.
    present_terms = tfidf_scores[tfidf_scores > 0]
    # Stable sort: terms with equal scores keep the column order of
    # tf_idf_df, so the selection is reproducible across runs.
    ranked_terms = present_terms.sort_values(ascending=False, kind='stable')
    top_keywords = ranked_terms.head(TOP_N).index.tolist()
    top_keywords_per_researcher[researcher] = top_keywords

In [28]:
# Flatten the researcher -> keywords mapping into a two-column table,
# joining each keyword list into one comma-separated string for readability.
keyword_rows = []
for author_name, author_terms in top_keywords_per_researcher.items():
    keyword_rows.append({
        'Researcher': author_name,
        'Top_Keywords': ', '.join(author_terms),
    })
keywords_df = pd.DataFrame(keyword_rows)

In [29]:
# Write the keyword table to its own workbook; the positional row index
# carries no information, so it is left out of the file.
keywords_df.to_excel(excel_writer='Researcher_Top_Keywords.xlsx', index=False)

In [30]:
# Quick sanity check: show the first five researchers and their keywords.
print(keywords_df.head(5))

            Researcher                                       Top_Keywords
0          Raphael Shu  dialogue, style, translation, temporal, model,...
1        Cunliang Kong  definitions, definition, chinese, task, comple...
2  Diane M. Napolitano  writing, texts, skills, english, complexity, i...
3        Yi-Ling Chung  counter, hate, online, content, disinformation...
4             Zheng Xu  training, adversarial, networks, gradient, adm...


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Pairwise cosine similarity between the rows (researchers) of the TF-IDF
# table; with no second input, every row is compared against every other row.
cosine_sim_matrix = cosine_similarity(X=tf_idf_df)

In [33]:
# Label both axes of the square similarity matrix with the researcher index
# so any pair can be looked up by name.
researcher_labels = tf_idf_df.index
similarity_df = pd.DataFrame(
    cosine_sim_matrix,
    index=researcher_labels,
    columns=researcher_labels,
)

In [34]:
# Persist the researcher-by-researcher similarity matrix, row labels included.
similarity_df.to_excel(excel_writer='Researcher_Cosine_Similarity.xlsx')

In [35]:
# Quick sanity check: show the first five rows of the similarity matrix.
print(similarity_df.head(5))


Researcher           Raphael Shu  Cunliang Kong  Diane M. Napolitano  \
Researcher                                                             
Raphael Shu             1.000000       0.267735             0.129286   
Cunliang Kong           0.267735       1.000000             0.227437   
Diane M. Napolitano     0.129286       0.227437             1.000000   
Yi-Ling Chung           0.175189       0.225041             0.131473   
Zheng Xu                0.246032       0.136821             0.098251   

Researcher           Yi-Ling Chung  Zheng Xu  Artur Kulmizev  Vikas Bahirwani  \
Researcher                                                                      
Raphael Shu               0.175189  0.246032        0.227490         0.158493   
Cunliang Kong             0.225041  0.136821        0.249330         0.113072   
Diane M. Napolitano       0.131473  0.098251        0.144030         0.114196   
Yi-Ling Chung             1.000000  0.113476        0.176268         0.091450   
Zheng Xu 