In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from fuzzywuzzy import process
import pycountry
import pickle



# Additional Data Prep: (format country's name)

In [2]:
file_path = input("Enter training data: (E.g. data.csv)\n").strip('"')
file_path = file_path.replace("\\", "/")

df = pd.read_csv(file_path)

## Separate rows where multiple countries/instituion are listed

In [3]:
# แยก row ของเปเปอร์ที่มี country/insitution หลายอัน
def separate_countries(row):
    countries = row['Country'].split(', ')  
    return pd.DataFrame({**row.to_dict(), 'Country': countries}).dropna()

df = pd.concat([separate_countries(row) for _, row in df.iterrows()], ignore_index=True)

def separate_insitution(row):
    countries = row['Institution'].split(', ')  
    return pd.DataFrame({**row.to_dict(), 'Institution': countries}).dropna()

df = pd.concat([separate_countries(row) for _, row in df.iterrows()], ignore_index=True)
df = pd.concat([separate_insitution(row) for _, row in df.iterrows()], ignore_index=True)

df

Unnamed: 0,Title,Authors,Institution,City,Country,Keywords,Date
0,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",['Ruhr University Bochum',"['Dresden', 'Bochum', 'Duisburg']",['Germany'],"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
1,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",'Technische Universität Dresden',"['Dresden', 'Bochum', 'Duisburg']",['Germany'],"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
2,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",'University of Duisburg-Essen'],"['Dresden', 'Bochum', 'Duisburg']",['Germany'],"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
3,A Compact SPDT Switch Amplifier with High ON/O...,"['Tatsuo Kubo', 'Yo Yamaguchi', 'Yuma Okuyama'...",['Sakura-shi'],['Chiba'],['Japan'],"['Variable Gain', 'Matching Circuit', 'Interme...",2023-01-09
4,Single-Snapshot Localization for Near-Field RI...,"['Omar Rinchi', 'Ahmed Elzanaty', 'Ahmad Alsha...",['University of Surrey',"['MO', 'Guildford']",['United Kingdom',"['Localization Error', 'Wireless communication...",2023-01-11
...,...,...,...,...,...,...,...
2473,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",['University College London',"['Nantong', 'London']",['UK',"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2474,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",'Nantong University'],"['Nantong', 'London']",['UK',"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2475,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",['University College London',"['Nantong', 'London']",'China'],"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2476,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",'Nantong University'],"['Nantong', 'London']",'China'],"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04


## Clean Country Names

In [4]:
valid_country_names = [country.name.lower() for country in pycountry.countries]

def clean_country(country_name):
    # Clean the input by stripping extra spaces and converting to lowercase
    country_name = country_name.strip().lower()
    
    #use fuzzy matching to find the closest country name from the valid list
    if country_name:
        match = process.extractOne(country_name, valid_country_names)
        if match and match[1] > 80:  # Only accept matches with a score higher than 80 (adjust threshold as needed)
            return match[0].title()  # Return the standardized country name with proper capitalization
    return "Unknown"  

df['Country'] = df['Country'].apply(clean_country)
df[df['Country'] != "Unknown"]

df

Unnamed: 0,Title,Authors,Institution,City,Country,Keywords,Date
0,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",['Ruhr University Bochum',"['Dresden', 'Bochum', 'Duisburg']",Germany,"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
1,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",'Technische Universität Dresden',"['Dresden', 'Bochum', 'Duisburg']",Germany,"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
2,Fast Sampling of Synthetic Apertures in Short ...,"['Jonas Schorlemer', 'Aman Batra', 'Thomas Kai...",'University of Duisburg-Essen'],"['Dresden', 'Bochum', 'Duisburg']",Germany,"['Random Sampling', 'Radar Sensor', 'Range Com...",2023-01-09
3,A Compact SPDT Switch Amplifier with High ON/O...,"['Tatsuo Kubo', 'Yo Yamaguchi', 'Yuma Okuyama'...",['Sakura-shi'],['Chiba'],Japan,"['Variable Gain', 'Matching Circuit', 'Interme...",2023-01-09
4,Single-Snapshot Localization for Near-Field RI...,"['Omar Rinchi', 'Ahmed Elzanaty', 'Ahmad Alsha...",['University of Surrey',"['MO', 'Guildford']",United Kingdom,"['Localization Error', 'Wireless communication...",2023-01-11
...,...,...,...,...,...,...,...
2473,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",['University College London',"['Nantong', 'London']",Ukraine,"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2474,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",'Nantong University'],"['Nantong', 'London']",Ukraine,"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2475,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",['University College London',"['Nantong', 'London']",China,"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04
2476,A Precoding Method to Reduce Self-interference...,"['Jingwen Wang', 'Yongwei Zhang', 'Xianyang Lv...",'Nantong University'],"['Nantong', 'London']",China,"['Phased Array', 'Time Difference', 'Signal to...",2024-10-04


# LDA

In [5]:
documents = df['Keywords'].dropna().tolist()
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(documents)

In [6]:
##Grid Search to find the most coherence amount of topics
def compute_coherence(lda_model, vectorizer):
    feature_names = np.array(vectorizer.get_feature_names_out())
    top_words = [topic.argsort()[-10:][::-1] for topic in lda_model.components_]
    coherence = sum(
        cosine_similarity([lda_model.components_[i]], [lda_model.components_[j]])[0][0]
        for i in range(len(top_words)) for j in range(i + 1, len(top_words))
    )
    return coherence

topics_range = range(2, 11)
coherence_scores = []

for n_topics in topics_range:
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)
    coherence_scores.append(compute_coherence(lda, vectorizer))
    print(f"n_topics={n_topics}, Coherence={coherence_scores[-1]:.3f}")

n_topics=2, Coherence=0.368
n_topics=3, Coherence=0.975
n_topics=4, Coherence=2.017
n_topics=5, Coherence=3.248
n_topics=6, Coherence=4.759
n_topics=7, Coherence=6.037
n_topics=8, Coherence=7.863
n_topics=9, Coherence=10.116
n_topics=10, Coherence=11.939


In [7]:
# Optimal n_topics
optimal_topics = topics_range[np.argmax(coherence_scores)]
lda = LatentDirichletAllocation(n_components=optimal_topics, random_state=42)
lda.fit(doc_term_matrix)

## Saving the LDA model

In [8]:
file_path = "C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/lda_model.pkl"

with open(file_path, 'wb') as file:
    pickle.dump(lda, file)
    
print(f"LDA model saved to {file_path}")

LDA model saved to C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/lda_model.pkl


In [9]:
topic_names = [
    " ".join([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:][::-1]])
    for topic in lda.components_
]
topic_names

['control model power optimization systems',
 'ice radar sea feature surface',
 'data edge computing internet things',
 'magnetic model imaging element finite',
 'learning network neural deep data',
 'power voltage magnetic control electric',
 'learning machine models prediction team',
 'communication signal multiple wireless frequency',
 'energy power wind renewable time',
 'information frequency features boundary image']

In [10]:
# Document-topic matrix
doc_topic_matrix = lda.transform(doc_term_matrix)
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_names)
final_df = pd.concat([df[['Institution', 'Country']], topic_df], axis=1)

## Result of LDA:

In [11]:
final_df

Unnamed: 0,Institution,Country,control model power optimization systems,ice radar sea feature surface,data edge computing internet things,magnetic model imaging element finite,learning network neural deep data,power voltage magnetic control electric,learning machine models prediction team,communication signal multiple wireless frequency,energy power wind renewable time,information frequency features boundary image
0,['Ruhr University Bochum',Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515
1,'Technische Universität Dresden',Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515
2,'University of Duisburg-Essen'],Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515
3,['Sakura-shi'],Japan,0.002001,0.002000,0.002000,0.002001,0.002000,0.280515,0.159803,0.399290,0.002000,0.148390
4,['University of Surrey',United Kingdom,0.249092,0.001031,0.001031,0.001031,0.001031,0.001031,0.001031,0.386106,0.001031,0.357584
...,...,...,...,...,...,...,...,...,...,...,...,...
2473,['University College London',Ukraine,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389
2474,'Nantong University'],Ukraine,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389
2475,['University College London',China,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389
2476,'Nantong University'],China,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389


# Clustering Insitution Using K Mean

In [12]:
n_clusters = optimal_topics
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
final_df['Cluster'] = kmeans.fit_predict(doc_topic_matrix)

## Saving the K Mean model

In [13]:
file_path = "C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/kmeans_model.pkl"

with open(file_path, 'wb') as file:
    pickle.dump(kmeans, file)
    
print(f"KMeans model saved to {file_path}")

KMeans model saved to C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/kmeans_model.pkl


In [14]:

#assign cluster name
cluster_names = {}

for cluster in range(n_clusters):
    cluster_docs = final_df[final_df['Cluster'] == cluster].iloc[:, 2:2 + len(topic_names)]  # Adjust column selection

    avg_topic_distribution = cluster_docs.mean(axis=0)
    top_topic_indices = avg_topic_distribution.argsort()[-3:][::-1]  # Top 3 topics for cluster
    top_topic_names = [topic_names[i] for i in top_topic_indices if i < len(topic_names)]
    cluster_names[cluster] = ", ".join(top_topic_names)


final_df['Cluster Name'] = final_df['Cluster'].apply(lambda x: f"Cluster {x}: {cluster_names.get(x, 'Unknown')}")

## Result of Clustering:

In [15]:
final_df

Unnamed: 0,Institution,Country,control model power optimization systems,ice radar sea feature surface,data edge computing internet things,magnetic model imaging element finite,learning network neural deep data,power voltage magnetic control electric,learning machine models prediction team,communication signal multiple wireless frequency,energy power wind renewable time,information frequency features boundary image,Cluster,Cluster Name
0,['Ruhr University Bochum',Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515,9,"Cluster 9: ice radar sea feature surface, lear..."
1,'Technische Universität Dresden',Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515,9,"Cluster 9: ice radar sea feature surface, lear..."
2,'University of Duisburg-Essen'],Germany,0.001516,0.654524,0.001516,0.001515,0.001515,0.333352,0.001515,0.001516,0.001515,0.001515,9,"Cluster 9: ice radar sea feature surface, lear..."
3,['Sakura-shi'],Japan,0.002001,0.002000,0.002000,0.002001,0.002000,0.280515,0.159803,0.399290,0.002000,0.148390,3,Cluster 3: communication signal multiple wirel...
4,['University of Surrey',United Kingdom,0.249092,0.001031,0.001031,0.001031,0.001031,0.001031,0.001031,0.386106,0.001031,0.357584,3,Cluster 3: communication signal multiple wirel...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,['University College London',Ukraine,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389,3,Cluster 3: communication signal multiple wirel...
2474,'Nantong University'],Ukraine,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389,3,Cluster 3: communication signal multiple wirel...
2475,['University College London',China,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389,3,Cluster 3: communication signal multiple wirel...
2476,'Nantong University'],China,0.001389,0.001389,0.001389,0.142174,0.001389,0.001389,0.001389,0.846713,0.001389,0.001389,3,Cluster 3: communication signal multiple wirel...


# Specialization of Each Country

In [16]:
# Group by Institution, Cluster Name, and Country, then count occurrences
country_cluster_counts = final_df.groupby(['Institution', 'Cluster Name', 'Country']).size().reset_index(name='count')

# Get the most frequent cluster for each Institution
most_frequent_clusters = country_cluster_counts.loc[country_cluster_counts.groupby('Institution')['count'].idxmax()].reset_index(drop=True)

## Result of Specialization:

In [17]:
most_frequent_clusters

Unnamed: 0,Institution,Cluster Name,Country,count
0,"""Centre Suisse d'Electronique et de Microtechn...",Cluster 0: magnetic model imaging element fini...,Netherlands,1
1,"""School of Electrical Engineering Xi'an Jiaoto...",Cluster 7: control model power optimization sy...,China,1
2,"""Univ. Catholique de l'Ouest""",Cluster 1: data edge computing internet things...,Belgium,1
3,"""Università degli studi ‘G. d'Annunzio’ Chieti...",Cluster 5: power voltage magnetic control elec...,Italy,1
4,"""Xi'an JiaoTong University""]",Cluster 1: data edge computing internet things...,China,1
...,...,...,...,...
1569,['branch of the Tyumen Industrial University',Cluster 5: power voltage magnetic control elec...,Russian Federation,1
1570,['c. Laboratory'],Cluster 3: communication signal multiple wirel...,France,1
1571,['i3S',"Cluster 6: learning network neural deep data, ...",France,1
1572,['jinan'],Cluster 1: data edge computing internet things...,China,1
