In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from fuzzywuzzy import process
import pycountry
import pickle

# Additional Data Prep: Format Country Names

In [None]:
# Location of the combined CSV that this notebook analyses.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
csv_path = "C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/New CSV/combined_output.csv"
df = pd.read_csv(csv_path)

## Separate rows where multiple countries/institutions are listed

In [None]:
# Split the rows of papers that list several countries/institutions.
def separate_countries(row):
    """Expand one row into one row per country listed in its 'Country' field.

    Parameters
    ----------
    row : pandas.Series
        A single DataFrame row. Its 'Country' entry is expected to be a string
        of country names separated by ', '.

    Returns
    -------
    pandas.DataFrame
        One row per listed country, with every other field of ``row`` repeated.
        Rows holding a missing value in any column are dropped, so the result
        is empty when 'Country' (or any other field) is missing.
    """
    country_field = row['Country']
    # A missing 'Country' arrives as NaN/None, not a string, and has no .split.
    # Such a row would be removed by dropna() anyway, so return an empty frame
    # that keeps the row's columns.
    if not isinstance(country_field, str):
        return pd.DataFrame([row.to_dict()]).iloc[:0]

    countries = country_field.split(', ')
    # The scalar fields are broadcast against the list of countries.
    return pd.DataFrame({**row.to_dict(), 'Country': countries}).dropna()

# Expand every row into one row per listed country, then stack the pieces
# back into a single frame with a fresh 0..n-1 index.
country_frames = [separate_countries(paper) for _, paper in df.iterrows()]
df = pd.concat(country_frames, ignore_index=True)

def separate_insitution(row):
    """Expand one row into one row per institution listed in 'Institution'.

    Parameters
    ----------
    row : pandas.Series
        A single DataFrame row. Its 'Institution' entry is expected to be a
        string of institution names separated by ', '.

    Returns
    -------
    pandas.DataFrame
        One row per listed institution, with every other field of ``row``
        repeated. Rows holding a missing value in any column are dropped, so
        the result is empty when 'Institution' (or any other field) is missing.
    """
    institution_field = row['Institution']
    # A missing 'Institution' arrives as NaN/None, not a string, and has no
    # .split. Such a row would be removed by dropna() anyway, so return an
    # empty frame that keeps the row's columns.
    if not isinstance(institution_field, str):
        return pd.DataFrame([row.to_dict()]).iloc[:0]

    institutions = institution_field.split(', ')
    # The scalar fields are broadcast against the list of institutions.
    return pd.DataFrame({**row.to_dict(), 'Institution': institutions}).dropna()

# 'Country' was already expanded to one value per row by the earlier
# separate_countries pass, so repeating that pass would rebuild every row
# without changing it. Only the institution expansion is needed here.
# NOTE(review): because countries and institutions are expanded independently,
# each paper ends up with every (country, institution) combination — confirm
# that this pairing is intended.
df = pd.concat([separate_insitution(row) for _, row in df.iterrows()], ignore_index=True)

df

## Clean Country Names

In [None]:
# Lower-cased names of every entry in pycountry's country list; these are the
# candidates the fuzzy matcher compares against.
valid_country_names = [entry.name.lower() for entry in pycountry.countries]

def clean_country(country_name, threshold=80):
    """Map a free-text country string to the closest known country name.

    Parameters
    ----------
    country_name : str
        Raw country text. Surrounding whitespace and letter case are ignored.
        Non-string values (e.g. NaN or None) are treated as unmatched.
    threshold : int, optional
        A fuzzy match is accepted only when its score is strictly greater than
        this value. Defaults to 80.

    Returns
    -------
    str
        The best-matching entry of ``valid_country_names`` in title case, or
        "Unknown" when the input is empty, not a string, or no candidate
        scores above ``threshold``.
    """
    # Missing values and other non-strings have no .strip(); treat as unmatched.
    if not isinstance(country_name, str):
        return "Unknown"

    # Normalise the same way the candidate list was built (lower case).
    country_name = country_name.strip().lower()
    if not country_name:
        return "Unknown"

    # extractOne yields the best candidate and its score as the first two items.
    match = process.extractOne(country_name, valid_country_names)
    if match and match[1] > threshold:
        # NOTE(review): str.title() capitalises every word, so a name such as
        # "bosnia and herzegovina" becomes "Bosnia And Herzegovina".
        return match[0].title()
    return "Unknown"

df['Country'] = df['Country'].apply(clean_country)

# Keep only rows whose country was recognised. The filtered frame has to be
# assigned back: a bare expression in the middle of a cell is evaluated and
# then discarded. The index is reset so that later steps which line rows up
# by position against this frame stay aligned.
df = df[df['Country'] != "Unknown"].reset_index(drop=True)

df

# LDA

In [None]:
# One document per row that has keywords; rows without keywords are left out.
has_keywords = df['Keywords'].notna()
documents = df.loc[has_keywords, 'Keywords'].tolist()

# Bag-of-words counts with English stop words removed.
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(documents)

In [None]:
## Grid search for the number of topics with the highest coherence score
def compute_coherence(lda_model, vectorizer):
    """Score a fitted LDA model by the total pairwise similarity of its topics.

    Parameters
    ----------
    lda_model : fitted estimator
        Must expose ``components_`` with one row per topic.
    vectorizer : object
        Unused; kept so existing callers do not need to change.

    Returns
    -------
    float
        Sum of the cosine similarities between the ``components_`` rows of
        every unordered pair of topics; 0.0 when there are fewer than two
        topics.

    NOTE(review): this is a sum over all topic pairs, so it tends to grow as
    the number of topics grows, and a high value means the topics overlap.
    Confirm that maximising it is the intended model-selection rule.
    """
    topic_word = lda_model.components_
    n_topics = topic_word.shape[0]
    if n_topics < 2:
        return 0.0

    # One call gives the full topic-by-topic similarity matrix; the strict
    # upper triangle holds each unordered pair (i < j) exactly once.
    similarity = cosine_similarity(topic_word)
    pair_rows, pair_cols = np.triu_indices(n_topics, k=1)
    return float(similarity[pair_rows, pair_cols].sum())

# Fit one LDA model per candidate topic count and record its score.
topics_range = range(2, 11)
coherence_scores = []

for n_topics in topics_range:
    lda = LatentDirichletAllocation(random_state=42, n_components=n_topics).fit(doc_term_matrix)
    score = compute_coherence(lda, vectorizer)
    coherence_scores.append(score)
    print(f"n_topics={n_topics}, Coherence={coherence_scores[-1]:.3f}")

In [None]:
# Choose the candidate topic count with the highest recorded score, then
# refit an LDA model with that many topics.
best_position = int(np.argmax(coherence_scores))
optimal_topics = topics_range[best_position]
lda = LatentDirichletAllocation(random_state=42, n_components=optimal_topics)
lda.fit(doc_term_matrix)

## Saving the LDA model

In [None]:
# Destination of the pickled LDA model.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = "C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/lda_model.pkl"

# Serialise the fitted model; the file is closed when the with-block exits.
with open(file_path, mode='wb') as model_file:
    pickle.dump(lda, model_file)

print(f"LDA model saved to {file_path}")

In [None]:
# Fetch the vocabulary once; it does not change between topics, so there is
# no need to rebuild it for every word of every topic.
feature_names = vectorizer.get_feature_names_out()

# Label each topic with its five highest-weighted words, heaviest first
# (argsort is ascending, so take the last five and reverse them).
topic_names = [
    " ".join(feature_names[i] for i in topic.argsort()[-5:][::-1])
    for topic in lda.components_
]
topic_names

In [None]:
# Document-topic matrix: one row per document, one column per topic.
doc_topic_matrix = lda.transform(doc_term_matrix)

# The documents were built from the rows of df that have non-null 'Keywords',
# in order. Label the topic rows with those rows' index so each document is
# joined to its own institution/country, rather than relying on df having a
# gap-free 0..n-1 index with no missing keywords.
keyword_rows = df['Keywords'].dropna().index
topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_names, index=keyword_rows)
final_df = pd.concat([df.loc[keyword_rows, ['Institution', 'Country']], topic_df], axis=1)

## Result of LDA:

In [None]:
# Show institution and country alongside the per-topic weights from the LDA step.
final_df

# Clustering Institutions Using K-Means

In [None]:
# Cluster the documents on their topic weights, using as many clusters as
# there are topics, and store each document's cluster label.
n_clusters = optimal_topics
kmeans = KMeans(random_state=42, n_clusters=n_clusters).fit(doc_topic_matrix)
final_df['Cluster'] = kmeans.labels_

## Saving the K-Means model

In [None]:
# Destination of the pickled KMeans model.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = "C:/Users/ASUS/Desktop/Thames' Work/Data Science Project 2024/Saved Models/kmeans_model.pkl"

# Serialise the fitted model; the file is closed when the with-block exits.
with open(file_path, mode='wb') as model_file:
    pickle.dump(kmeans, model_file)

print(f"KMeans model saved to {file_path}")

In [None]:

# Name each cluster after the topics that weigh most, on average, in its rows.
# The topic-weight columns sit right after 'Institution' and 'Country'.
topic_columns = slice(2, 2 + len(topic_names))
cluster_names = {}

for cluster in range(n_clusters):
    in_cluster = final_df['Cluster'] == cluster
    mean_weights = final_df[in_cluster].iloc[:, topic_columns].mean(axis=0)
    # argsort is ascending: the last three positions are the heaviest topics,
    # reversed so the heaviest comes first. Every position is below
    # len(topic_names) because at most that many columns were selected.
    strongest = mean_weights.argsort()[-3:][::-1]
    cluster_names[cluster] = ", ".join([topic_names[i] for i in strongest])


def _label_cluster(x):
    """Build the display label for cluster id ``x``."""
    return f"Cluster {x}: {cluster_names.get(x, 'Unknown')}"


final_df['Cluster Name'] = final_df['Cluster'].apply(_label_cluster)

## Result of Clustering:

In [None]:
# Show the frame again, now including the 'Cluster' and 'Cluster Name' columns.
final_df

# Specialization of Each Country

In [None]:
# Count how many rows fall under each (institution, cluster name, country).
group_keys = ['Institution', 'Cluster Name', 'Country']
country_cluster_counts = final_df.groupby(group_keys).size().reset_index(name='count')

# For every institution, keep the single combination with the highest count.
top_row_labels = country_cluster_counts.groupby('Institution')['count'].idxmax()
most_frequent_clusters = country_cluster_counts.loc[top_row_labels].reset_index(drop=True)

## Result of Specialization:

In [None]:
# Show, per institution, the cluster name / country combination with the highest count.
most_frequent_clusters