In [None]:
from google.colab import files


# Ask for a file through google.colab's upload helper; the call's result is kept in `uploaded`.
uploaded = files.upload()

# **Data Pre-Processing**

In [23]:
import pandas as pd
# Load the raw export and make 'Citations' numeric. Values that cannot be
# parsed become NaN, and rows without a usable citation count are dropped.
file = (
    pd.read_csv("Google Scholar Resources.csv")
    .assign(Citations=lambda frame: pd.to_numeric(frame['Citations'], errors='coerce'))
    .dropna(subset=['Citations'])
)

# 70th-percentile citation count: rows strictly above it are the top ~30%.
threshold = file['Citations'].quantile(0.70)

# Keep only the highly cited rows and renumber them from 0.
df = file.loc[file['Citations'] > threshold].reset_index(drop=True)

In [24]:
df.columns

Index(['Title', 'Abstract', 'Authors', 'Year', 'URL', 'Citations', 'Journal'], dtype='object')

In [25]:
df.head()

Unnamed: 0,Title,Abstract,Authors,Year,URL,Citations,Journal
0,High-performance medicine: the convergence of ...,,E. Topol,2019,https://www.semanticscholar.org/paper/f134abea...,4040,Nature Medicine
1,Explainable Artificial Intelligence (XAI): Con...,,"Alejandro Barredo Arrieta, Natalia D√≠az Rodr√≠g...",2019,https://www.semanticscholar.org/paper/530a059c...,5841,Inf. Fusion
2,Explanation in Artificial Intelligence: Insigh...,,Tim Miller,2017,https://www.semanticscholar.org/paper/e89dfa30...,4081,Artif. Intell.
3,Sparks of Artificial General Intelligence: Ear...,Artificial intelligence (AI) researchers have ...,"S√©bastien Bubeck, Varun Chandrasekaran, Ronen ...",2023,https://www.semanticscholar.org/paper/8dbd5746...,2793,ArXiv
4,Peeking Inside the Black-Box: A Survey on Expl...,At the dawn of the fourth industrial revolutio...,"Amina Adadi, M. Berrada",2018,https://www.semanticscholar.org/paper/21dff47a...,3662,IEEE Access


In [26]:
df.shape

(30630, 7)

In [27]:
# Choose the column that supplies the document text:
# 'Abstract' when the frame has one, otherwise fall back to 'Title'.
text_column = "Abstract" if "Abstract" in df.columns else "Title"
text_column

'Abstract'

In [28]:
# Rows without text fall back to their title so every document has something
# to embed. Use `text_column` rather than a hard-coded "Abstract" so this
# cannot raise a KeyError when the frame has no 'Abstract' column (in which
# case `text_column` is already "Title" and nothing needs filling).
if text_column != "Title":
    df[text_column] = df[text_column].fillna(df["Title"])

# **Text Embedding Using BERT**

In [29]:
from sentence_transformers import SentenceTransformer

# Load Sentence-BERT model.
# `model` is a notebook-wide global: later cells call `model.encode(...)` on
# the document texts, on the search query and on the candidate titles.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [30]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the saved NumPy arrays
# Both files are written by the save cell at the end of this notebook, so that
# cell (and the embedding + autoencoder cells feeding it) must have been run at
# least once before this shortcut can work.
import numpy as np

embeddings = np.load("/content/drive/MyDrive/embeddings.npy")
reduced_embeddings = np.load("/content/drive/MyDrive/reduced_embeddings.npy")

# Check the shapes; both arrays should have one row per row of `df`.
# NOTE(review): nothing here verifies that the cached rows line up with the
# current `df` - confirm the CSV and filter have not changed since the save.
print("Embeddings shape:", embeddings.shape)
print("Reduced embeddings shape:", reduced_embeddings.shape)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Embeddings shape: (30630, 384)
Reduced embeddings shape: (30630, 64)


In [None]:
import re

# Preprocess text
def clean_text(text):
    """Normalise a piece of text before it is embedded or vectorised.

    Strings are lower-cased, stripped of digits and punctuation, and have
    every run of whitespace collapsed to a single space. Anything that is
    not a ``str`` (e.g. ``None`` / ``NaN``) is returned unchanged.

    Args:
        text: The raw value taken from the text column.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscores count as word chars and stay)

    # Deleting digits/punctuation leaves gaps such as "gpt  beats"; strip()
    # alone only trims the ends, so collapse internal whitespace runs as well.
    return re.sub(r'\s+', ' ', text).strip()

# Apply preprocessing
# This overwrites the text column in place with its cleaned form, so every
# later consumer of df[text_column] sees the cleaned text, not the raw text.
df[text_column] = df[text_column].fillna("").apply(clean_text)

# Generate embeddings efficiently
# One vector per row of `df`, in row order, encoded in batches of 128.
print("Generating text embeddings...")
embeddings = model.encode(df[text_column].tolist(), batch_size=128, show_progress_bar=True)

In [None]:
embeddings.shape

# **Dimensionality Reduction Using Autoencoder**
We use an autoencoder neural network to perform unsupervised dimensionality reduction and anomaly detection on high-dimensional embeddings. The autoencoder learns to compress each embedding into a lower-dimensional latent representation (via the bottleneck layer) and then reconstruct it back to its original form. By minimizing the reconstruction error during training, the model captures the most important features of the data. Once trained, each document's reconstruction error is computed, and those with the highest errors (the top 5%) are identified as poorly reconstructed samples, which may represent outliers, unusual structure, or noise.

The goal is to detect and optionally filter out samples that deviate significantly from the learned structure of the dataset. This improves the quality of downstream tasks in this project, including clustering, semantic search, and topic modeling, by reducing the influence of noisy or inconsistent data. The benefits include cleaner datasets, better-defined clusters, and the ability to identify edge cases or rare patterns.

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across runs; without it the encoder and reduced embeddings
# saved at the end of the notebook differ on every execution.
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# Define the autoencoder architecture
input_dim = embeddings.shape[1]  # Input size equals the dimension of embeddings

input_layer = Input(shape=(input_dim,))  # Input layer
encoded = Dense(128, activation='relu')(input_layer)  # First encoding layer
encoded = Dense(64, activation='relu')(encoded)       # Bottleneck layer (compressed representation)
decoded = Dense(128, activation='relu')(encoded)      # First decoding layer
decoded = Dense(input_dim, activation='linear')(decoded)  # Output layer (reconstruct original input)

autoencoder = Model(inputs=input_layer, outputs=decoded)  # Full autoencoder model
encoder = Model(inputs=input_layer, outputs=encoded)      # Encoder half, shares weights with `autoencoder`

autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')  # Compile with MSE loss

# Train the autoencoder to reconstruct its own input (input == target)
autoencoder.fit(embeddings, embeddings, epochs=50, batch_size=32, shuffle=True)

# Get reduced (encoded) and reconstructed embeddings
reduced_embeddings = encoder.predict(embeddings)              # Compressed 64-d version
reconstructed_embeddings = autoencoder.predict(embeddings)    # Output from autoencoder

# Compute reconstruction error (MSE) for each sample
reconstruction_errors = np.mean((embeddings - reconstructed_embeddings) ** 2, axis=1)

# Identify samples with reconstruction error above the 95th percentile.
# The cut-off has its own name so it does not overwrite `threshold`, which
# the pre-processing cell uses for the citation-count cut-off.
reconstruction_threshold = np.percentile(reconstruction_errors, 95)  # Top 5% error threshold
poorly_reconstructed_indices = np.where(reconstruction_errors > reconstruction_threshold)[0]  # Indices of poor samples

print(f"\nNumber of poorly reconstructed samples: {len(poorly_reconstructed_indices)}")

# Plot histogram of reconstruction errors
plt.hist(reconstruction_errors, bins=50)
plt.title("Distribution of Reconstruction Errors")
plt.xlabel("MSE Loss")
plt.ylabel("Frequency")
plt.show()


In [31]:
reduced_embeddings.shape

(30630, 64)

# **Find The Best Number Of Clusters For KMeans Based On Silhouette Score**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate cluster counts to evaluate: 2 through 30 inclusive
components_range = list(range(2, 31))

# Fit KMeans on the full reduced embeddings once per candidate count and
# record the silhouette score of the resulting partition
sil_scores = []
for n in components_range:
    kmeans = KMeans(n_clusters=n, random_state=42, n_init='auto')
    labels = kmeans.fit_predict(reduced_embeddings)
    sil = silhouette_score(reduced_embeddings, labels)
    sil_scores.append(sil)

# The candidate with the highest silhouette score wins
best_n = components_range[np.argmax(sil_scores)]
print(f"\n‚úÖ Best number of clusters based on silhouette score: {best_n}")

# Visualization: score per candidate, with the winner marked
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(components_range, sil_scores, marker='o', linestyle='-', label='Silhouette Score')
ax.axvline(x=best_n, color='r', linestyle='--', label=f'Best n = {best_n}')
ax.set_title('Silhouette Scores for KMeans with Different Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_xticks(components_range)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


# **Keywords Extraction**

This method combines KMeans clustering with TF-IDF-based keyword extraction to uncover the main themes within a corpus of text data. First, documents are embedded and reduced (using BERT + autoencoder), and then clustered into semantically coherent groups using KMeans. After clustering, a TfidfVectorizer is applied to the document text to identify the most representative terms for each cluster. By averaging TF-IDF scores within each group, the top keywords are extracted, offering interpretable insights into the semantic focus of each cluster. The goal is to automatically group and summarize documents by topic without manual labeling.

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Cluster the reduced embeddings with KMeans (`best_n` comes from the
# silhouette search). The estimator is bound to its own name so the imported
# `KMeans` class is not shadowed by an instance, and `n_init='auto'` matches
# the settings used by the other KMeans fits in this notebook.
keyword_kmeans = KMeans(n_clusters=best_n, n_init='auto', random_state=42)
KMeans_labels = keyword_kmeans.fit_predict(reduced_embeddings)

# Assign cluster labels to the DataFrame
df["Cluster"] = KMeans_labels

# Initialize TF-IDF vectorizer (ignoring common English stopwords and limiting features to 2000)
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

# Transform the text column into a TF-IDF matrix (one row per row of df)
X_tfidf = vectorizer.fit_transform(df[text_column])

# Get the list of feature names (the words in the TF-IDF vocab)
terms = vectorizer.get_feature_names_out()

top_n = 10  # Number of top keywords to extract per cluster
cluster_keywords = {}

# Loop over each cluster to find top keywords
cluster_of_row = df["Cluster"].to_numpy()
for cluster_num in sorted(df["Cluster"].unique()):
    # Row *positions* of the documents in this cluster. X_tfidf is indexed by
    # position, so index labels are avoided: they only coincide with
    # positions while df has a fresh 0..n-1 index.
    cluster_indices = np.flatnonzero(cluster_of_row == cluster_num)

    # Compute the mean TF-IDF score for each word across all docs in the cluster
    cluster_tfidf = X_tfidf[cluster_indices].mean(axis=0)

    # Convert the 1 x vocab matrix to a flat NumPy array
    cluster_array = np.squeeze(np.asarray(cluster_tfidf))

    # Get indices of the top n highest scoring words
    top_term_indices = cluster_array.argsort()[::-1][:top_n]

    # Store the words corresponding to those indices
    cluster_keywords[cluster_num] = [terms[i] for i in top_term_indices]

# Print the top keywords for each cluster
for cluster, keywords in cluster_keywords.items():
    print(f"\nüîπCluster {cluster} ‚Äî Top Keywords:")
    print(", ".join(keywords))



üîπCluster 0 ‚Äî Top Keywords:
iot, wireless, learning, devices, edge, federated, data, communication, fl, energy

üîπCluster 1 ‚Äî Top Keywords:
quantum, materials, machine, learning, clusters, chemical, density, molecular, energy, model

üîπCluster 2 ‚Äî Top Keywords:
speech, audio, recognition, neural, acoustic, model, networks, speaker, recurrent, models

üîπCluster 3 ‚Äî Top Keywords:
image, learning, object, network, convolutional, transformer, attention, vision, supervised, deep

üîπCluster 4 ‚Äî Top Keywords:
learning, data, classification, vector, kernel, support, feature, algorithm, analysis, sparse

üîπCluster 5 ‚Äî Top Keywords:
detection, anomaly, intrusion, learning, network, based, data, anomalies, deep, traffic

üîπCluster 6 ‚Äî Top Keywords:
learning, deep, machine, networks, neural, data, training, supervised, algorithms, models

üîπCluster 7 ‚Äî Top Keywords:
privacy, learning, data, machine, federated, attacks, models, ai, model, training

üîπCluster 8 ‚Äî

In [44]:
# For every cluster, report how many vocabulary terms carry a non-zero
# mean TF-IDF weight across the cluster's documents
for cluster_num in sorted(df["Cluster"].unique()):
    in_cluster = (df["Cluster"] == cluster_num).to_numpy()
    member_rows = df.index[in_cluster].to_list()

    mean_weights = np.asarray(X_tfidf[member_rows].mean(axis=0)).squeeze()
    nonzero_count = np.count_nonzero(mean_weights)

    print(f"üîπ Cluster {cluster_num} has {nonzero_count} non-zero TF-IDF keywords.")


üîπ Cluster 0 has 1759 non-zero TF-IDF keywords.
üîπ Cluster 1 has 1809 non-zero TF-IDF keywords.
üîπ Cluster 2 has 1796 non-zero TF-IDF keywords.
üîπ Cluster 3 has 1949 non-zero TF-IDF keywords.
üîπ Cluster 4 has 1861 non-zero TF-IDF keywords.
üîπ Cluster 5 has 1876 non-zero TF-IDF keywords.
üîπ Cluster 6 has 1945 non-zero TF-IDF keywords.
üîπ Cluster 7 has 1842 non-zero TF-IDF keywords.
üîπ Cluster 8 has 1822 non-zero TF-IDF keywords.
üîπ Cluster 9 has 1907 non-zero TF-IDF keywords.
üîπ Cluster 10 has 1847 non-zero TF-IDF keywords.
üîπ Cluster 11 has 1662 non-zero TF-IDF keywords.
üîπ Cluster 12 has 1819 non-zero TF-IDF keywords.
üîπ Cluster 13 has 1552 non-zero TF-IDF keywords.
üîπ Cluster 14 has 1834 non-zero TF-IDF keywords.
üîπ Cluster 15 has 1783 non-zero TF-IDF keywords.
üîπ Cluster 16 has 1786 non-zero TF-IDF keywords.
üîπ Cluster 17 has 1862 non-zero TF-IDF keywords.
üîπ Cluster 18 has 1868 non-zero TF-IDF keywords.
üîπ Cluster 19 has 1828 non-zero TF-IDF 

# **Implementation**

The following method combines semantic embeddings, dimensionality reduction, and two-level clustering to retrieve relevant documents based on a user's query. First, each document is encoded using a Sentence-BERT model to capture deep semantic meaning. These high-dimensional embeddings are then compressed using an autoencoder, which reduces noise and preserves core structure. KMeans is applied to the reduced embeddings to group documents into high-level semantic clusters. When a user submits a query, it undergoes the same encoding and reduction process, is assigned to a cluster, and is compared to the cluster's documents using cosine similarity on the full embeddings. A second layer of KMeans clustering is then applied to the titles of the remaining documents to refine the results even further, ensuring that the final recommendations are both thematically and topically relevant.



In [41]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from tensorflow.keras.models import load_model


# Reload the encoder half of the autoencoder from Drive; the file is written
# by the save cell at the end of this notebook.
encoder = load_model('/content/drive/MyDrive/encoder_model.h5')
# One embedding vector per row, stored as a list-valued column.
# NOTE(review): relies on `embeddings` having the same row order as `df` - confirm.
df["Embeddings"] = list(embeddings)  # Assign precomputed embeddings to the DataFrame

# First-level clustering using reduced embeddings from the autoencoder
# (`best_n` is the cluster count chosen by the silhouette search cell).
kmeans_model = KMeans(n_clusters=best_n, n_init='auto', random_state=42)
df["Cluster"] = kmeans_model.fit_predict(reduced_embeddings)  # Assign cluster labels to DataFrame

def search_similar_resources(query, df, top_n=10, min_similarity=0.50, title_cluster_k=3):
    """Return the resources most similar to a free-text query.

    The query is embedded with the notebook-level Sentence-BERT ``model``,
    reduced with ``encoder`` and assigned to a first-level cluster by
    ``kmeans_model``. Documents of that cluster are ranked by cosine
    similarity on the full embeddings, filtered by ``min_similarity``, then
    sub-clustered on their title embeddings; only titles that share the
    query's sub-cluster are returned.

    Args:
        query: Search keywords. Empty / whitespace-only / ``None`` yields an
            empty result.
        df: DataFrame with "Title", "URL", "Cluster" and "Embeddings" columns.
        top_n: Maximum number of rows to return.
        min_similarity: Minimum cosine similarity a document must reach.
        title_cluster_k: Requested number of title sub-clusters; clamped to
            the range 1..number of candidate documents.

    Returns:
        DataFrame with columns "Title", "URL", "Similarity", sorted by
        descending similarity. It is empty (but still has these columns)
        when nothing qualifies.
    """
    result_columns = ["Title", "URL", "Similarity"]

    def _no_results(message):
        # Report why the search is empty and return a frame that still has
        # the expected columns, so callers can select them without a KeyError.
        print(message)
        return pd.DataFrame(columns=result_columns)

    if not query or not query.strip():  # Check for empty query
        return _no_results("Query is empty. Please enter valid keywords.")

    # Encode the query once; the same vector is reused for the similarity
    # ranking and for the title sub-cluster assignment below.
    # NOTE(review): the query is encoded as typed, while the document
    # embeddings may have been built from cleaned text - confirm this
    # asymmetry is intended.
    query_embedding = model.encode([query])
    query_embedding_reduced = encoder.predict(query_embedding, verbose=0)  # verbose=0: no progress bar per search

    # Predict which cluster the query belongs to using the KMeans
    query_cluster = kmeans_model.predict(query_embedding_reduced)[0]
    cluster_df = df[df["Cluster"] == query_cluster].copy()  # Filter samples in the same cluster

    if cluster_df.empty:
        return _no_results("No relevant resources found in the identified cluster.")

    # Compute cosine similarity between query and cluster embeddings
    cluster_embeddings = np.vstack(cluster_df["Embeddings"].values)
    cluster_df["Similarity"] = cosine_similarity(query_embedding, cluster_embeddings).flatten()
    cluster_df = cluster_df[cluster_df["Similarity"] >= min_similarity]

    if cluster_df.empty:
        return _no_results("No resources met the minimum similarity threshold.")

    # Title-level clustering: group similar titles within the filtered cluster
    title_embeddings = model.encode(cluster_df["Title"].tolist())
    # KMeans needs 1 <= n_clusters <= n_samples; clamp the request to that range
    n_title_clusters = max(1, min(title_cluster_k, len(title_embeddings)))

    title_kmeans = KMeans(n_clusters=n_title_clusters, n_init='auto', random_state=42)
    cluster_df["TitleCluster"] = title_kmeans.fit_predict(title_embeddings)

    # Predict which title sub-cluster the query belongs to
    query_title_cluster = title_kmeans.predict(query_embedding)[0]
    subcluster_df = cluster_df[cluster_df["TitleCluster"] == query_title_cluster].copy()

    if subcluster_df.empty:
        return _no_results("No relevant titles found in the identified subcluster.")

    subcluster_df = subcluster_df.drop_duplicates(subset=["Title", "Similarity"])
    top_results = subcluster_df.sort_values(by="Similarity", ascending=False).head(top_n)
    return top_results[result_columns]

# Example usage
# Reads the keywords interactively; the result is kept in the global
# `recommendations`, which the display cell further down renders.
query = input("Enter keywords to search: ")
recommendations = search_similar_resources(query, df, top_n=10, min_similarity=0.50, title_cluster_k=3)




Enter keywords to search: pca




[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 98ms/step


# **Print Out The Recommended Results**

In [42]:
from IPython.core.display import display, Markdown

def display_results_as_markdown(results):
    """
    Display search results as Markdown in Jupyter Notebook.

    Args:
        results: DataFrame with "Title", "URL" and "Similarity" columns, as
            returned by ``search_similar_resources``. May be empty.

    Renders one bullet per row: the title as a link to its URL, followed by
    the similarity score rounded to two decimals. Returns nothing.
    """
    lines = ["### üîç Search Results:\n"]

    # Iterate the frame that was passed in, not the notebook-level
    # `recommendations` global, so the function works for any result set.
    for _, row in results.iterrows():
        lines.append(f"- **Title:** [{row['Title']}]({row['URL']})\n")
        lines.append(f"  - ‚≠ê **Similarity Score:** {row['Similarity']:.2f}\n\n")

    if len(lines) == 1:
        # Nothing matched: say so instead of showing a bare heading.
        lines.append("_No matching resources found._\n")

    display(Markdown("".join(lines)))

# Call function
display_results_as_markdown(recommendations)


### üîç Search Results:
- **Title:** [A Tutorial on Principal Component Analysis](https://www.semanticscholar.org/paper/562e7f497eff8363825abad8d0008a42ce00eb49)
  - ‚≠ê **Similarity Score:** 0.67

- **Title:** [Principal Component Analysis](https://www.semanticscholar.org/paper/ae7b96d287893d246313ccd0566cc4a17f863d44)
  - ‚≠ê **Similarity Score:** 0.67

- **Title:** [An overview of principal component analysis](https://www.semanticscholar.org/paper/5939cb4e961dcab19cfb1d529e82dc872e95a694)
  - ‚≠ê **Similarity Score:** 0.66

- **Title:** [On Consistency and Sparsity for Principal Components Analysis in High Dimensions](https://www.semanticscholar.org/paper/6b0345fe5dbf7a8551edd7ae3f56f803fc21378a)
  - ‚≠ê **Similarity Score:** 0.57

- **Title:** [Applying Principal Components Analysis to Event-Related Potentials: A Tutorial](https://www.semanticscholar.org/paper/db7754609cadd6b4453b8a1a4cbe3ee99709dde0)
  - ‚≠ê **Similarity Score:** 0.56

- **Title:** [A Randomized Algorithm for Principal Component Analysis](https://www.semanticscholar.org/paper/08a757bb53efafbb0eec6cd0a9ab3f128b6d01d3)
  - ‚≠ê **Similarity Score:** 0.55

- **Title:** [Finite sample approximation results for principal component analysis: a matrix perturbation approach](https://www.semanticscholar.org/paper/7c0f5b508303c35da77fa775fa08fdfa5f6b37db)
  - ‚≠ê **Similarity Score:** 0.53

- **Title:** [iPCA: An Interactive System for PCA‚Äêbased Visual Analytics](https://www.semanticscholar.org/paper/1b95578b8b50d5de940f2e85a6126f63c6c04ff0)
  - ‚≠ê **Similarity Score:** 0.53

- **Title:** [Independent Vector Analysis: An Extension of ICA to Multivariate Components](https://www.semanticscholar.org/paper/ad26804e1c70d75fd22552f18ba5f84bf592591d)
  - ‚≠ê **Similarity Score:** 0.52

- **Title:** [Sparse Principal Components Analysis](https://www.semanticscholar.org/paper/39ac586c519ef8569b4f401a257c29b2039d5d25)
  - ‚≠ê **Similarity Score:** 0.52



In [36]:
from google.colab import drive
drive.mount('/content/drive')

# Persist the expensive artefacts to Drive so later sessions can reload them
# instead of re-encoding the corpus and re-training the autoencoder.
# These are the same three paths that the loading cells earlier in the
# notebook read from.
np.save("/content/drive/MyDrive/embeddings.npy", embeddings)
encoder.save('/content/drive/MyDrive/encoder_model.h5')
np.save("/content/drive/MyDrive/reduced_embeddings.npy", reduced_embeddings)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
