<a href="https://colab.research.google.com/github/TommasoG85/Collective-intelligence/blob/main/Collective_intelligence_ArXiv_1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install bertopic

import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import umap
import plotly.express as px
import random
from hdbscan import HDBSCAN
from datetime import datetime
from collections import defaultdict
from google.colab import files

# Seed NumPy's and the standard library's global random generators with the
# same fixed value so repeated runs start from the same pseudo-random state.
np.random.seed(101)
random.seed(101)

# Fetch the NLTK stopword corpus (a no-op when it is already present locally).
nltk.download("stopwords")

# English stopword list; reused below for abstract cleaning and for the
# CountVectorizer that builds the topic representations.
stopwords_nltk = stopwords.words("english")

# Step 1: Fetch Data from arXiv API
def fetch_arxiv_abstracts(query="collective intelligence, social brain, group intelligence, crowd intelligence, social intelligence, collective rationality, complex adaptive systems, superorganism",max_results=50000):
    """Query the arXiv API and return the matching papers as a DataFrame.

    Args:
        query: One search phrase, or several phrases separated by commas.
            Each phrase is searched in all fields as an exact phrase and the
            phrases are combined with ``OR``. A query without commas is sent
            exactly as a single quoted phrase.
        max_results: Maximum number of entries requested from the API.
            NOTE(review): the arXiv API documents limits on how many results
            a single request may return; very large values may need paging.
            Confirm against the API manual.

    Returns:
        A ``pandas.DataFrame`` with one row per paper and the columns
        ``title``, ``authors`` (comma-separated names), ``abstract``,
        ``published_date`` (the timestamp string exactly as returned by the
        API) and ``arxiv_url``. The columns are present even when no entry
        was returned.

    Raises:
        ValueError: If ``query`` contains no search term.
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer in time.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    columns = ["title", "authors", "abstract", "published_date", "arxiv_url"]
    api_url = 'http://export.arxiv.org/api/query'

    # Wrapping a comma-separated list in one pair of quotes would search for
    # the whole list as a single phrase, so quote each term on its own.
    terms = [term.strip() for term in query.split(",") if term.strip()]
    if not terms:
        raise ValueError("query must contain at least one search term")
    search_query = " OR ".join(f'all:"{term}"' for term in terms)

    params = {
        'search_query': search_query,
        'start': 0,
        'max_results': max_results,
        'sortBy': 'relevance',
        "sortOrder": "descending",  # Most relevant first
    }
    # (connect, read) timeout: never hang forever, but allow slow large replies.
    response = requests.get(api_url, params=params, timeout=(10, 300))
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the raw bytes so the XML parser applies the encoding declared in
    # the document rather than a guessed text decoding.
    root = ET.fromstring(response.content)

    records = []
    for entry in root.findall(f"{atom}entry"):
        # findtext() returns "" for missing/empty elements, so a sparse entry
        # cannot raise AttributeError or inject None into the table.
        author_names = [
            author.findtext(f"{atom}name", default="")
            for author in entry.findall(f"{atom}author")
        ]
        records.append({
            "title": entry.findtext(f"{atom}title", default=""),
            "authors": ", ".join(author_names),
            "abstract": entry.findtext(f"{atom}summary", default=""),
            "published_date": entry.findtext(f"{atom}published", default=""),
            "arxiv_url": entry.findtext(f"{atom}id", default=""),
        })

    return pd.DataFrame(records, columns=columns)

# Fetch the papers matching the single phrase "collective intelligence"
df = fetch_arxiv_abstracts(query="collective intelligence",max_results=10000)

# Stop here with a clear message when nothing came back; otherwise the
# pipeline below would fail later with a much less helpful error.
if df.empty:
    raise RuntimeError("arXiv returned no results for the query; nothing to analyse.")
print("Data fetched from arXiv.")

# Save abstracts to a CSV file (optional)
df.to_csv("arxiv_collective_intelligence.csv", index=False)

# Step 2: Clean the Abstracts by Removing Stopwords
def remove_stopwords(text, stopwords_list):
    """Return ``text`` with every stopword removed.

    The text is split on whitespace, each word is compared case-insensitively
    (via ``word.lower()``) against ``stopwords_list``, and the surviving words
    are joined with single spaces. Punctuation attached to a word is kept, so
    ``"the,"`` is not treated as the stopword ``"the"``.

    Args:
        text: The string to clean. ``None`` or an empty string yields ``""``.
        stopwords_list: Iterable of lower-case stopwords.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    # Hash-based membership keeps the per-word test O(1) instead of scanning
    # the whole list for every word.
    if not isinstance(stopwords_list, (set, frozenset)):
        stopwords_list = set(stopwords_list)

    return " ".join(
        word for word in text.split() if word.lower() not in stopwords_list
    )

# Strip the English stopwords from every abstract and keep the result in its
# own column, leaving the raw abstract untouched.
df['cleaned_abstract'] = df['abstract'].apply(remove_stopwords, args=(stopwords_nltk,))

# Step 3: Use BERT for Embedding
# Pre-trained SentenceTransformer model that turns each text into a vector
model = SentenceTransformer('all-MiniLM-L6-v2')

# Plain Python lists of the documents and of their publication timestamps
abstracts = df['cleaned_abstract'].tolist()
timestamps = df['published_date'].tolist()

# Encode the cleaned abstracts into embeddings
embeddings = model.encode(abstracts, show_progress_bar=True)
print("BERT embeddings generated.")

# Step 4: Topic Modeling with BERTopic
# CountVectorizer used for the topic word representations; it drops the
# English stopwords.
vectorizer_model = CountVectorizer(stop_words=stopwords_nltk)

# Clustering model handed to BERTopic.
# NOTE: no random_state is passed here. An earlier attempt (random_state=222)
# was removed; per the original note it had to be removed because it ended up
# being passed on to the KDTree.
hdbscan_model = HDBSCAN(
    min_cluster_size=4,  # Smallest group of documents that may form a cluster
    min_samples=5,  # Higher values make clustering more conservative (more points left as noise)
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Initialize BERTopic with the custom clustering model and vectorizer.
# NOTE(review): min_topic_size presumably has no effect once a custom
# hdbscan_model is supplied - confirm against the BERTopic documentation.
topic_model = BERTopic(
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=5,
    verbose=True,
    nr_topics=500,
    top_n_words=5
)

# Fit the model to the cleaned abstracts, reusing the precomputed embeddings
topics, probabilities = topic_model.fit_transform(abstracts, embeddings)

# Obtain topics over time (one row per topic and timestamp)
topics_over_time = topic_model.topics_over_time(abstracts, timestamps)

# Ensure 'Timestamp' is in datetime format (if it's not already)
topics_over_time['Timestamp'] = pd.to_datetime(topics_over_time['Timestamp'])

# Calendar month of every row. Any timezone is dropped explicitly first so
# that to_period() does not have to discard it implicitly.
topics_over_time['Month'] = (
    topics_over_time['Timestamp'].dt.tz_localize(None).dt.to_period('M')
)

# Number of documents per topic and month. Each row already carries the number
# of documents for its (topic, timestamp) pair in 'Frequency', so the monthly
# total is the sum of those values, not the number of rows.
topics_over_time['Frequency'] = (
    topics_over_time.groupby(['Month', 'Topic'])['Frequency'].transform('sum')
)

# Now print to check the frequencies
print(topics_over_time[['Timestamp', 'Topic', 'Frequency']].head())



# Display the assigned topics to check for issues
print("Assigned topics:", topics)
print("Unique topics:", set(topics))

# Step 5: Visualize Topics
# Build the BERTopic figures; they are written to a single HTML file below.
topic_plot = topic_model.visualize_topics()
hierarchy_plot = topic_model.visualize_hierarchy()
heatmap_plot = topic_model.visualize_heatmap()
barchart_plot = topic_model.visualize_barchart()

# Reduce the document embeddings to 2D with UMAP for the scatter plot
umap_model = umap.UMAP(n_neighbors=30, min_dist=0.1, metric='cosine', n_components=2)
umap_embeddings = umap_model.fit_transform(embeddings)

# One row per document: its 2D coordinates and its assigned topic
topics_df = pd.DataFrame({
    'UMAP_1': umap_embeddings[:, 0],
    'UMAP_2': umap_embeddings[:, 1],
    'Topic': topics
})

# Human-readable name for every topic id that was actually assigned.
# A dict keyed by topic id is used on purpose: indexing a list with the
# outlier id -1 would silently return the name of the last topic.
topic_names = {}
for topic_id in sorted(set(topics)):
    if topic_id == -1:
        topic_names[topic_id] = "Outliers"
        continue
    top_words = topic_model.get_topic(topic_id)
    topic_names[topic_id] = " ".join(word for word, _ in top_words)

# Map topic names to the topics
topics_df['Topic Name'] = topics_df['Topic'].map(topic_names)

# Plotly 2D scatter plot of the documents, coloured by topic
fig = px.scatter(topics_df, x='UMAP_1', y='UMAP_2', color='Topic Name',
                 hover_data=['Topic', 'Topic Name'], title="2D Topic Scatter Plot (Colorized by Topics)")


# Add Topic Assignments and Topic Names to DataFrame
df['Topic'] = topics
df['Topic Name'] = df['Topic'].map(topic_names)

# Select relevant columns for the HTML table
table_data = df[['title', 'abstract', 'Topic', 'Topic Name']]

# Temporal chart of the topics
temporal_plot = topic_model.visualize_topics_over_time(topics_over_time)

# Combine all visualizations and the table into ONE valid HTML document.
# Each figure is emitted as a fragment (full_html=False) instead of a complete
# <html> page, and plotly.js is embedded only once, with the first figure.
figures = [topic_plot, hierarchy_plot, heatmap_plot, barchart_plot, fig, temporal_plot]
with open("combined_visualizations.html", "w", encoding="utf-8") as f:
    f.write('<!DOCTYPE html>\n<html>\n<head><meta charset="utf-8"></head>\n<body>\n')

    for position, figure in enumerate(figures):
        f.write(figure.to_html(full_html=False, include_plotlyjs=(position == 0)))
        f.write("<hr>")  # Separator line between sections

    # Write DataFrame as a table
    f.write(table_data.to_html(index=False))
    f.write("\n</body>\n</html>\n")

# Download the combined HTML file
files.download("combined_visualizations.html")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data fetched from arXiv.


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

2024-11-10 19:06:24,897 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


BERT embeddings generated.


2024-11-10 19:06:25,233 - BERTopic - Dimensionality - Completed ✓
2024-11-10 19:06:25,236 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-10 19:06:25,260 - BERTopic - Cluster - Completed ✓
2024-11-10 19:06:25,263 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-10 19:06:25,345 - BERTopic - Representation - Completed ✓
2024-11-10 19:06:25,349 - BERTopic - Topic reduction - Reducing number of topics
2024-11-10 19:06:25,355 - BERTopic - Topic reduction - Reduced number of topics from 14 to 14
295it [00:03, 81.14it/s]


                  Timestamp  Topic  Frequency
0 1993-06-11 03:26:04+00:00      7          1
1 1999-05-10 20:52:23+00:00      5          2
2 1999-05-10 22:20:40+00:00      5          2
3 1999-08-17 21:32:41+00:00      5          2
4 1999-08-17 22:49:19+00:00      5          2
Assigned topics: [9, 8, -1, 12, 8, 4, 8, -1, 8, 12, 9, 12, 4, 4, 8, 4, -1, 5, -1, -1, 9, 8, 4, 12, 8, -1, 9, 8, 7, 8, 9, 12, -1, 5, -1, 8, 8, 12, 4, -1, 1, -1, -1, 2, -1, 2, -1, 8, 8, -1, 8, 8, -1, 1, -1, 12, 4, 10, 8, -1, -1, 8, 7, 11, 7, -1, 12, 0, -1, -1, 4, 1, 0, -1, -1, 4, -1, 4, 12, 12, 4, 4, -1, 4, 7, -1, 11, 8, 4, 9, 4, 4, 0, 10, -1, 12, -1, 12, 12, 11, 4, 4, -1, 4, 4, 9, 10, 4, 4, 4, 8, 4, 8, -1, 6, -1, 4, 4, -1, 0, 5, -1, 5, 4, -1, 5, 4, 10, 9, 4, 9, 11, 4, 4, 9, -1, 4, 11, -1, -1, -1, 6, 6, 4, -1, -1, 6, 6, 10, 4, 11, 9, 9, -1, -1, 4, 6, 7, 4, -1, -1, 4, -1, 4, -1, 7, -1, 1, -1, -1, 12, 4, 5, 8, -1, -1, 10, 10, 7, -1, 4, 4, 1, 5, -1, -1, -1, -1, 8, -1, -1, 0, 4, -1, 4, 4, -1, -1, -1, -1, -1, 3, 12, 4, -1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>