In this code we are calculating the sentiments of posts and plotting them. After that we are checking the names and and posts of the local extremes.

In [2]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from pymongo import MongoClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.io as pio

Version of the code with sorted out posts:

In [3]:
# Set Plotly renderer
pio.renderers.default = 'browser'

# MongoDB connection
client = MongoClient("mongodb+srv://matejasmiljanic:AfgQNglyyyJYMTJh@cluster0.xiwcv4m.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client['Martin_DB_1']  # Replace with your database name

# Load posts collection
posts = pd.DataFrame(list(db.posts.find()))
users = pd.DataFrame(list(db.users.find()))

# Define function to filter out unwanted posts
def clean_posts(post):
    if len(post) < 15:  # Remove posts shorter than 15 characters
        return None
    cleaned_post = post.replace('-', '')  # Replace all '-' symbols with an empty string
    if cleaned_post.strip() == '':  # Check if the post is empty after removing '-'
        return None
    return cleaned_post

# Apply the filter function to remove unwanted posts
posts['cleaned_desc'] = posts['desc'].apply(clean_posts)
posts = posts.dropna(subset=['cleaned_desc'])  # Remove rows with None in 'cleaned_desc' column

# Initialize SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Analyze sentiment for each cleaned post
posts['sentiment'] = posts['cleaned_desc'].apply(analyze_sentiment)

# Reset index to get a sequential label for filtered posts
posts = posts.reset_index(drop=True)
posts['post_label'] = 'post ' + (posts.index + 1).astype(str)

# Convert ObjectId to string
posts['userId'] = posts['userId'].astype(str)

# Simplify user IDs for filtered posts
user_mapping = {user_id: f'User {i + 1}' for i, user_id in enumerate(posts['userId'].unique())}
posts['user_label'] = posts['userId'].map(user_mapping)

# Save mapped posts and user labels if needed for future reference
# Example of saving to a CSV or other file format if needed:
posts[['post_label', 'cleaned_desc', 'sentiment', 'user_label']].to_csv('filtered_posts.csv', index=False)


Plotting for users by user no.:

In [4]:
# Create a figure
fig = make_subplots()

# Plot each user's posts
for user_id, user_label in user_mapping.items():
    user_posts = posts[posts['userId'] == user_id]
    
    # Plot only the user's posts as a line connecting their dots
    fig.add_trace(go.Scatter(
        x=user_posts.index + 1,    # Plot the post number (linear sequence)
        y=user_posts['sentiment'],  # Plot the sentiment score
        mode='lines+markers',          # Lines and markers
        name=user_label,               # User label for legend
        line=dict(width=1),            # Thin line
        marker=dict(size=8)            # Adjust marker size
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of posts per User',
    xaxis_title='posts (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',            # Keep linear tick mode
        tickvals=list(range(1, len(posts) + 1)),  # Ticks for each post
        ticktext=[str(i) for i in range(1, len(posts) + 1)],  # Custom tick text
        range=[1, len(posts)],     # Set x-axis range to start at 1
        showgrid=True,                # Optionally show grid lines for better readability
        zeroline=True                 # Optionally show a line at y=0
    ),
    yaxis=dict(
        autorange=True,               # Automatically adjust y-axis range
    ),
    legend_title="Users",
    hovermode="closest",
    width=1600,                    # Increase chart width for clarity
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)  # Add margins to avoid clutter
)

# Show the plot
fig.show()

Finding specific user's unfo:

In [17]:
# Find specific post details
# post (xy) by User (vw)
specific_post = posts[(posts['post_label'] == 'post 7743') & (posts['user_label'] == 'User 79')]

if not specific_post.empty:
    post_text = specific_post['desc'].values[0]
    user_name = specific_post['username'].values[0]
    print(f"post Number 7743 by {user_name}:\n{post_text}")
else:
    print("post not found.")


Comment Number 7743 by Dunkelgrauer Pinguin:
Your message: "Great points! Scientific communication is indeed essential in making science accessible and engaging for everyone. It's important for scientists and media outlets to be transparent and accurate in their reporting to ensure that the public has a clear understanding of the science behind the issues. In involving the public in scientific research and experiments can also help to increase their understanding of science and foster a sense of ownership and investment in scientific discoveries. Let's continue this conversation and explore more ideas on how we can make science more accessible and engaging for everyone!-----------------Your message: "Great points! Scientific communication is indeed essential in making science accessible and engaging for everyone. It's important for scientists and media outlets to be transparent and accurate in their reporting to ensure that the public has a clear understanding of the science behind the

Topic modelling:

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import re
import nltk
nltk.data.path.append(r'C:\Users\matej\AppData\Roaming\nltk_data')

# Download stopwords if not already downloaded
#nltk.download('stopwords')
#nltk.download('punkt')

# Set up stop words
stop_words = set(stopwords.words('english'))

# Preprocess the post text for topic modeling
def preprocess_text(text):
    # Lowercase, remove special characters, and tokenize
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    words = text.split()
    
    # Remove stopwords and short words
    words = [word for word in words if word not in stop_words and len(word) > 2]
    return words

# Apply preprocessing to each post
posts['processed_desc'] = posts['cleaned_desc'].apply(preprocess_text)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(posts['processed_desc'])
corpus = [dictionary.doc2bow(text) for text in posts['processed_desc']]

# Train the LDA model
num_topics = 5  # Define the number of topics
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Display the main topics
#for idx, topic in lda_model.print_topics(-1):
  #  print(f"Topic {idx + 1}: {topic}")


In [6]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Visualize the topics
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)


Plotting the sentiments per post for users with their names attached:

In [8]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Ensure the columns are of the same type and strip whitespace
posts['userId'] = posts['userId'].astype(str).str.strip()
users['_id'] = users['_id'].astype(str).str.strip()

# Print unique values to check for mismatches
print("Unique userIds in posts:", posts['userId'].unique())
print("Unique _id in users:", users['_id'].unique())

# Merge posts with users to get usernames using the correct column name
merged_data = posts.merge(users[['_id', 'username']], left_on='userId', right_on='_id', how='left')

# Check for NaN values in merged usernames
print("Number of NaN usernames after merge:", merged_data['username'].isna().sum())

# Create a figure
fig = make_subplots()

# Plot each user's posts using actual usernames from the merged DataFrame
for user_id in merged_data['userId'].unique():
    username = merged_data.loc[merged_data['userId'] == user_id, 'username'].iloc[0]
    user_posts = merged_data[merged_data['userId'] == user_id]
    
    fig.add_trace(go.Scatter(
        x=user_posts.index + 1,
        y=user_posts['sentiment'],
        mode='lines+markers',
        name=username,
        line=dict(width=1),
        marker=dict(size=8)
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of Posts per User',
    xaxis_title='Posts (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',
        tickvals=list(range(1, len(merged_data) + 1)),
        ticktext=[str(i) for i in range(1, len(merged_data) + 1)],
        range=[1, len(merged_data)],
        showgrid=True,
        zeroline=True
    ),
    yaxis=dict(
        autorange=True,
    ),
    legend_title="Users",
    hovermode="closest",
    width=1600,
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)
)

# Show the plot
fig.show()


Unique userIds in posts: ['671fd411d83590a3c09cab76' '671fd412d83590a3c09cab84'
 '671fd411d83590a3c09cab7c' '671fd411d83590a3c09cab7d'
 '671fd411d83590a3c09cab78' '671fd411d83590a3c09cab77'
 '671fd412d83590a3c09cab81' '671fd411d83590a3c09cab7f'
 '671fd411d83590a3c09cab79' '671fd411d83590a3c09cab7e'
 '671fd412d83590a3c09cab83' '671fd411d83590a3c09cab7a'
 '671fd411d83590a3c09cab80' '671fd412d83590a3c09cab82']
Unique _id in users: ['671fd411d83590a3c09cab76' '671fd411d83590a3c09cab77'
 '671fd411d83590a3c09cab78' '671fd411d83590a3c09cab79'
 '671fd411d83590a3c09cab7a' '671fd411d83590a3c09cab7b'
 '671fd411d83590a3c09cab7c' '671fd411d83590a3c09cab7d'
 '671fd411d83590a3c09cab7e' '671fd411d83590a3c09cab7f'
 '671fd411d83590a3c09cab80' '671fd412d83590a3c09cab81'
 '671fd412d83590a3c09cab82' '671fd412d83590a3c09cab83'
 '671fd412d83590a3c09cab84']
Number of NaN usernames after merge: 0


Connecting the names to roles and than graphing by the roles:

In [23]:
import re
import json

# Function to extract roles from the text file
def extract_roles_from_file(file_path):
    roles = []
    
    # Read the content of the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        for line in lines:
            # Use regex to find the persona value
            match = re.search(r"persona:\s*'([^']+)'", line)
            if match:
                roles.append(match.group(1))  # Append the extracted persona to the list

    return roles

# Specify the path to your text file
file_path = 'roles_with_agents.txt'

# Get the roles
roles_array = extract_roles_from_file(file_path)

# Print the roles array
print(roles_array)

['academic communications', 'academic researcher', 'academic researcher', 'academic researcher', 'academic researcher', 'academic science', 'academic science', 'academic science', 'academic writer', 'biologist', 'biology', 'biotechnology researcher', 'climate science', 'editor', 'editor', 'editor', 'educator', 'entry-level biology', 'environmental science', 'environmental', 'environmental', 'general-interest science', 'journalist', 'journalist', 'medical', 'microbiologist', 'microbiologist', 'molecular biologist', 'news aggregator', 'research coordinator', 'science communications', 'science communicator,', 'science communicator', 'science communicator', 'science communicator', 'science communicator', 'science editor', 'science editor', 'science editor', 'science editor', 'science enthusiast', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist'

In [24]:
import re

# Function to extract names from the text file
def extract_names_from_file(file_path):
    names = []
    
    # Read the content of the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        for line in lines:
            # Use regex to find the name value
            match = re.search(r'=\s*"([^"]+)"', line)
            if match:
                names.append(match.group(1))  # Append the extracted name to the list

    return names

# Specify the path to your text file
file_path = 'names_with_agents.txt'

# Get the names
names_array = extract_names_from_file(file_path)

# Print the names array
print(names_array)


['Dunkelschieferblaue Biene', 'Sienna-Kuh', 'Darkslategray-Wissenschaftler', 'Dunkelschiefergraue Katze', 'Silberner BÃ¤r', 'SeegrÃ¼ner Papagei', 'Kadettenblaues Siegel', 'Dunkelschieferblaue Giraffe', 'Hellschiefergrauer LÃ¶we', 'Dunkelkhakifarbenes Rentier', 'Siena Biene', 'Peru-BÃ¼ffel', 'Peruanisches ErdmÃ¤nnchen', 'DÃ¼sterer Professor', 'Rosabraunes Zebra', 'Dunkelschiefergraue Kuh', 'Grauer Koala', 'Dunkelgrauer Orca', 'Darkslategray-Wissenschaftler', 'Silberner BlauhÃ¤her', 'Graues Huhn', 'Dunkelschiefergrauer Frosch', 'Silberner WaschbÃ¤r', 'Rosabraunes Lamm', 'BlaugrÃ¼ne Blume', 'Schwarzer Biker', 'Dunkelschiefergrauer Fuchs', 'Dunkelgrauer Igel', 'Schiefergraue Eule', 'Dunkelschiefergrauer Hund', 'Rosabrauner Fuchs', 'Himmelblauer Delphin', 'Dunkellachs-Warzenschwein', 'Rosabrauner Vogel', 'Dunkelschiefergraues Schwein', 'Schwarzer Tiger', 'Indigo-Truthahn', 'Schwarzer Astronaut', 'Nachtblauer Cowboy', 'Hellstahlblaue Krabbe', 'Dunkelschiefergraues Siegel', 'Dunkelschiefergra

In [28]:

import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Map usernames to their roles
user_role_mapping = dict(zip(names_array, roles_array))


# Map each post's username to its role
posts['role'] = posts['username'].map(user_role_mapping)

# Create a figure
fig = make_subplots()

# Plot each role's posts without connecting individual users' posts
for role in posts['role'].unique():
    # Filter posts for the current role
    role_posts = posts[posts['role'] == role]
    
    # Plot posts for this role as individual points
    fig.add_trace(go.Scatter(
        x=role_posts.index + 1,    # Sequential post order
        y=role_posts['sentiment'],  # Sentiment score
        mode='lines+markers',          # Lines and markers
        name=role,                     # Role label for legend
        marker=dict(size=8),            # Adjust marker size
        line=dict(width=0.7)            # Thin line
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of posts per Role',
    xaxis_title='posts (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',               # Keep linear tick mode
        tickvals=list(range(1, len(posts) + 1)),  # Ticks for each post
        ticktext=[str(i) for i in range(1, len(posts) + 1)],  # Custom tick text
        range=[1, len(posts)],        # Set x-axis range to start at 1
        showgrid=True,                   # Optionally show grid lines for readability
        zeroline=True                    # Optionally show a line at y=0
    ),
    yaxis=dict(
        autorange=True,                  # Automatically adjust y-axis range
    ),
    legend_title="Roles",
    hovermode="closest",
    width=1600,                          # Increase chart width for clarity
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)  # Add margins to avoid clutter
)

# Show the plot
fig.show()
