In this code we are calculating the sentiments of comments and plotting them. After that we are checking the names and and comments of the local extremes.

In [1]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from pymongo import MongoClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.io as pio

Version of the code with sorted out comments:

In [None]:
# Set Plotly renderer
pio.renderers.default = 'browser'

# MongoDB connection
client = MongoClient("")
db = client['ttt4']  # Replace with your database name

# Load comments collection
comments = pd.DataFrame(list(db.comments.find()))

# Define function to filter out unwanted comments
def clean_comments(comment):
    if len(comment) < 15:  # Remove comments shorter than 15 characters
        return None
    cleaned_comment = comment.replace('-', '')  # Replace all '-' symbols with an empty string
    if cleaned_comment.strip() == '':  # Check if the comment is empty after removing '-'
        return None
    return cleaned_comment

# Apply the filter function to remove unwanted comments
comments['cleaned_body'] = comments['body'].apply(clean_comments)
comments = comments.dropna(subset=['cleaned_body'])  # Remove rows with None in 'cleaned_body' column

# Initialize SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score['compound']  # Return the compound score

# Analyze sentiment for each cleaned comment
comments['sentiment'] = comments['cleaned_body'].apply(analyze_sentiment)

# Reset index to get a sequential label for filtered comments
comments = comments.reset_index(drop=True)
comments['comment_label'] = 'Comment ' + (comments.index + 1).astype(str)

# Convert ObjectId to string
comments['userId'] = comments['userId'].astype(str)

# Simplify user IDs for filtered comments
user_mapping = {user_id: f'User {i + 1}' for i, user_id in enumerate(comments['userId'].unique())}
comments['user_label'] = comments['userId'].map(user_mapping)

# Save mapped comments and user labels if needed for future reference
# Example of saving to a CSV or other file format if needed:
comments[['comment_label', 'cleaned_body', 'sentiment', 'user_label']].to_csv('filtered_comments.csv', index=False)


Plotting for users by user no.:

In [6]:
# Create a figure
fig = make_subplots()

# Plot each user's comments
for user_id, user_label in user_mapping.items():
    user_comments = comments[comments['userId'] == user_id]
    
    # Plot only the user's comments as a line connecting their dots
    fig.add_trace(go.Scatter(
        x=user_comments.index + 1,    # Plot the comment number (linear sequence)
        y=user_comments['sentiment'],  # Plot the sentiment score
        mode='lines+markers',          # Lines and markers
        name=user_label,               # User label for legend
        line=dict(width=1),            # Thin line
        marker=dict(size=8)            # Adjust marker size
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of Comments per User',
    xaxis_title='Comments (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',            # Keep linear tick mode
        tickvals=list(range(1, len(comments) + 1)),  # Ticks for each comment
        ticktext=[str(i) for i in range(1, len(comments) + 1)],  # Custom tick text
        range=[1, len(comments)],     # Set x-axis range to start at 1
        showgrid=True,                # Optionally show grid lines for better readability
        zeroline=True                 # Optionally show a line at y=0
    ),
    yaxis=dict(
        autorange=True,               # Automatically adjust y-axis range
    ),
    legend_title="Users",
    hovermode="closest",
    width=1600,                    # Increase chart width for clarity
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)  # Add margins to avoid clutter
)

# Show the plot
fig.show()

Finding specific user's unfo:

In [17]:
# Find specific comment details
# Comment (xy) by User (vw)
specific_comment = comments[(comments['comment_label'] == 'Comment 7743') & (comments['user_label'] == 'User 79')]

if not specific_comment.empty:
    comment_text = specific_comment['body'].values[0]
    user_name = specific_comment['username'].values[0]
    print(f"Comment Number 7743 by {user_name}:\n{comment_text}")
else:
    print("Comment not found.")


Comment Number 7743 by Dunkelgrauer Pinguin:
Your message: "Great points! Scientific communication is indeed essential in making science accessible and engaging for everyone. It's important for scientists and media outlets to be transparent and accurate in their reporting to ensure that the public has a clear understanding of the science behind the issues. In involving the public in scientific research and experiments can also help to increase their understanding of science and foster a sense of ownership and investment in scientific discoveries. Let's continue this conversation and explore more ideas on how we can make science more accessible and engaging for everyone!-----------------Your message: "Great points! Scientific communication is indeed essential in making science accessible and engaging for everyone. It's important for scientists and media outlets to be transparent and accurate in their reporting to ensure that the public has a clear understanding of the science behind the

Topic modelling:

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models import LdaModel
import re
import nltk
nltk.data.path.append(r'C:\Users\matej\AppData\Roaming\nltk_data')

# Download stopwords if not already downloaded
#nltk.download('stopwords')
#nltk.download('punkt')

# Set up stop words
stop_words = set(stopwords.words('english'))

# Preprocess the comment text for topic modeling
def preprocess_text(text):
    # Lowercase, remove special characters, and tokenize
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    words = text.split()
    
    # Remove stopwords and short words
    words = [word for word in words if word not in stop_words and len(word) > 2]
    return words

# Apply preprocessing to each comment
comments['processed_body'] = comments['cleaned_body'].apply(preprocess_text)

# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary(comments['processed_body'])
corpus = [dictionary.doc2bow(text) for text in comments['processed_body']]

# Train the LDA model
num_topics = 5  # Define the number of topics
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Display the main topics
#for idx, topic in lda_model.print_topics(-1):
  #  print(f"Topic {idx + 1}: {topic}")


Topic 1: 0.082*"points" + 0.081*"great" + 0.080*"indeed" + 0.078*"essential" + 0.071*"public" + 0.063*"scientific" + 0.043*"communication" + 0.039*"understanding" + 0.026*"scientists" + 0.026*"important"
Topic 2: 0.085*"public" + 0.057*"understanding" + 0.057*"accurate" + 0.057*"scientific" + 0.057*"communication" + 0.029*"outlets" + 0.029*"clear" + 0.029*"reporting" + 0.029*"transparent" + 0.028*"issues"
Topic 3: 0.083*"scientific" + 0.082*"public" + 0.053*"understanding" + 0.052*"communication" + 0.034*"complex" + 0.034*"findings" + 0.034*"understand" + 0.034*"helping" + 0.034*"concepts" + 0.032*"research"
Topic 4: 0.080*"science" + 0.060*"scientific" + 0.040*"accessible" + 0.040*"engaging" + 0.040*"everyone" + 0.040*"public" + 0.040*"understanding" + 0.021*"ideas" + 0.021*"make" + 0.020*"research"
Topic 5: 0.092*"public" + 0.064*"scientific" + 0.062*"communication" + 0.034*"work" + 0.034*"information" + 0.033*"together" + 0.033*"improve" + 0.033*"unbiased" + 0.033*"ways" + 0.033*"ac

In [16]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Visualize the topics
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)


Plotting the sentiments per comment for users with their names attached:

In [18]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create a figure
fig = make_subplots()

# Plot each user's comments using actual usernames from the comments DataFrame
for user_id in comments['userId'].unique():
    # Get the actual username from the DataFrame based on user_id
    username = comments.loc[comments['userId'] == user_id, 'username'].iloc[0]  # Assuming 'username' column has the names
    
    # Filter comments by the current user_id
    user_comments = comments[comments['userId'] == user_id]
    
    # Plot only the user's comments as a line connecting their dots
    fig.add_trace(go.Scatter(
        x=user_comments.index + 1,    # Plot the comment number (linear sequence)
        y=user_comments['sentiment'],  # Plot the sentiment score
        mode='lines+markers',          # Lines and markers
        name=username,                 # Username for legend
        line=dict(width=1),            # Thin line
        marker=dict(size=8)            # Adjust marker size
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of Comments per User',
    xaxis_title='Comments (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',            # Keep linear tick mode
        tickvals=list(range(1, len(comments) + 1)),  # Ticks for each comment
        ticktext=[str(i) for i in range(1, len(comments) + 1)],  # Custom tick text
        range=[1, len(comments)],     # Set x-axis range to start at 1
        showgrid=True,                # Optionally show grid lines for better readability
        zeroline=True                 # Optionally show a line at y=0
    ),
    yaxis=dict(
        autorange=True,               # Automatically adjust y-axis range
    ),
    legend_title="Users",
    hovermode="closest",
    width=1600,                      # Increase chart width for clarity
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)  # Add margins to avoid clutter
)

# Show the plot
fig.show()


Connecting the names to roles and than graphing by the roles:

In [23]:
import re
import json

# Function to extract roles from the text file
def extract_roles_from_file(file_path):
    roles = []
    
    # Read the content of the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        for line in lines:
            # Use regex to find the persona value
            match = re.search(r"persona:\s*'([^']+)'", line)
            if match:
                roles.append(match.group(1))  # Append the extracted persona to the list

    return roles

# Specify the path to your text file
file_path = 'roles_with_agents.txt'

# Get the roles
roles_array = extract_roles_from_file(file_path)

# Print the roles array
print(roles_array)

['academic communications', 'academic researcher', 'academic researcher', 'academic researcher', 'academic researcher', 'academic science', 'academic science', 'academic science', 'academic writer', 'biologist', 'biology', 'biotechnology researcher', 'climate science', 'editor', 'editor', 'editor', 'educator', 'entry-level biology', 'environmental science', 'environmental', 'environmental', 'general-interest science', 'journalist', 'journalist', 'medical', 'microbiologist', 'microbiologist', 'molecular biologist', 'news aggregator', 'research coordinator', 'science communications', 'science communicator,', 'science communicator', 'science communicator', 'science communicator', 'science communicator', 'science editor', 'science editor', 'science editor', 'science editor', 'science enthusiast', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist', 'science journalist'

In [24]:
import re

# Function to extract names from the text file
def extract_names_from_file(file_path):
    names = []
    
    # Read the content of the file
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        for line in lines:
            # Use regex to find the name value
            match = re.search(r'=\s*"([^"]+)"', line)
            if match:
                names.append(match.group(1))  # Append the extracted name to the list

    return names

# Specify the path to your text file
file_path = 'names_with_agents.txt'

# Get the names
names_array = extract_names_from_file(file_path)

# Print the names array
print(names_array)


['Dunkelschieferblaue Biene', 'Sienna-Kuh', 'Darkslategray-Wissenschaftler', 'Dunkelschiefergraue Katze', 'Silberner BÃ¤r', 'SeegrÃ¼ner Papagei', 'Kadettenblaues Siegel', 'Dunkelschieferblaue Giraffe', 'Hellschiefergrauer LÃ¶we', 'Dunkelkhakifarbenes Rentier', 'Siena Biene', 'Peru-BÃ¼ffel', 'Peruanisches ErdmÃ¤nnchen', 'DÃ¼sterer Professor', 'Rosabraunes Zebra', 'Dunkelschiefergraue Kuh', 'Grauer Koala', 'Dunkelgrauer Orca', 'Darkslategray-Wissenschaftler', 'Silberner BlauhÃ¤her', 'Graues Huhn', 'Dunkelschiefergrauer Frosch', 'Silberner WaschbÃ¤r', 'Rosabraunes Lamm', 'BlaugrÃ¼ne Blume', 'Schwarzer Biker', 'Dunkelschiefergrauer Fuchs', 'Dunkelgrauer Igel', 'Schiefergraue Eule', 'Dunkelschiefergrauer Hund', 'Rosabrauner Fuchs', 'Himmelblauer Delphin', 'Dunkellachs-Warzenschwein', 'Rosabrauner Vogel', 'Dunkelschiefergraues Schwein', 'Schwarzer Tiger', 'Indigo-Truthahn', 'Schwarzer Astronaut', 'Nachtblauer Cowboy', 'Hellstahlblaue Krabbe', 'Dunkelschiefergraues Siegel', 'Dunkelschiefergra

In [28]:

import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Map usernames to their roles
user_role_mapping = dict(zip(names_array, roles_array))


# Map each comment's username to its role
comments['role'] = comments['username'].map(user_role_mapping)

# Create a figure
fig = make_subplots()

# Plot each role's comments without connecting individual users' comments
for role in comments['role'].unique():
    # Filter comments for the current role
    role_comments = comments[comments['role'] == role]
    
    # Plot comments for this role as individual points
    fig.add_trace(go.Scatter(
        x=role_comments.index + 1,    # Sequential comment order
        y=role_comments['sentiment'],  # Sentiment score
        mode='lines+markers',          # Lines and markers
        name=role,                     # Role label for legend
        marker=dict(size=8),            # Adjust marker size
        line=dict(width=0.7)            # Thin line
    ))

# Update layout for better readability
fig.update_layout(
    title='Sentiment of Comments per Role',
    xaxis_title='Comments (Sequential)',
    yaxis_title='Sentiment Score',
    xaxis=dict(
        tickmode='linear',               # Keep linear tick mode
        tickvals=list(range(1, len(comments) + 1)),  # Ticks for each comment
        ticktext=[str(i) for i in range(1, len(comments) + 1)],  # Custom tick text
        range=[1, len(comments)],        # Set x-axis range to start at 1
        showgrid=True,                   # Optionally show grid lines for readability
        zeroline=True                    # Optionally show a line at y=0
    ),
    yaxis=dict(
        autorange=True,                  # Automatically adjust y-axis range
    ),
    legend_title="Roles",
    hovermode="closest",
    width=1600,                          # Increase chart width for clarity
    height=600,
    margin=dict(l=40, r=40, t=50, b=40)  # Add margins to avoid clutter
)

# Show the plot
fig.show()
