# Import Data

In [62]:
import numpy as np

In [63]:
import pandas as pd

In [64]:
# Load the CSV file into a DataFrame with the correct encoding
uniswap = pd.read_csv('/content/Uniswap - Governance - 🏛│governance [755969053280960533].csv', encoding='ISO-8859-1')

# Assume uniswap is the DataFrame you want to remove unnamed columns from
uniswap = uniswap.loc[:, ~uniswap.columns.str.startswith('Unnamed:')]

In [65]:
uniswap.head()

Unnamed: 0,AuthorID,Author,Date,Content,Attachments,Reactions
0,274208745699934220,callil_,09/17/2020 9:52 AM,Welcome to the governance channel! This channe...,,ð¦ (6)
1,274208745699934220,callil_,09/17/2020 9:52 AM,Pinned a message.,,
2,171049984060424193,chevispreston,09/17/2020 10:25 AM,This is the channel to be in :smug:,,ð (2)
3,394328926966775820,spritemoney,09/17/2020 10:28 AM,Make your token Liquidity locked!!,,
4,171049984060424193,chevispreston,09/17/2020 10:29 AM,"Waiting for all of the ""pls incentivize sushis...",,ð (1)


In [66]:
uniswap.tail()

Unnamed: 0,AuthorID,Author,Date,Content,Attachments,Reactions
11578,571786929255874641,waterfound,08/12/2023 6:18 AM,"I think it was a long time ago, reaching by di...",,
11579,887430423238561823,ibracadabra0000,08/12/2023 3:05 PM,how can i particioate in governance votings?,,
11580,887430423238561823,ibracadabra0000,08/12/2023 3:09 PM,i just must have UNI on Ethereum mainnet? or c...,,
11581,571786929255874641,waterfound,08/13/2023 2:32 AM,On ETH,,ð (1)
11582,471357824299958273,crypto_rachel,08/13/2023 4:20 AM,The Uni tokens need to be on the ETH network a...,,ð (1)


# Text Processing

In [67]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download stopwords and stemmer
nltk.download('stopwords')
nltk.download('punkt')

# Define the preprocessing function
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Check if text is NaN
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words and punctuation
    words = [word for word in words if word not in stop_words and word.isalpha()]

    # Join the remaining words into a string
    text = ' '.join(words)

    return text

# Apply the preprocessing function to the 'Content' column
uniswap['Preprocessed'] = uniswap['Content'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [68]:
uniswap.tail()

Unnamed: 0,AuthorID,Author,Date,Content,Attachments,Reactions,Preprocessed
11578,571786929255874641,waterfound,08/12/2023 6:18 AM,"I think it was a long time ago, reaching by di...",,,think long time ago reaching discord presentin...
11579,887430423238561823,ibracadabra0000,08/12/2023 3:05 PM,how can i particioate in governance votings?,,,particioate governance votings
11580,887430423238561823,ibracadabra0000,08/12/2023 3:09 PM,i just must have UNI on Ethereum mainnet? or c...,,,must uni ethereum mainnet uni polygon vote mus...
11581,571786929255874641,waterfound,08/13/2023 2:32 AM,On ETH,,ð (1),eth
11582,471357824299958273,crypto_rachel,08/13/2023 4:20 AM,The Uni tokens need to be on the ETH network a...,,ð (1),uni tokens need eth network delegated address


In [69]:
# Save the cleaned DataFrame to a new CSV file
uniswap.to_csv('cleaned_discord_uniswap_governance.csv', index=False)

# Sentiment Analysis (VADER)

## Date-based Sentiment

In [70]:
! pip install vaderSentiment



In [71]:
uniswap.dropna(subset=['Date'], inplace=True)

In [72]:
uniswap.tail()

Unnamed: 0,AuthorID,Author,Date,Content,Attachments,Reactions,Preprocessed
11578,571786929255874641,waterfound,08/12/2023 6:18 AM,"I think it was a long time ago, reaching by di...",,,think long time ago reaching discord presentin...
11579,887430423238561823,ibracadabra0000,08/12/2023 3:05 PM,how can i particioate in governance votings?,,,particioate governance votings
11580,887430423238561823,ibracadabra0000,08/12/2023 3:09 PM,i just must have UNI on Ethereum mainnet? or c...,,,must uni ethereum mainnet uni polygon vote mus...
11581,571786929255874641,waterfound,08/13/2023 2:32 AM,On ETH,,ð (1),eth
11582,471357824299958273,crypto_rachel,08/13/2023 4:20 AM,The Uni tokens need to be on the ETH network a...,,ð (1),uni tokens need eth network delegated address


In [73]:
UNISWAP = uniswap
UNISWAP.head()

Unnamed: 0,AuthorID,Author,Date,Content,Attachments,Reactions,Preprocessed
0,274208745699934220,callil_,09/17/2020 9:52 AM,Welcome to the governance channel! This channe...,,ð¦ (6),welcome governance channel channel open discus...
1,274208745699934220,callil_,09/17/2020 9:52 AM,Pinned a message.,,,pinned message
2,171049984060424193,chevispreston,09/17/2020 10:25 AM,This is the channel to be in :smug:,,ð (2),channel smug
3,394328926966775820,spritemoney,09/17/2020 10:28 AM,Make your token Liquidity locked!!,,,make token liquidity locked
4,171049984060424193,chevispreston,09/17/2020 10:29 AM,"Waiting for all of the ""pls incentivize sushis...",,ð (1),waiting pls incentivize sushiswap come back po...


In [74]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Convert the 'Date' column to a datetime object and set it as the index
UNISWAP['Date'] = pd.to_datetime(UNISWAP['Date'], errors='coerce')
UNISWAP.set_index('Date', inplace=True)

# Define a function to compute the sentiment score for each day
def daily_sentiment_score(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

# Compute the daily sentiment score using resample and apply
uniswap_daily = UNISWAP['Preprocessed'].resample('D').apply(lambda x: x.apply(daily_sentiment_score).mean())


In [75]:
uniswap_daily

Date
2020-09-17    0.165781
2020-09-18    0.181287
2020-09-19    0.175715
2020-09-20    0.202875
2020-09-21    0.097097
                ...   
2023-08-09         NaN
2023-08-10         NaN
2023-08-11         NaN
2023-08-12    0.015317
2023-08-13    0.000000
Freq: D, Name: Preprocessed, Length: 1061, dtype: float64

In [76]:
uniswap_daily_sentiment = uniswap_daily.to_frame(name='Sentiment Score').reset_index()

In [77]:
uniswap_daily_sentiment.columns

Index(['Date', 'Sentiment Score'], dtype='object')

In [78]:
uniswap_daily_sentiment.head()

Unnamed: 0,Date,Sentiment Score
0,2020-09-17,0.165781
1,2020-09-18,0.181287
2,2020-09-19,0.175715
3,2020-09-20,0.202875
4,2020-09-21,0.097097


In [79]:
uniswap_daily_sentiment['Sentiment Score'].fillna(0.000000, inplace=True)

In [80]:
uniswap_daily_sentiment

Unnamed: 0,Date,Sentiment Score
0,2020-09-17,0.165781
1,2020-09-18,0.181287
2,2020-09-19,0.175715
3,2020-09-20,0.202875
4,2020-09-21,0.097097
...,...,...
1056,2023-08-09,0.000000
1057,2023-08-10,0.000000
1058,2023-08-11,0.000000
1059,2023-08-12,0.015317


In [81]:
uniswap_daily_sentiment.to_csv('sentiment_discord_uniswap_governance.csv', index=False)

In [82]:
import plotly.graph_objs as go

# Create a line graph using Plotly
fig = go.Figure()

# Add a line trace to the figure
fig.add_trace(go.Scatter(x=uniswap_daily_sentiment['Date'], y=uniswap_daily_sentiment['Sentiment Score'], mode='lines'))

# Customize the layout of the figure
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Sentiment Score',
    xaxis=dict(title_font=dict(size=20), tickfont=dict(size=14), showline=True, linewidth=2, linecolor='black'),
    yaxis=dict(title_font=dict(size=20), tickfont=dict(size=14), showline=True, linewidth=2, linecolor='black')
)

# Display the figure
fig.show()


In [83]:
positive_count = len(uniswap_daily_sentiment[uniswap_daily_sentiment['Sentiment Score'] > 0])
negative_count = len(uniswap_daily_sentiment[uniswap_daily_sentiment['Sentiment Score'] < 0])


In [84]:
import plotly.express as px

# Calculate positive and negative sentiment counts
positive_count = (uniswap_daily_sentiment['Sentiment Score'] > 0).sum()
negative_count = (uniswap_daily_sentiment['Sentiment Score'] < 0).sum()

# Create a dataframe for histogram
hist_uniswap = pd.DataFrame({
    'Sentiment': ['Positive', 'Negative'],
    'Count': [positive_count, negative_count]
})

# Create a plotly histogram
fig = px.histogram(hist_uniswap, x='Sentiment', y='Count', color='Sentiment',
                   title='Uniswap Discord Sentiment Distribution')

# Set y-axis label
fig.update_layout(yaxis_title='Count (days)')

# Add count number on each bar
fig.update_traces(texttemplate='%{y}', textposition='outside')

fig.show()


In [85]:
# Calculate overall statistics on the sentiment scores
overall_sentiment_stats = uniswap_daily_sentiment['Sentiment Score'].describe()

# Output overall sentiment statistics
print("Overall Sentiment Statistics:")
print(overall_sentiment_stats)

Overall Sentiment Statistics:
count    1061.000000
mean        0.153916
std         0.232260
min        -0.517050
25%         0.000000
50%         0.012157
75%         0.254440
max         0.991300
Name: Sentiment Score, dtype: float64


## User-based Sentiment

In [86]:
# Create a new column to store sentiment scores
UNISWAP['Sentiment_Score'] = 0.0

# Group the data based on AuthorID
grouped = UNISWAP.groupby('AuthorID')

# Iterate through each group
for author, group in grouped:
    # Perform sentiment analysis on the content of each group and store sentiment scores
    sentiment_scores = []
    for content in group['Content']:
        # Check if the content is a string; if not, convert it to a string
        if not isinstance(content, str):
            content = str(content)
        sentiment = analyzer.polarity_scores(content)
        sentiment_scores.append(sentiment['compound'])

    # Assign the average sentiment score to all rows in each group
    UNISWAP.loc[group.index, 'Sentiment_Score'] = sum(sentiment_scores) / len(sentiment_scores)

# Now, your DataFrame will have a new column named 'Sentiment_Score' containing the average sentiment score for each user
# You can further analyze or visualize these sentiment scores as needed

In [87]:
# Calculate the number of messages per user
message_count_per_user = UNISWAP['AuthorID'].value_counts()

# Calculate user count
user_count = len(message_count_per_user)

# Calculate 25th, median (50th), 75th, minimum, and maximum percentiles of message counts
message_count_percentiles = message_count_per_user.describe(percentiles=[0.25, 0.5, 0.75])

# Calculate the mean number of messages per user
mean_messages_per_user = message_count_per_user.mean()

# Calculate the minimum and maximum message counts
min_message_count = message_count_per_user.min()
max_message_count = message_count_per_user.max()

# Calculate the distribution of average sentiment scores per user's message
sentiment_scores_per_user = UNISWAP.groupby('AuthorID')['Sentiment_Score'].mean()

# Print the results
print("User Count:", user_count)
print("25th Percentile of Message Counts:", message_count_percentiles['25%'])
print("Median (50th Percentile) of Message Counts:", message_count_percentiles['50%'])
print("75th Percentile of Message Counts:", message_count_percentiles['75%'])
print("Minimum Message Count:", min_message_count)
print("Maximum Message Count:", max_message_count)
print("Mean Number of Messages per User:", mean_messages_per_user)
print("Distribution of Average Sentiment Scores per User's Message:")
print(sentiment_scores_per_user)

User Count: 1001
25th Percentile of Message Counts: 1.0
Median (50th Percentile) of Message Counts: 2.0
75th Percentile of Message Counts: 5.0
Minimum Message Count: 1
Maximum Message Count: 1226
Mean Number of Messages per User: 11.571428571428571
Distribution of Average Sentiment Scores per User's Message:
AuthorID
67321076140478464      0.000000
83699343017644032     -0.421500
95507078788947968      0.625800
95645783323320320      0.171989
98823810068774912      0.191622
                         ...   
1096862148682059937    0.430350
1097252580943470735    0.476700
1099703903727140916    0.238350
1115872498806947902    0.000000
1115990754385481728    0.000000
Name: Sentiment_Score, Length: 1001, dtype: float64


In [88]:
# Convert the Series into a DataFrame with a named column
sentiment_scores_df = sentiment_scores_per_user.reset_index()
sentiment_scores_df.columns = ['AuthorID', 'Average_Sentiment_Score']

# Print the resulting DataFrame
sentiment_scores_df

Unnamed: 0,AuthorID,Average_Sentiment_Score
0,67321076140478464,0.000000
1,83699343017644032,-0.421500
2,95507078788947968,0.625800
3,95645783323320320,0.171989
4,98823810068774912,0.191622
...,...,...
996,1096862148682059937,0.430350
997,1097252580943470735,0.476700
998,1099703903727140916,0.238350
999,1115872498806947902,0.000000


In [89]:
# Count the number of records with sentiment score equal to 0
count_zero_sentiment = (sentiment_scores_df['Average_Sentiment_Score'] == 0).sum()

# Print the count
print("Number of records with sentiment score equal to 0:", count_zero_sentiment)

Number of records with sentiment score equal to 0: 218


In [90]:
# Drop rows where Average_Sentiment_Score is 0
sentiment_scores_df_new = sentiment_scores_df[sentiment_scores_df['Average_Sentiment_Score'] != 0]

# Print the resulting DataFrame
sentiment_scores_df_new

Unnamed: 0,AuthorID,Average_Sentiment_Score
1,83699343017644032,-0.421500
2,95507078788947968,0.625800
3,95645783323320320,0.171989
4,98823810068774912,0.191622
5,101156894864838656,0.361200
...,...,...
992,1043865424799989760,0.038600
993,1062045363705479298,0.440400
996,1096862148682059937,0.430350
997,1097252580943470735,0.476700


In [91]:
import plotly.express as px

# Assuming you have a DataFrame named sentiment_scores_df_new
# If you haven't already created it, please follow the previous response

# Create a histogram
fig = px.histogram(sentiment_scores_df_new, x='Average_Sentiment_Score', title='Distribution of Average Sentiment Scores per User\'s Message')

# Show the histogram
fig.show()

In [92]:
# Define the bins, excluding 0 as a separate interval
bins = [-1.0, -0.5, 0.0, 0.5, 1.0]
hist, bin_edges = pd.cut(sentiment_scores_df_new['Average_Sentiment_Score'], bins, right=False, retbins=True)

# Create a DataFrame to display the frequency table
frequency_table = pd.value_counts(hist, sort=False).reset_index()
frequency_table.columns = ['Sentiment_Score_Range', 'Frequency']

# Calculate the proportion of each bin
total_records = len(sentiment_scores_df_new)
frequency_table['Proportion'] = frequency_table['Frequency'] / total_records

# Display the frequency table
print(frequency_table)

  Sentiment_Score_Range  Frequency  Proportion
0          [-1.0, -0.5)         16    0.020434
1           [-0.5, 0.0)         86    0.109834
2            [0.0, 0.5)        540    0.689655
3            [0.5, 1.0)        141    0.180077


# Discord Volume

In [93]:
# Group the discord messages by day and count the number of discord messages in each day
daily_discord_count = UNISWAP['Content'].resample('D').count()

# Print the daily discord messages count
print(daily_discord_count)

Date
2020-09-17     90
2020-09-18    121
2020-09-19     73
2020-09-20     91
2020-09-21     29
             ... 
2023-08-09      0
2023-08-10      0
2023-08-11      0
2023-08-12      6
2023-08-13      2
Freq: D, Name: Content, Length: 1061, dtype: int64


In [94]:
daily_discord_count = daily_discord_count.to_frame(name='Discord Volume').reset_index()

In [95]:
daily_discord_count

Unnamed: 0,Date,Discord Volume
0,2020-09-17,90
1,2020-09-18,121
2,2020-09-19,73
3,2020-09-20,91
4,2020-09-21,29
...,...,...
1056,2023-08-09,0
1057,2023-08-10,0
1058,2023-08-11,0
1059,2023-08-12,6


In [96]:
daily_discord_count.to_csv('discord_volume_uniswap_governance.csv', index=False)

In [97]:
import plotly.graph_objs as go

# Create a line graph using Plotly
fig = go.Figure()

# Add a line trace to the figure
fig.add_trace(go.Scatter(x=daily_discord_count['Date'], y=daily_discord_count['Discord Volume'], mode='lines'))

# Customize the layout of the figure
fig.update_layout(title='Uniswap Discord Daily Message Volume', xaxis_title='Date', yaxis_title='Uniswap Discord Message Volume')

# Display the figure
fig.show()


In [98]:
# Discord Volume

# Group the discord messages by day and count the number of discord messages in each day
daily_discord_count = UNISWAP['Content'].resample('D').count()

# Function to detect outliers in the daily Discord message count
def detect_outliers(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (data < lower_bound) | (data > upper_bound)
    return outliers

# Detect outliers in Discord message count using the above function
outliers = detect_outliers(daily_discord_count)

# Remove outliers from the daily Discord message count
daily_discord_count_cleaned = daily_discord_count[~outliers]

# Save the cleaned data to a CSV
daily_discord_count_cleaned.to_csv('cleaned_discord_volume_uniswap_governance.csv', index=False)

In [99]:
daily_discord_count_cleaned = daily_discord_count_cleaned.to_frame(name='Discord Volume').reset_index()

In [100]:
# Create a line graph for Discord message count with and without outliers
fig = go.Figure()

# Add a line without outliers
fig.add_trace(go.Scatter(x=daily_discord_count_cleaned['Date'], y=daily_discord_count_cleaned['Discord Volume'], mode='lines', name='Cleaned'))

# Customize the layout of the figure with increased font sizes
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Discord Message Volume (Count)',
    xaxis=dict(title_font=dict(size=20), tickfont=dict(size=14), showline=True, linewidth=2, linecolor='black'),
    yaxis=dict(title_font=dict(size=20), tickfont=dict(size=14), showline=True, linewidth=2, linecolor='black')
)

# Display the figure
fig.show()