# Import Data

In [16]:
import numpy as np

In [17]:
import pandas as pd

In [18]:
# Load the CSV file into a DataFrame with the correct encoding
yfi = pd.read_csv('/content/Yearn Talk - general - 🗳governance [734805853768777738].csv', encoding='ISO-8859-1')

# Assume yfi is the DataFrame you want to remove unnamed columns from
yfi = yfi.loc[:, ~yfi.columns.str.startswith('Unnamed:')]

In [19]:
yfi.head()

Unnamed: 0,AuthorID,Author,Date_original,Date,Content,Attachments,Reactions
0,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-0-yfi-sup...,,
1,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-1-yfi-fee...,,
2,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-5-reducin...,,
3,6.46663e+17,Cr3ateD#0001,2020/7/20 ä¸å12:27,2020/7/20,"should we, um. fund an audit?",,ð¯ (2)
4,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:27,2020/7/20,i would vote for that,,


In [20]:
yfi.tail()

Unnamed: 0,AuthorID,Author,Date_original,Date,Content,Attachments,Reactions
13618,1.12535e+17,j_._,2023/9/12 ä¸å12:42,2023/9/12,Why? Discourse is fine,,
13619,4.15472e+17,.drose01,2023/9/18 ä¸å8:29,2023/9/18,Does yearn finance not develop mobile apps?,,
13620,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,nope. should be responsive on mobile anyway so...,,
13621,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,"also, wrong channel to ask this",,
13622,4.15472e+17,.drose01,2023/9/18 ä¸å10:07,2023/9/18,I see.,,


# Text Processing

In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download stopwords and stemmer
nltk.download('stopwords')
nltk.download('punkt')

# Define the preprocessing function
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Check if text is NaN
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stop words and punctuation
    words = [word for word in words if word not in stop_words and word.isalpha()]

    # Join the remaining words into a string
    text = ' '.join(words)

    return text

# Apply the preprocessing function to the 'Content' column
yfi['Preprocessed'] = yfi['Content'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
yfi.tail()

Unnamed: 0,AuthorID,Author,Date_original,Date,Content,Attachments,Reactions,Preprocessed
13618,1.12535e+17,j_._,2023/9/12 ä¸å12:42,2023/9/12,Why? Discourse is fine,,,discourse fine
13619,4.15472e+17,.drose01,2023/9/18 ä¸å8:29,2023/9/18,Does yearn finance not develop mobile apps?,,,yearn finance develop mobile apps
13620,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,nope. should be responsive on mobile anyway so...,,,nope responsive mobile anyway theres need dedi...
13621,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,"also, wrong channel to ask this",,,also wrong channel ask
13622,4.15472e+17,.drose01,2023/9/18 ä¸å10:07,2023/9/18,I see.,,,see


In [23]:
# Save the cleaned DataFrame to a new CSV file
yfi.to_csv('cleaned_discord_yfi_governance.csv', index=False)

# Sentiment Analysis (VADER)

In [24]:
! pip install vaderSentiment



In [25]:
yfi.dropna(subset=['Date'], inplace=True)

In [26]:
yfi.tail()

Unnamed: 0,AuthorID,Author,Date_original,Date,Content,Attachments,Reactions,Preprocessed
13618,1.12535e+17,j_._,2023/9/12 ä¸å12:42,2023/9/12,Why? Discourse is fine,,,discourse fine
13619,4.15472e+17,.drose01,2023/9/18 ä¸å8:29,2023/9/18,Does yearn finance not develop mobile apps?,,,yearn finance develop mobile apps
13620,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,nope. should be responsive on mobile anyway so...,,,nope responsive mobile anyway theres need dedi...
13621,1.12535e+17,j_._,2023/9/18 ä¸å8:44,2023/9/18,"also, wrong channel to ask this",,,also wrong channel ask
13622,4.15472e+17,.drose01,2023/9/18 ä¸å10:07,2023/9/18,I see.,,,see


In [27]:
YFI = yfi
YFI.head()

Unnamed: 0,AuthorID,Author,Date_original,Date,Content,Attachments,Reactions,Preprocessed
0,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-0-yfi-sup...,,,
1,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-1-yfi-fee...,,,
2,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:26,2020/7/20,https://gov.yearn.finance/t/proposal-5-reducin...,,,
3,6.46663e+17,Cr3ateD#0001,2020/7/20 ä¸å12:27,2020/7/20,"should we, um. fund an audit?",,ð¯ (2),um fund audit
4,1.77762e+17,mrdunkirk,2020/7/20 ä¸å12:27,2020/7/20,i would vote for that,,,would vote


In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Convert the 'Date' column to a datetime object and set it as the index

YFI['Date'] = pd.to_datetime(YFI['Date'], errors='coerce')
YFI.set_index('Date', inplace=True)

# Define a function to compute the sentiment score for each day
def daily_sentiment_score(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

# Compute the daily sentiment score using resample and apply
yfi_daily = YFI['Preprocessed'].resample('D').apply(lambda x: x.apply(daily_sentiment_score).mean())


In [29]:
yfi_daily

Date
2020-07-20    0.104253
2020-07-21    0.104855
2020-07-22    0.151472
2020-07-23    0.149307
2020-07-24    0.132579
                ...   
2023-09-14         NaN
2023-09-15         NaN
2023-09-16         NaN
2023-09-17         NaN
2023-09-18   -0.063175
Freq: D, Name: Preprocessed, Length: 1156, dtype: float64

In [30]:
yfi_daily_sentiment = yfi_daily.to_frame(name='Sentiment Score').reset_index()

In [31]:
yfi_daily_sentiment.columns

Index(['Date', 'Sentiment Score'], dtype='object')

In [32]:
yfi_daily_sentiment.head()

Unnamed: 0,Date,Sentiment Score
0,2020-07-20,0.104253
1,2020-07-21,0.104855
2,2020-07-22,0.151472
3,2020-07-23,0.149307
4,2020-07-24,0.132579


In [33]:
yfi_daily_sentiment['Sentiment Score'].fillna(0.000000, inplace=True)

In [34]:
yfi_daily_sentiment

Unnamed: 0,Date,Sentiment Score
0,2020-07-20,0.104253
1,2020-07-21,0.104855
2,2020-07-22,0.151472
3,2020-07-23,0.149307
4,2020-07-24,0.132579
...,...,...
1151,2023-09-14,0.000000
1152,2023-09-15,0.000000
1153,2023-09-16,0.000000
1154,2023-09-17,0.000000


In [35]:
yfi_daily_sentiment.to_csv('sentiment_discord_yfi_governance.csv', index=False)

In [36]:
import plotly.graph_objs as go

# Create a line graph using Plotly
fig = go.Figure()

# Add a line trace to the figure
fig.add_trace(go.Scatter(x=yfi_daily_sentiment['Date'], y=yfi_daily_sentiment['Sentiment Score'], mode='lines'))

# Customize the layout of the figure
fig.update_layout(title='yearn.finance  Discord Daily Sentiment Score', xaxis_title='Date', yaxis_title='Sentiment Score')

# Display the figure
fig.show()


In [37]:
positive_count = len(yfi_daily_sentiment[yfi_daily_sentiment['Sentiment Score'] > 0])
negative_count = len(yfi_daily_sentiment[yfi_daily_sentiment['Sentiment Score'] < 0])


In [38]:
import plotly.express as px

# Calculate positive and negative sentiment counts
positive_count = (yfi_daily_sentiment['Sentiment Score'] > 0).sum()
negative_count = (yfi_daily_sentiment['Sentiment Score'] < 0).sum()

# Create a dataframe for histogram
hist_yfi = pd.DataFrame({
    'Sentiment': ['Positive', 'Negative'],
    'Count': [positive_count, negative_count]
})

# Create a plotly histogram
fig = px.histogram(hist_yfi, x='Sentiment', y='Count', color='Sentiment',
                   title='yearn.finance  Discord Sentiment Distribution')

# Set y-axis label
fig.update_layout(yaxis_title='Count (days)')

# Add count number on each bar
fig.update_traces(texttemplate='%{y}', textposition='outside')

fig.show()


# Discord Volume

In [39]:
# Group the discord messages by day and count the number of discord messages in each day
daily_discord_count = YFI['Content'].resample('D').count()

# Print the daily discord message count
print(daily_discord_count)

Date
2020-07-20     934
2020-07-21    1092
2020-07-22    1727
2020-07-23     504
2020-07-24     573
              ... 
2023-09-14       0
2023-09-15       0
2023-09-16       0
2023-09-17       0
2023-09-18       4
Freq: D, Name: Content, Length: 1156, dtype: int64


In [40]:
daily_discord_count = daily_discord_count.to_frame(name='Discord Volume').reset_index()

In [41]:
daily_discord_count

Unnamed: 0,Date,Discord Volume
0,2020-07-20,934
1,2020-07-21,1092
2,2020-07-22,1727
3,2020-07-23,504
4,2020-07-24,573
...,...,...
1151,2023-09-14,0
1152,2023-09-15,0
1153,2023-09-16,0
1154,2023-09-17,0


In [42]:
daily_discord_count.to_csv('discord_volume_yfi_governance.csv', index=False)

In [43]:
import plotly.graph_objs as go

# Create a line graph using Plotly
fig = go.Figure()

# Add a line trace to the figure
fig.add_trace(go.Scatter(x=daily_discord_count['Date'], y=daily_discord_count['Discord Volume'], mode='lines'))

# Customize the layout of the figure
fig.update_layout(title='yearn.finance  Discord Daily Message Volume', xaxis_title='Date', yaxis_title='Discord Message Volume')

# Display the figure
fig.show()
