In [1]:
!pip install vaderSentiment -q
!pip install praw -q
!pip install transformers -q

[K     |████████████████████████████████| 125 kB 6.6 MB/s 
[K     |████████████████████████████████| 188 kB 5.4 MB/s 
[K     |████████████████████████████████| 54 kB 1.5 MB/s 
[K     |████████████████████████████████| 4.7 MB 7.9 MB/s 
[K     |████████████████████████████████| 101 kB 9.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 33.3 MB/s 
[K     |████████████████████████████████| 596 kB 51.1 MB/s 
[?25h

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 300)
import praw #reddit data api
#import ffn #for loading financial data
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
import re #regex
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer #VADER sentiment model
import requests
import json
import csv
import time
import datetime

In [3]:
import tensorflow as tf
from tensorflow import keras
tf.test.gpu_device_name() #run to make sure tensorflow is connected to gpu (if applicable)

''

# positive/negative sentiment analysis using VADER

## get comments from reddit using pushshift and praw

In [4]:
#function to get data from pushshift api
def getPushshiftData(query, after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?size=1000&after='+str(after)+'&before='+str(before)+'&subreddit='+str(sub)#+'&sort_type=score&sort=desc'
    print(url)
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']

#get relevant data from data extracted using previous function
def collectSubData(subm):
    subData = [subm['id'], subm['title'], subm['url'], datetime.datetime.fromtimestamp(subm['created_utc']).date()]
    subStats.append(subData)

In [5]:
#Subreddit to query
sub='ubisoft'
#before and after dates
before = "1660592772" #july 10 2020
after = "1483228800" #july 1 2017
#query string
#query = "Daily Discussion Thread"
query = ""
subCount = 0
subStats = []

In [6]:
data = getPushshiftData(query, after, before, sub)
# Will run until all posts have been gathered 
# from the 'after' date up until before date
while len(data) > 0:
    for submission in data:
        collectSubData(submission)
        subCount+=1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = getPushshiftData(query, after, before, sub)

https://api.pushshift.io/reddit/search/submission/?size=1000&after=1483228800&before=1660592772&subreddit=ubisoft
250
2017-02-25 11:08:07
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1488020887&before=1660592772&subreddit=ubisoft
250
2017-06-12 22:55:44
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1497308144&before=1660592772&subreddit=ubisoft
250
2017-10-24 20:06:38
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1508875598&before=1660592772&subreddit=ubisoft
250
2018-01-24 15:42:52
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1516808572&before=1660592772&subreddit=ubisoft
250
2018-04-07 15:32:47
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1523115167&before=1660592772&subreddit=ubisoft
248
2019-06-10 00:22:28
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1560126148&before=1660592772&subreddit=ubisoft
250
2019-07-07 12:50:43
https://api.pushshift.io/reddit/se

In [7]:
subStats

[['5lfgrv',
  'Is Steep Good? Should You Buy Steep?!? Is It Boring? My First Ubisoft Review!',
  'https://www.youtube.com/watch?v=pPIFW45gNrI',
  datetime.date(2017, 1, 1)],
 ['5libn5',
  'Uplay updates download unforgivably slow',
  'https://www.reddit.com/r/ubisoft/comments/5libn5/uplay_updates_download_unforgivably_slow/',
  datetime.date(2017, 1, 2)],
 ['5lk54n',
  'Ubisoft chat support is the worst',
  'https://www.reddit.com/r/ubisoft/comments/5lk54n/ubisoft_chat_support_is_the_worst/',
  datetime.date(2017, 1, 2)],
 ['5lmv0b',
  'transcript from the most useless chat support ever. who do i go to about this?',
  'https://www.reddit.com/r/ubisoft/comments/5lmv0b/transcript_from_the_most_useless_chat_support/',
  datetime.date(2017, 1, 2)],
 ['5lmx2q',
  'how do i get in touch to file a complaint about a chat rep',
  'https://www.reddit.com/r/ubisoft/comments/5lmx2q/how_do_i_get_in_touch_to_file_a_complaint_about_a/',
  datetime.date(2017, 1, 2)],
 ['5lrwhk',
  'Only in a Ubi game'

In [8]:
#organize data into dataframe
data={}
ids=[]
titles=[]
urls=[]
dates=[]
flairs=[]
for stat in subStats:
    ids.append(stat[0])
    titles.append(stat[1])
    urls.append(stat[2])
    dates.append(stat[3])
    #flairs.append(stat[4])
data['id']=ids
data['title']=titles
data['url']=urls
data['date']=dates
#data['flair']=flairs
df_1=pd.DataFrame(data)

#df_1=df_1[df_1['flair']=='Daily Discussion']

In [9]:
analyser = SentimentIntensityAnalyzer()

scores=[]
for index, row in df_1.iterrows():
    sentiment_score=0
    try:
        sentiment_score = analyser.polarity_scores(row['title'])['compound']
    except TypeError:
        sentiment_score=0
    
    scores.append(sentiment_score)

df_1['title_vader_compound'] = scores
print(scores)

[0.4843, 0.0, -0.34, -0.0334, -0.296, 0.0, 0.0, -0.4767, -0.34, 0.0, 0.4404, -0.4023, 0.5859, 0.0, 0.0, 0.0, -0.6517, 0.5423, -0.4939, 0.0, 0.0, 0.6166, 0.0, 0.0, -0.4215, -0.6808, 0.4939, 0.4939, 0.0, 0.8212, 0.5106, 0.5202, 0.4019, 0.0, -0.3182, 0.4807, -0.0601, -0.4576, 0.0, -0.4019, 0.0, -0.4939, 0.0, 0.2746, 0.0, 0.4939, -0.5848, -0.5719, 0.25, -0.25, -0.4588, -0.2023, 0.0, 0.0, 0.0, -0.4019, 0.0, -0.4019, 0.0, 0.0, -0.2235, 0.69, 0.7003, 0.0, 0.0, 0.4939, 0.4939, 0.6369, 0.0, 0.0, 0.0, 0.4939, 0.0, 0.0, 0.0, 0.8074, 0.8419, -0.4939, 0.6369, 0.0, 0.0, 0.2075, 0.128, 0.4215, 0.0, 0.4939, 0.0, 0.5106, 0.0, 0.5562, 0.2023, 0.7177, 0.4019, -0.2144, 0.0, 0.0, -0.4939, 0.0, -0.0772, 0.4767, 0.4939, 0.3182, -0.4019, 0.1111, 0.4939, -0.3182, 0.5411, -0.3612, 0.0, -0.3182, -0.4019, 0.3802, -0.3612, 0.0, 0.0, 0.3182, 0.3182, 0.3612, 0.0, 0.0, -0.4404, 0.6588, -0.2263, 0.0, 0.0, -0.3476, 0.0, 0.0, -0.6486, -0.4019, -0.6369, 0.0, -0.3182, 0.0, 0.4588, 0.0, 0.0, 0.0, -0.765, 0.5423, 0.0, 0.0, 

In [10]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

Downloading config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [11]:
from tqdm import tqdm

In [12]:
def calc_scores(text):
  #text = "Good night 😊"
  #text = preprocess(text)
  encoded_input = tokenizer(text, return_tensors='pt')
  output = model(**encoded_input)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  return scores

negatives = []
neutrals = []
positives = []

for index, row in tqdm(df_1.iterrows()):
  scores = calc_scores(row['title'])
  negatives.append(scores[0])
  neutrals.append(scores[1])
  positives.append(scores[2])

df_1['title_roberta_neg'] = negatives
df_1['title_roberta_neu'] = neutrals
df_1['title_roberta_pos'] = positives

18710it [35:52,  8.69it/s]


In [13]:
df_1

Unnamed: 0,id,title,url,date,title_vader_compound,title_roberta_neg,title_roberta_neu,title_roberta_pos
0,5lfgrv,Is Steep Good? Should You Buy Steep?!? Is It B...,https://www.youtube.com/watch?v=pPIFW45gNrI,2017-01-01,0.4843,0.241092,0.644713,0.114195
1,5libn5,Uplay updates download unforgivably slow,https://www.reddit.com/r/ubisoft/comments/5lib...,2017-01-02,0.0000,0.844589,0.136550,0.018861
2,5lk54n,Ubisoft chat support is the worst,https://www.reddit.com/r/ubisoft/comments/5lk5...,2017-01-02,-0.3400,0.979450,0.017870,0.002681
3,5lmv0b,transcript from the most useless chat support ...,https://www.reddit.com/r/ubisoft/comments/5lmv...,2017-01-02,-0.0334,0.964341,0.032044,0.003615
4,5lmx2q,how do i get in touch to file a complaint abou...,https://www.reddit.com/r/ubisoft/comments/5lmx...,2017-01-02,-0.2960,0.321028,0.664232,0.014740
...,...,...,...,...,...,...,...,...
18705,woxrtm,I brought the season pass for watchdogs and go...,https://www.reddit.com/r/ubisoft/comments/woxr...,2022-08-15,0.4215,0.301913,0.633674,0.064413
18706,woxxbn,AC Valhalla Gae Bolg Boss Glitch,https://i.redd.it/kgymez26bvh91.jpg,2022-08-15,0.0000,0.153727,0.812665,0.033608
18707,woyn96,Can someone help me I brought the season pass ...,https://www.reddit.com/r/ubisoft/comments/woyn...,2022-08-15,0.5574,0.458893,0.488142,0.052965
18708,wp0m6y,"20% Code, i got nothing to spend it, so…",https://i.redd.it/4zr7t2m4xvh91.jpg,2022-08-15,0.0000,0.351681,0.590251,0.058069


In [14]:
df_2 = df_1.groupby(by='date', axis=0).mean()
df_2 = df_2[['title_vader_compound', 'title_roberta_neg', 'title_roberta_neu', 'title_roberta_pos']]
df_2

Unnamed: 0_level_0,title_vader_compound,title_roberta_neg,title_roberta_neu,title_roberta_pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,0.484300,0.241092,0.644713,0.114195
2017-01-02,-0.167350,0.777352,0.212674,0.009974
2017-01-03,-0.158900,0.352967,0.593894,0.053139
2017-01-04,-0.170000,0.455775,0.529359,0.014867
2017-01-05,0.440400,0.090196,0.465504,0.444300
...,...,...,...,...
2022-08-11,-0.059487,0.462191,0.369517,0.168292
2022-08-12,-0.068233,0.304001,0.616076,0.079923
2022-08-13,-0.105375,0.472780,0.505323,0.021897
2022-08-14,0.059750,0.305074,0.578955,0.115971


In [15]:
# Traiter Social jour + 1

df_2 = df_2.rename_axis('date').reset_index(level=0)

date_1 = []

for i in range(0, df_2.shape[0]-1):
    # get row contents as series using iloc{]
    # and index position of row
    date_1.append(df_2.iloc[i+1]['date'])

date_1.append(np.nan)

df_2['date_1'] = date_1
df_2["date_1"] = pd.to_datetime(df_2["date_1"], format="%Y-%m-%d")

In [16]:
df_2.to_csv('ubisoft_reddit_sentiment_analysis.csv', index=False)