In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

ValueError: Value must have type '<class 'int'>'

In [None]:
# It requires python 3.8 or higher
!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git

In [None]:
# It spent 3 hours for scraping more than 180,000 tweets
text_query = "$NFLX"
since_date = "2018-01-01"
until_date = "2022-07-11"
os.system('snscrape --jsonl --since {} twitter-search "{} until:{}"> text-query-tweets.json'.format(since_date, text_query, until_date))

In [None]:
# Reading jason files as dataframes
tweets_df = pd.read_json('text-query-tweets.json', lines=True)
# tweets_df2 = pd.read_json('text-query-tweets2.json', lines=True)
# Merging both dataframes as a single dataframe
# tweets_df = pd.concat([tweets_df1,tweets_df2],ignore_index=True)
# Selecting the important columns only wich are Data,renderContent and Lang
tweets_content = tweets_df.loc[:,['date','renderedContent','lang']]
# Choosing the tweets in english language only
tweets_content = tweets_content[tweets_content['lang']=='en']
# Dropping the lang column
tweets_content.drop("lang",axis=1,inplace=True)
# Download the CSV file result on the current folder.
tweets_content.to_csv('Ntweets.csv',index=False)

In [None]:
!pip install demoji
import demoji
import re
import string
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
import attr
import nltk

In [None]:
# This function is used to pass the POS tage for each word passed through clean_text function
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
# Cleaning tweets
def clean_text(text):
    # Initialization the twitter tokenizer
    tk = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True) 
    # Initialization the lemmatizer
    lemmatizer = WordNetLemmatizer()  
    # Trying to avoid deleting the negative verbs as it affects the meaning of the tweets.
    stop_words = stopwords.words('english') + ["i'll","i'm", "should", "could"]
    negative_verbs = [ "shan't",'shouldn',"shouldn't",'wasn','weren','won','wouldn','aren','couldn','didn','doesn','hadn','hasn','haven','isn','ma','mightn','mustn',"mustn't",'needn',"needn't","wouldn't","won't","weren't","wasn't","couldn","not","nor","no","mightn't","isn't","haven't","hadn't","hasn't","didn't","doesn't","aren't","don't","couldn't","never"]
    stop_words =[word for word in stop_words if word not in negative_verbs ] 
    
    # Lowering tweets
    lower_tweet = text.lower() 
    # Removing hashtag and cashtag symbols
    tweet = re.sub(r"[#$]"," ",lower_tweet)
    # Removing links from tweets
    tweet = re.sub(r"https?:\/\/.*[\r\n]*"," ", tweet)
    # Translating emojies into thier descriptions
    tweet = demoji.replace_with_desc(tweet)
    # removing numerical values
    tweet = re.sub(r"[0-9]|-->","",tweet)
    # Tokenize the tweets by twitter tokenzier.
    tweet = tk.tokenize(tweet)
    # Choosing the words that don't exist in stopwords, thier lengths are more than 2 letters and then lemmatize them.
    tweet = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tweet if word not in stop_words and word not in string.punctuation and len(word)>2 and "." not in word]
    # return the tokens in one sentence 
    tweet = " ".join(tweet)
    
    return tweet

In [None]:
# Read the scrapped file
tweets = pd.read_csv("Ntweets.csv")
tweets.head()

In [None]:
# Applying text cleaning and then downloading it on the current folder
tweets['cleaned'] = tweets["renderedContent"].apply(lambda row:clean_text(row))
tweets.to_csv("CleanedNTweets.csv",index=False)

In [None]:
!pip install transformers
!pip install transformers[sentencepiece]

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [None]:
tweets = pd.read_csv("CleanedNTweets.csv")
tweets.head()

In [None]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [None]:
def polarity(text):
    encoded_input = tokenizer(text, return_tensors='tf')
    output = model(encoded_input)
    scores = output[0][0].numpy()
    scores = softmax(scores)
    
    # Print labels and scores
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    l = config.id2label[ranking[0]]
    plrty = -1 if l == "Negative" else 1 if l == "Positive" else 0 
    s = np.round(float(scores[ranking[0]]), 4)
    return (l,plrty)

In [None]:
# downloading the file after applying sentiment analysis on the current folder
tweets['label'],tweets['Polarity'] = zip(*tweets['cleaned'].apply(lambda txt:polarity(txt)))
tweets.to_csv("polarizedTweets.csv",index=False)

In [None]:
ptweets = pd.read_csv("polarizedTweets.csv")
ptweets.head()

In [None]:
# Extracting the date and polarized values from the previous dataframe
ptweets_df = ptweets.loc[:,["date","Polarity"]]
ptweets_df.head()

In [None]:
# Change the date format to match with the next csv file date format
ptweets_df['date'] =pd.to_datetime(ptweets_df['date'],infer_datetime_format=True)
ptweets_df['date'] =pd.to_datetime(ptweets_df['date'].dt.strftime("%m/%d/%y"))

# Aggregate the tweets polarization by avergae, sum and counts 
Pol_df = pd.DataFrame(ptweets_df.groupby('date')['Polarity'].mean())
Pol_df.rename(columns={"Polarity":"P_mean"},inplace=True)
Pol_df['P_sum'] = ptweets_df.groupby('date')['Polarity'].sum()
Pol_df['twt_count'] = ptweets_df.groupby('date')['Polarity'].count()
Pol_df.head()

In [None]:
# Reading the netflix finance data and preparing it to fit with the polarized values
nflx_df = pd.read_csv("NFLX.csv")
nflx_df.rename(columns={"Date":"date"},inplace=True)
nflx_df['date'] = pd.to_datetime(nflx_df['date'],infer_datetime_format=True)
nflx_df.set_index("date")
# Adding the polarization column in the netflix dataframe.
final_df = nflx_df.join(Pol_df,on='date',how="inner")
final_df.head()

In [None]:
# Downloading the final CSV file that has the finance data and tweets polarizations
final_df.to_csv("FinalNflx.csv",index=False)

In [None]:
df_2018=pd.read_csv('nflx2018-2020.csv')
df_2020=pd.read_csv('nflx2020-2022.csv')
df_all=pd.concat([df_2018,df_2020])
df_all=df_all.reset_index()
df_all.drop('index',inplace=True,axis=1)
df_all

In [None]:
df_all.to_csv('Final_nflx_data_2018-2022',index=False)

In [None]:
import pandas as pd
sss=pd.read_csv('CleanedNTweets.csv')
sss.tail()

In [None]:
import pandas as pd
sss=pd.read_csv('NFLX.csv')
sss.head()

In [None]:
import pandas as pd
sss=pd.read_csv('Final_nflx_data_2018-2022')
sss.head()