In [111]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
# Download the tweet dataset from data.world into a DataFrame (requires network access).
df = pd.read_csv('https://query.data.world/s/5xzuftmozqteqbqzopbcgxbjzusyi7')

In [112]:
def split_train_test(df, test_size=0.4, random_state=42):
    """Split ``df`` row-wise into a training set and a test set.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    test_size : float or int, default 0.4
        Forwarded to ``train_test_split``; the default keeps the 60/40 split
        this notebook has always used.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the shuffle is reproducible.

    Returns
    -------
    tuple
        ``(train_set, test_set)``.
    """
    train_set, test_set = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return train_set, test_set

In [113]:
def remove_pattern(input_text, pattern):
    """Return ``input_text`` with every match of ``pattern`` removed.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str or compiled regular expression
        Pattern whose matches are deleted.

    Returns
    -------
    str
        ``input_text`` with all non-overlapping matches replaced by ``''``.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex would (a) raise or mis-match when the matched text contains
    # regex metacharacters such as "(" or "+", and (b) leave residue when one
    # match is a prefix of a later one (e.g. "@a" eating the start of "@ab").
    return re.sub(pattern, '', input_text)

In [114]:
# Compiled character class used to strip emoji and other pictographic symbols.
# NOTE(review): two of the ranges are far broader than "emoji":
#   * U+24C2-U+1F251 spans CJK, Hangul and most other BMP blocks above U+24C2;
#   * U+10000-U+10FFFF is every supplementary-plane code point.
# They are kept so the set of matched characters is unchanged; most of the
# narrower ranges below are already contained in one of them.
emoji = re.compile("["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
        "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # very broad: enclosed alphanumerics up to enclosed ideographs
        "\U0001f926-\U0001f937"  # gesture emoji (face palm .. shrug)
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"  # female / male signs
        "\u2600-\u2B55"  # misc. symbols through heavy large circle
        "\u200d"  # zero width joiner
        "\u23cf"  # eject symbol
        "\u23e9"  # fast-forward symbol
        "\u231a"  # watch
        "\ufe0f"  # variation selector-16
        "\u3030"  # wavy dash
                      "]+", re.UNICODE)

In [115]:
def preprocess(df):
    """Clean the tweet text in ``df['content']`` into a new ``processed`` column.

    Steps, in order:

    1. Split ``df`` with ``split_train_test`` and concatenate the two parts
       back together (train rows first, then test rows) with a fresh index.
    2. Remove ``@mentions``, ``#`` characters and ``emoji`` matches via
       ``remove_pattern``.
    3. Replace every character that is not an ASCII letter or ``#`` with a space.
    4. Drop words of two characters or fewer and re-join with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``content`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The re-ordered rows of ``df`` plus the ``processed`` column.
    """
    train_set, test_set = split_train_test(df)
    # Re-combine the two parts so both are cleaned identically.
    combi = pd.concat([train_set, test_set], ignore_index=True)

    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    processed = combi['content'].map(lambda text: remove_pattern(text, r"@[\w]*"))

    # Strip the '#' marker but keep the hashtag word itself.
    processed = processed.map(lambda text: remove_pattern(text, "#"))

    # Strip emoji / pictographic symbols.
    processed = processed.map(lambda text: remove_pattern(text, emoji))

    # Replace special characters, digits and punctuation with spaces.
    # NOTE(review): URLs are not removed beforehand, so fragments such as
    # "https" survive this step.
    processed = processed.str.replace("[^a-zA-Z#]", " ", regex=True)

    # Keep only words longer than two characters.
    combi['processed'] = processed.map(
        lambda text: ' '.join(word for word in text.split() if len(word) > 2)
    )

    return combi

In [91]:
def drop_sentiment(combi):
    """Return ``combi`` without the rows labelled 'empty', 'enthusiasm' or 'fun'.

    The input frame is not modified; rows are dropped by index label.
    """
    unwanted = combi['sentiment'].isin(['empty', 'enthusiasm', 'fun'])
    return combi.drop(combi.loc[unwanted].index)

In [92]:
#load csv file
# NOTE: this reads the same URL as the read_csv call in the first cell, so the
# dataset is downloaded a second time and `df` is rebound to a fresh copy.
df = pd.read_csv('https://query.data.world/s/5xzuftmozqteqbqzopbcgxbjzusyi7')

In [103]:
# Clean the tweet text, then discard the sentiment classes excluded by drop_sentiment.
df_processed = df.pipe(preprocess).pipe(drop_sentiment)
df_processed

Unnamed: 0,tweet_id,sentiment,author,content,processed
0,1694088996,love,whyamievenhere,@zoeatthedisco lol hell yes i'm keen. WE'RE GO...,lol hell yes keen GOING SKIING TREBLE CONE SOM...
1,1751715234,love,theamericanxp,Feeling special @ looking4him first guy to giv...,Feeling special looking him first guy give flo...
3,1694272881,neutral,akane_takamura,?and make your own pledge while you're at it!,and make your own pledge while you
4,1753094560,worry,CVJason,@mosdefaqueen My pleasure. I can't be with my ...,pleasure can with mom Mother Day But can sprea...
5,1961438666,worry,santoleto,"6:29 pm - ok, let's go now through #bowman #st...",let now through bowman strategicClock but firs...
...,...,...,...,...,...
39995,1957212416,neutral,summatusmentis,"really, realy want a netbook. mrr. Techno-lust...",really realy want netbook mrr Techno lust sucks
39996,1751921888,happiness,GaByDiAz,@ Butlers watching Dr. Farmer rock out w/ the ...,Butlers watching Farmer rock out the hispanic ...
39997,1695220613,surprise,jesssicababesss,"@iyaitssuzanne ohh yeh , but he was on sexy me...",ohh yeh but was sexy men its okay
39998,1963521553,worry,jerrybruno,Just found about ten typos in one story in the...,Just found about ten typos one story the Plain...


In [104]:
#Distinct sentiment labels, in order of first appearance
senti_list = df_processed['sentiment'].drop_duplicates().tolist()
senti_list

['love',
 'neutral',
 'worry',
 'happiness',
 'hate',
 'sadness',
 'surprise',
 'relief',
 'boredom',
 'anger']

In [105]:
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads in word_token.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\11580\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [106]:
def word_token(df_processed, sentiment):
    """Fit a bag-of-words vocabulary and count word frequencies for one sentiment.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame with a ``sentiment`` column and a cleaned-text ``processed`` column.
    sentiment : str
        Sentiment label whose tweets are analysed.

    Returns
    -------
    cv : sklearn.feature_extraction.text.CountVectorizer
        Vectorizer fitted on the lower-cased tweets of that sentiment.
    list of (str, int)
        ``(word, count)`` pairs over the non-stop-word tokens, sorted by
        count in descending order.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk import FreqDist

    stop = set(stopwords.words('english'))
    selected = df_processed.loc[df_processed['sentiment'] == sentiment, 'processed']

    text = [tweet.lower() for tweet in selected]
    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # checking the raw token let capitalised stop words ("The", "And") through.
    # split() without an argument also avoids counting an empty token for
    # tweets whose processed text is the empty string.
    freq_text = [word for tweet in text for word in tweet.split() if word not in stop]

    cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
    cv.fit(text)

    fdist = FreqDist(freq_text)
    return cv, sorted(fdist.items(), key=lambda kv: kv[1], reverse=True)

In [107]:
#fit one vectorizer and one frequency list per sentiment label
(
    (cv_love, fdist_love),
    (cv_neutral, fdist_neutral),
    (cv_worry, fdist_worry),
    (cv_happy, fdist_happy),
    (cv_hate, fdist_hate),
    (cv_sadness, fdist_sadness),
    (cv_surprise, fdist_surprise),
    (cv_relief, fdist_relief),
    (cv_born, fdist_born),
    (cv_anger, fdist_anger),
) = [
    word_token(df_processed, label)
    for label in (
        'love', 'neutral', 'worry', 'happiness', 'hate',
        'sadness', 'surprise', 'relief', 'boredom', 'anger',
    )
]

In [108]:
import os

import tweepy
from pandas import Series,DataFrame
import pandas as pd

# Read the bearer token from the environment instead of embedding it in the
# notebook: a token committed in source is exposed to everyone who can read
# the file and should be revoked and re-issued.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
if not bearer_token:
    raise RuntimeError('Set the TWITTER_BEARER_TOKEN environment variable before running this cell.')
client = tweepy.Client(bearer_token=bearer_token)

# Search recent tweets that mention "covid".
# -is:retweet excludes retweets.
# NOTE(review): the query has no `lang:` operator, so results are not limited
# to English; append `lang:en` if only English tweets are wanted.
query = 'covid -is:retweet'
tweets = client.search_recent_tweets(query = query, tweet_fields=['context_annotations', 'created_at'], max_results=100)

# `tweets.data` is None when the search returns no tweets; fall back to an
# empty list so the cell still yields an (empty) frame instead of raising.
tweet_lst = [tweet.text for tweet in (tweets.data or [])]

data = {'content': tweet_lst}
tweet_df = DataFrame(data)

    

In [109]:
# Display the raw text of the fetched tweets.
tweet_df

Unnamed: 0,content
0,"A grande mídia está inventando um ""suposto"" su..."
1,@poetWOAgun How much proof does America AND th...
2,Wujudkan Masyarakat Yang Sehat dan Produktif D...
3,https://t.co/2vo3iF4r2h The Unforgivable Reque...
4,@MariaBiddle9 @LeahLschkupz @BoSnerdley Well t...
...,...
94,@DollyD2022 The covid Vax causes spires in the...
95,✅ Moderate Republican \n✅ Female\n✅ White\n✅ P...
96,@Den_1er Ni pour Poutine ni pour Zelensky et i...
97,@CovergirlChase @frandrescher @RockmondDunbar ...


In [116]:
#run the fetched tweets through the preprocess cleaning function
tweet_df_processed = tweet_df.pipe(preprocess)
tweet_df_processed

Unnamed: 0,content,processed
0,@abcnews Without MSM including ABC 24/7 spewin...,Without MSM including ABC spewing propaganda a...
1,https://t.co/hbMth4sgVT,https hbMth sgVT
2,Erectile Dysfunction Almost 6x More Likely Aft...,Erectile Dysfunction Almost More Likely After ...
3,#NeverForget #NoAmnesty for what they did duri...,NeverForget NoAmnesty for what they did during...
4,https://t.co/9bpfvWMRiZ Susah bernafas serta d...,https bpfvWMRiZ Susah bernafas serta dada tera...
...,...,...
94,Production Hit At World’s Largest iPhone Facto...,Production Hit World Largest iPhone Factory Af...
95,@JordanSchachtel ALL official truths are false...,ALL official truths are false Moon landing Ken...
96,@AquiAnitters @dayligharcher @felipeneto Aí o ...,risco cada pessoa vai avaliar hora fazer meme ...
97,"""Unhinged hysteria"" is the key. Think COVID di...",Unhinged hysteria the key Think COVID discours...
