In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import nltk
import os

# Ensure the per-user NLTK data directory exists and is on NLTK's search path.
nltk_data_dir = os.path.expanduser("~/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Package id -> category folder it lives under inside an nltk_data directory.
NLTK_RESOURCES = {
    'punkt': 'tokenizers',
    'averaged_perceptron_tagger': 'taggers',
    'wordnet': 'corpora',
    'stopwords': 'corpora',
    'omw-1.4': 'corpora',
}


def _nltk_resource_installed(category, pkg):
    """Return True when `pkg` can be located under `category`.

    Both the plain name and the `.zip` name are tried: a package that is only
    present as an archive is not found through the plain name, which makes the
    cell download it again on every run.
    """
    for candidate in (f"{category}/{pkg}", f"{category}/{pkg}.zip"):
        try:
            nltk.data.find(candidate)
            return True
        except LookupError:
            continue
    return False


# Download only what is missing, into the directory registered above.
for pkg, category in NLTK_RESOURCES.items():
    if not _nltk_resource_installed(category, pkg):
        nltk.download(pkg, download_dir=nltk_data_dir)


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the pre-cleaned tweets and show how many rows were read.
# (Only the last expression of a cell is rendered, so no preview is computed here.)
df = pd.read_csv('clean_datasets/clean_all.csv')
len(df)

77321

In [3]:
#tokenization
def tweet_token(text):
    """Split every entry of a pandas Series on whitespace.

    Args:
        text: pandas Series of strings (it is accessed through ``.str``).

    Returns:
        A plain Python list with one element per row, each element being the
        result of ``str.split()`` for that row.
    """
    return list(text.str.split())

# Attach each tweet's whitespace tokens as a new 'tokens' column, then preview the first rows.
df['tokens']=tweet_token(df['clean_tweet'])
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]"
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c..."


In [4]:
# Lemmatize every cleaned tweet (no stemming is performed in this cell).
import nltk
word_tokenizer = nltk.tokenize.WhitespaceTokenizer()
word_lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Whitespace-tokenize `text` and lemmatize each token.

    The lemmatizer is called without a POS argument, so its default part of
    speech applies to every token.

    Args:
        text: the cleaned tweet string.

    Returns:
        List of lemmas, in token order.
    """
    lemmas = []
    for token in word_tokenizer.tokenize(text):
        lemmas.append(word_lemmatizer.lemmatize(token))
    return lemmas

df['lemma'] = df['clean_tweet'].apply(lemmatize_text)
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]"
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov..."


In [5]:
# Re-join each row's lemma list into one space-separated sentence.
df['lemma_sentence'] = df['lemma'].apply(' '.join)
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...


In [6]:
#POS for clean tweets
#reference:
#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
#https://stackoverflow.com/questions/51267166/lemmatization-pandas-python
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by handle_lemma for POS-aware lemmatization.
stem_lemmatizer = WordNetLemmatizer()

def safe_word_tokenize(text):
    """Tokenize `text` with word_tokenize, degrading to a whitespace split.

    Args:
        text: the string to tokenize.

    Returns:
        The result of ``word_tokenize(text)``; if that raises LookupError
        (tokenizer data not available), ``text.split()`` instead.
    """
    try:
        tokens = word_tokenize(text)
    except LookupError:
        tokens = text.split()
    return tokens

def convert_wordnet_tag(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Args:
        tag: tag string as produced by pos_tag, or a falsy value.

    Returns:
        wordnet.ADJ / VERB / NOUN / ADV when the tag starts with
        'J' / 'V' / 'N' / 'R' respectively; None for a falsy tag or any other
        prefix.
    """
    if not tag:
        return None
    # Checked in the same order as the tag families are listed above.
    for prefix, constant_name in (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV')):
        if tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

def pos_tag_set(tweet):
    """POS-tag one tweet and translate the tags to WordNet POS constants.

    Args:
        tweet: the cleaned tweet string.

    Returns:
        List of ``(word, wordnet_pos)`` tuples; ``wordnet_pos`` is None when
        the tag has no WordNet counterpart or no tagger was available.
    """
    words = safe_word_tokenize(tweet)
    try:
        tagged = pos_tag(words)
    except LookupError:
        # Tagger data unavailable: keep the tokens, leave their POS unknown.
        tagged = [(word, None) for word in words]
    return [
        (word, convert_wordnet_tag(tag) if tag else None)
        for word, tag in tagged
    ]

# Tag every cleaned tweet and display the resulting frame.
df['pos_tag'] = df['clean_tweet'].apply(pos_tag_set)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...,"[(parliament, n), (covid, n), (vaccine, n), (s..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid,"[(dose, a), (vaccination, n), (syringe, n), (c..."
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...,"[(time, n), (forget, v), (covid, a), (brexit, ..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...,"[(rachel, n), (absolutely, r), (nailed, a), (t..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...,"[(kids, n), (never, r), (not, r), (wana, a), (..."
...,...,...,...,...,...,...,...,...,...,...,...
77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"[very, politicians, media, china, economy, thr...","[very, politician, medium, china, economy, thr...",very politician medium china economy thriving ...,"[(very, r), (politicians, n), (media, n), (chi..."
77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (..."
77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"[hundreds, flights, indian, corona, pretty, ra...","[hundred, flight, indian, corona, pretty, racist]",hundred flight indian corona pretty racist,"[(hundreds, n), (flights, n), (indian, a), (co..."
77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"[sadiq, khan, vaccine, blitz, variant, hit, lo...","[sadiq, khan, vaccine, blitz, variant, hit, lo...",sadiq khan vaccine blitz variant hit london bo...,"[(sadiq, n), (khan, n), (vaccine, n), (blitz, ..."


In [7]:
#create lemma sentence with pos-tags
def handle_lemma(pos_tweet):
    """Build a lemma sentence from ``(word, wordnet_pos)`` pairs.

    Words with a WordNet POS are lemmatized with that POS; words without one
    are kept unchanged. The lemmas are joined with single spaces and the
    result carries no leading or trailing whitespace.

    Args:
        pos_tweet: iterable of ``(word, pos)`` tuples, ``pos`` possibly None.

    Returns:
        The space-joined lemma sentence ("" for an empty input).
    """
    lemmas = [
        stem_lemmatizer.lemmatize(word, pos=pos) if pos else word
        for word, pos in pos_tweet
    ]
    return " ".join(lemmas)
# Build the POS-aware lemma sentence for every tweet in a single pass over the column.
df['lemma_sentence(with POS)'] = df['pos_tag'].apply(handle_lemma)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS)
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...,"[(parliament, n), (covid, n), (vaccine, n), (s...",parliament covid vaccine should day not week...
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid,"[(dose, a), (vaccination, n), (syringe, n), (c...",dose vaccination syringe combat covid
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...,"[(time, n), (forget, v), (covid, a), (brexit, ...",time forget covid brexit trump sleep listeni...
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...,"[(rachel, n), (absolutely, r), (nailed, a), (t...",rachel absolutely nailed tonight program con...
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...,"[(kids, n), (never, r), (not, r), (wana, a), (...",kid never not wana homework sparkle covid sp...
...,...,...,...,...,...,...,...,...,...,...,...,...
77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"[very, politicians, media, china, economy, thr...","[very, politician, medium, china, economy, thr...",very politician medium china economy thriving ...,"[(very, r), (politicians, n), (media, n), (chi...",very politician medium china economy thrive ...
77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...
77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"[hundreds, flights, indian, corona, pretty, ra...","[hundred, flight, indian, corona, pretty, racist]",hundred flight indian corona pretty racist,"[(hundreds, n), (flights, n), (indian, a), (co...",hundred flight indian corona pretty racist
77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"[sadiq, khan, vaccine, blitz, variant, hit, lo...","[sadiq, khan, vaccine, blitz, variant, hit, lo...",sadiq khan vaccine blitz variant hit london bo...,"[(sadiq, n), (khan, n), (vaccine, n), (blitz, ...",sadiq khan vaccine blitz variant hit london ...


In [8]:
#save as csv
# Create the output folder first so the export also works when it does not exist yet.
os.makedirs('Further_clean_datasets', exist_ok=True)
df.to_csv('Further_clean_datasets/further_clean_all.csv',index = False, encoding='utf_8_sig')

In [9]:
#split into 3 steps
import datetime
# Parse created_at and keep only the calendar date so rows compare against datetime.date cut-offs.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True).dt.date
# Step one: every tweet dated before 2021-03-08.
further_step_one = df.loc[df['created_at'] < datetime.date(2021, 3, 8)].reset_index(drop=True)
print("step_one:", len(further_step_one))

step_one: 29920


In [10]:
# Step two: 2021-03-08 (inclusive) up to 2021-05-17 (exclusive).
further_step_two = df.loc[
    (df['created_at'] >= datetime.date(2021, 3, 8))
    & (df['created_at'] < datetime.date(2021, 5, 17))
].reset_index(drop=True)
print("step_two:", len(further_step_two))

step_two: 24682


In [11]:
# Step three: 2021-05-17 through 2021-07-18, both inclusive.
# Rows dated after 2021-07-18 would fall into none of the three steps.
further_step_three = df.loc[
    (df['created_at'] >= datetime.date(2021, 5, 17))
    & (df['created_at'] <= datetime.date(2021, 7, 18))
].reset_index(drop=True)
print("step_three:", len(further_step_three))
# Combined size of the three steps, for comparison with the full row count.
sum(len(step) for step in (further_step_one, further_step_two, further_step_three))

step_three: 22719


77321

In [12]:
# Export the three time-window subsets; create the output folder first so the
# export also works when it does not exist yet.
os.makedirs('Further_clean_datasets', exist_ok=True)
further_step_one.to_csv('Further_clean_datasets/further_clean_step1.csv',index = False, encoding='utf_8_sig')
further_step_two.to_csv('Further_clean_datasets/further_clean_step2.csv',index = False, encoding='utf_8_sig')
further_step_three.to_csv('Further_clean_datasets/further_clean_step3.csv',index = False, encoding='utf_8_sig')

In [13]:
#sentiwordnet analysis
#reference: https://github.com/harika-bonthu/Lexicon-based-SentimentAnalysis/blob/main/lexicon_based_sentiment_analysis.ipynb
# Fetch SentiWordNet only when it cannot be found (plain or zipped), instead of
# contacting the download server on every run.
for _swn_resource in ('corpora/sentiwordnet', 'corpora/sentiwordnet.zip'):
    try:
        nltk.data.find(_swn_resource)
        break
    except LookupError:
        continue
else:
    nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Note: the results differ from those of the earlier method.

def sentiwordnetanalysis(pos_data):
    """Sum SentiWordNet polarity over the POS-tagged words of one tweet.

    For every word that has a WordNet POS and at least one synset, the first
    synset is looked up in SentiWordNet and ``pos_score - neg_score`` is added
    to the total. The total is not divided by the number of scored words.

    Args:
        pos_data: iterable of ``(word, wordnet_pos)`` tuples.

    Returns:
        The accumulated score; 0 when no word could be scored.
    """
    total = 0
    for word, pos in pos_data:
        if not pos:
            # No WordNet POS for this word: it cannot be looked up.
            continue
        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        senses = wordnet.synsets(lemma, pos=pos) if lemma else []
        if not senses:
            continue
        # Only the first synset returned for the lemma is scored.
        first_sense = swn.senti_synset(senses[0].name())
        total += first_sense.pos_score() - first_sense.neg_score()
    return total

# Score every tweet from its (word, WordNet POS) pairs and display the resulting frame.
df['sentiword_analysis']=df['pos_tag'].apply(sentiwordnetanalysis)
df


[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...,"[(parliament, n), (covid, n), (vaccine, n), (s...",parliament covid vaccine should day not week...,-0.625
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid,"[(dose, a), (vaccination, n), (syringe, n), (c...",dose vaccination syringe combat covid,-0.125
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...,"[(time, n), (forget, v), (covid, a), (brexit, ...",time forget covid brexit trump sleep listeni...,0.250
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...,"[(rachel, n), (absolutely, r), (nailed, a), (t...",rachel absolutely nailed tonight program con...,0.625
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...,"[(kids, n), (never, r), (not, r), (wana, a), (...",kid never not wana homework sparkle covid sp...,-0.875
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"[very, politicians, media, china, economy, thr...","[very, politician, medium, china, economy, thr...",very politician medium china economy thriving ...,"[(very, r), (politicians, n), (media, n), (chi...",very politician medium china economy thrive ...,0.250
77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.750
77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"[hundreds, flights, indian, corona, pretty, ra...","[hundred, flight, indian, corona, pretty, racist]",hundred flight indian corona pretty racist,"[(hundreds, n), (flights, n), (indian, a), (co...",hundred flight indian corona pretty racist,-0.125
77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"[sadiq, khan, vaccine, blitz, variant, hit, lo...","[sadiq, khan, vaccine, blitz, variant, hit, lo...",sadiq khan vaccine blitz variant hit london bo...,"[(sadiq, n), (khan, n), (vaccine, n), (blitz, ...",sadiq khan vaccine blitz variant hit london ...,0.000


In [14]:
#VADER--use further clean datasets
# Build the analyzer once; every row reuses this instance instead of
# constructing a new analyzer per call.
sentiment_analyzer = SentimentIntensityAnalyzer()
def vaderSentiment_method(df):
    """Return the VADER compound score for one row.

    Args:
        df: despite the name, a single row (this function is applied with
            ``axis=1``) exposing the 'lemma_sentence(with POS)' field.

    Returns:
        The 'compound' entry of ``polarity_scores`` for that sentence.
    """
    snt_score = sentiment_analyzer.polarity_scores(df['lemma_sentence(with POS)'])
    return snt_score['compound']

df['vader_score'] = df.apply(vaderSentiment_method, axis=1)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...,"[(parliament, n), (covid, n), (vaccine, n), (s...",parliament covid vaccine should day not week...,-0.625,-0.0572
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid,"[(dose, a), (vaccination, n), (syringe, n), (c...",dose vaccination syringe combat covid,-0.125,-0.3400
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...,"[(time, n), (forget, v), (covid, a), (brexit, ...",time forget covid brexit trump sleep listeni...,0.250,-0.2263
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...,"[(rachel, n), (absolutely, r), (nailed, a), (t...",rachel absolutely nailed tonight program con...,0.625,0.0000
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...,"[(kids, n), (never, r), (not, r), (wana, a), (...",kid never not wana homework sparkle covid sp...,-0.875,0.0634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"[very, politicians, media, china, economy, thr...","[very, politician, medium, china, economy, thr...",very politician medium china economy thriving ...,"[(very, r), (politicians, n), (media, n), (chi...",very politician medium china economy thrive ...,0.250,-0.7485
77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.750,0.0000
77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"[hundreds, flights, indian, corona, pretty, ra...","[hundred, flight, indian, corona, pretty, racist]",hundred flight indian corona pretty racist,"[(hundreds, n), (flights, n), (indian, a), (co...",hundred flight indian corona pretty racist,-0.125,-0.2023
77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"[sadiq, khan, vaccine, blitz, variant, hit, lo...","[sadiq, khan, vaccine, blitz, variant, hit, lo...",sadiq khan vaccine blitz variant hit london bo...,"[(sadiq, n), (khan, n), (vaccine, n), (blitz, ...",sadiq khan vaccine blitz variant hit london ...,0.000,0.0000


In [15]:
#Textblob
from textblob import TextBlob

def Polarity_score(tweet):
    """Return the TextBlob sentiment polarity of `tweet`."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

def Subjectivity_score(tweet):
    """Return the TextBlob sentiment subjectivity of `tweet`."""
    blob = TextBlob(tweet)
    return blob.sentiment.subjectivity

# Score the POS-aware lemma sentence of every tweet and display the resulting frame.
df['textblob_polarity'] = df['lemma_sentence(with POS)'].apply(Polarity_score)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score,textblob_polarity
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament covid vaccine should days not weeks...,"[parliament, covid, vaccine, should, days, not...","[parliament, covid, vaccine, should, day, not,...",parliament covid vaccine should day not week c...,"[(parliament, n), (covid, n), (vaccine, n), (s...",parliament covid vaccine should day not week...,-0.625,-0.0572,0.000000
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,dose vaccination syringe combating covid,"[dose, vaccination, syringe, combating, covid]","[dose, vaccination, syringe, combating, covid]",dose vaccination syringe combating covid,"[(dose, a), (vaccination, n), (syringe, n), (c...",dose vaccination syringe combat covid,-0.125,-0.3400,0.000000
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget covid brexit trump sleep listening...,"[time, forget, covid, brexit, trump, sleep, li...","[time, forget, covid, brexit, trump, sleep, li...",time forget covid brexit trump sleep listening...,"[(time, n), (forget, v), (covid, a), (brexit, ...",time forget covid brexit trump sleep listeni...,0.250,-0.2263,0.000000
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight program conti...,"[rachel, absolutely, nailed, tonight, program,...","[rachel, absolutely, nailed, tonight, program,...",rachel absolutely nailed tonight program conti...,"[(rachel, n), (absolutely, r), (nailed, a), (t...",rachel absolutely nailed tonight program con...,0.625,0.0000,0.300000
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never not wana homework sparkles covid sp...,"[kids, never, not, wana, homework, sparkles, c...","[kid, never, not, wana, homework, sparkle, cov...",kid never not wana homework sparkle covid spar...,"[(kids, n), (never, r), (not, r), (wana, a), (...",kid never not wana homework sparkle covid sp...,-0.875,0.0634,0.025000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77316,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",very politicians media china economy thriving ...,"[very, politicians, media, china, economy, thr...","[very, politician, medium, china, economy, thr...",very politician medium china economy thriving ...,"[(very, r), (politicians, n), (media, n), (chi...",very politician medium china economy thrive ...,0.250,-0.7485,-0.005556
77317,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.750,0.0000,0.000000
77318,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights indian corona pretty racist,"[hundreds, flights, indian, corona, pretty, ra...","[hundred, flight, indian, corona, pretty, racist]",hundred flight indian corona pretty racist,"[(hundreds, n), (flights, n), (indian, a), (co...",hundred flight indian corona pretty racist,-0.125,-0.2023,0.250000
77319,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan vaccine blitz variant hit london bo...,"[sadiq, khan, vaccine, blitz, variant, hit, lo...","[sadiq, khan, vaccine, blitz, variant, hit, lo...",sadiq khan vaccine blitz variant hit london bo...,"[(sadiq, n), (khan, n), (vaccine, n), (blitz, ...",sadiq khan vaccine blitz variant hit london ...,0.000,0.0000,0.000000


In [16]:
##save as csv
# Create the output folder first so the export also works when it does not exist yet.
os.makedirs('unsupervised_datasets', exist_ok=True)
df.to_csv('unsupervised_datasets/lexicon_all.csv',index = False, encoding='utf_8_sig')

In [17]:
#split into 3 steps
import datetime
# Parse created_at and keep only the calendar date so rows compare against datetime.date cut-offs.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True).dt.date
# Step one: every scored tweet dated before 2021-03-08.
lexicon_step_one = df.loc[df['created_at'] < datetime.date(2021, 3, 8)].reset_index(drop=True)
print("step_one:", len(lexicon_step_one))

step_one: 29920


In [18]:
# Step two: 2021-03-08 (inclusive) up to 2021-05-17 (exclusive).
lexicon_step_two = df.loc[
    (df['created_at'] >= datetime.date(2021, 3, 8))
    & (df['created_at'] < datetime.date(2021, 5, 17))
].reset_index(drop=True)
print("step_two:", len(lexicon_step_two))

step_two: 24682


In [19]:
# Step three: 2021-05-17 through 2021-07-18, both inclusive.
lexicon_step_three = df.loc[
    (df['created_at'] >= datetime.date(2021, 5, 17))
    & (df['created_at'] <= datetime.date(2021, 7, 18))
].reset_index(drop=True)
print("step_three:", len(lexicon_step_three))

step_three: 22719


In [20]:
#merge three datasets
# Stack the three time-window subsets in order and renumber the rows from 0.
dfs = pd.concat(
    [lexicon_step_one, lexicon_step_two, lexicon_step_three]
).reset_index(drop=True)

In [21]:
# Export the three scored time-window subsets; create the output folder first
# so the export also works when it does not exist yet.
os.makedirs('unsupervised_datasets', exist_ok=True)
lexicon_step_one.to_csv('unsupervised_datasets/lexicon_step1.csv',index = False, encoding='utf_8_sig')
lexicon_step_two.to_csv('unsupervised_datasets/lexicon_step2.csv',index = False, encoding='utf_8_sig')
lexicon_step_three.to_csv('unsupervised_datasets/lexicon_step3.csv',index = False, encoding='utf_8_sig')