In [35]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
import nltk
import os

# Ensure NLTK data path and required resources
nltk_data_dir = os.path.expanduser("~/nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Package name -> NLTK data category (the sub-directory the package lives in).
NLTK_RESOURCES = {
    'punkt': 'tokenizers',
    'averaged_perceptron_tagger': 'taggers',
    'wordnet': 'corpora',
    'stopwords': 'corpora',
    'omw-1.4': 'corpora',
}


def _nltk_resource_installed(category, pkg):
    """Return True when ``pkg`` can be located under ``category``.

    Both the unpacked form (``category/pkg``) and the zipped form
    (``category/pkg.zip``) are probed: some packages are kept as zip
    archives, and looking only for the unpacked directory made this cell
    re-download wordnet and omw-1.4 on every run even though they were
    already installed.
    """
    for candidate in (f"{category}/{pkg}", f"{category}/{pkg}.zip"):
        try:
            nltk.data.find(candidate)
        except LookupError:
            continue
        return True
    return False


# Download resources if missing
# NOTE(review): newer NLTK releases may look for differently named tokenizer /
# tagger packages - confirm against the installed NLTK version.
for pkg, category in NLTK_RESOURCES.items():
    if not _nltk_resource_installed(category, pkg):
        nltk.download(pkg)


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [36]:
# Load the cleaned tweet dataset from disk.
clean_all_csv = 'clean_datasets/clean_all.csv'
df = pd.read_csv(clean_all_csv)
df.head()  # value is discarded: only the last expression of a cell is displayed
len(df)

77245

In [37]:
#tokenization
def tweet_token(text):
    """Split every element of a pandas Series on whitespace.

    Args:
        text: a Series that supports the ``.str`` accessor.

    Returns:
        A plain Python list with one entry per Series element, each entry
        being whatever ``Series.str.split()`` produced for that element.
    """
    return list(text.str.split())

# Store the per-tweet token lists in a new 'tokens' column and preview the first rows.
df['tokens']=tweet_token(df['clean_tweet'])
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]"
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s..."


In [38]:
#lemmatize and stemming
import nltk
# Tokenizer / lemmatizer instances shared by lemmatize_text.
word_tokenizer = nltk.tokenize.WhitespaceTokenizer()
word_lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text`` on whitespace and lemmatize each token.

    The lemmatizer is called without a POS argument.

    Returns:
        A list with one lemma per whitespace-separated token.
    """
    lemmas = []
    for piece in word_tokenizer.tokenize(text):
        lemmas.append(word_lemmatizer.lemmatize(piece))
    return lemmas


df['lemma'] = df['clean_tweet'].apply(lemmatize_text)
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]"
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp..."


In [39]:
#lemma_sentence
# Re-join each lemma list into one space-separated string.
df['lemma_sentence'] = df['lemma'].apply(' '.join)
df.head()

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...


In [40]:
#POS for clean tweets
#reference:
#https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
#https://stackoverflow.com/questions/51267166/lemmatization-pandas-python
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance that handle_lemma uses for POS-aware lemmatization.
stem_lemmatizer = WordNetLemmatizer()

# Safe tokenization helper: fall back to simple split if punkt not available
def safe_word_tokenize(text):
    """Tokenize ``text`` with ``word_tokenize``, degrading to ``str.split``.

    The fallback is taken only when ``word_tokenize`` raises ``LookupError``.

    Returns:
        The list of tokens.
    """
    try:
        tokens = word_tokenize(text)
    except LookupError:
        tokens = text.split()
    return tokens

def convert_wordnet_tag(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags beginning with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.

    Returns:
        The WordNet constant, or ``None`` for a falsy tag or any other prefix.
    """
    if not tag:
        return None
    # Checked in the same order as the original if/elif chain; the attribute
    # is only looked up on ``wordnet`` once a prefix actually matches.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

def pos_tag_set(tweet):
    """Tokenize and POS-tag ``tweet``, converting tags to WordNet constants.

    Returns:
        A list of ``(word, pos)`` tuples where ``pos`` is the result of
        ``convert_wordnet_tag`` or ``None`` when no tag is available.
    """
    words = safe_word_tokenize(tweet)
    try:
        tagged = pos_tag(words)
    except LookupError:
        # The tagger could not be loaded: keep the words, mark every POS unknown.
        tagged = [(w, None) for w in words]
    return [
        (word, convert_wordnet_tag(tag) if tag else None)
        for word, tag in tagged
    ]

# Tag every cleaned tweet; each 'pos_tag' cell holds the list of (word, pos) tuples from pos_tag_set.
df['pos_tag'] = df['clean_tweet'].apply(pos_tag_set)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...,"[(parliament, n), (days, n), (not, r), (weeks,..."
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating,"[(first, r), (dose, a), (syringe, n), (combati..."
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...,"[(time, n), (forget, v), (sleep, a), (listenin..."
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...,"[(rachel, n), (absolutely, r), (nailed, v), (t..."
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...,"[(kids, n), (never, r), (say, v), (not, r), (w..."
...,...,...,...,...,...,...,...,...,...,...,...
77240,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",little interest politicians media well china e...,"[little, interest, politicians, media, well, c...","[little, interest, politician, medium, well, c...",little interest politician medium well china e...,"[(little, a), (interest, n), (politicians, n),..."
77241,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (..."
77242,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights full indian corona comes acro...,"[hundreds, flights, full, indian, corona, come...","[hundred, flight, full, indian, corona, come, ...",hundred flight full indian corona come across ...,"[(hundreds, n), (flights, n), (full, a), (indi..."
77243,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan asks blitz hit boroughs,"[sadiq, khan, asks, blitz, hit, boroughs]","[sadiq, khan, asks, blitz, hit, borough]",sadiq khan asks blitz hit borough,"[(sadiq, n), (khan, n), (asks, v), (blitz, r),..."


In [41]:
#create lemma sentence with pos-tags
def handle_lemma(pos_tweet):
    """Lemmatize POS-tagged words and join them into a single sentence.

    Args:
        pos_tweet: iterable of ``(word, pos)`` pairs; ``pos`` is passed to
            the lemmatizer as its ``pos`` argument, or is falsy when no
            usable tag exists for the word.

    Returns:
        The lemmas separated by single spaces. The string carries no
        leading padding (the previous implementation prefixed every result
        with two spaces); an empty input yields ``""``.
    """
    lemmas = [
        # Words without a usable POS are kept verbatim.
        stem_lemmatizer.lemmatize(word, pos=pos) if pos else word
        for word, pos in pos_tweet
    ]
    return " ".join(lemmas)
# Build the POS-aware lemma sentence once per row and store it.
# (The earlier standalone apply() computed the same Series and discarded it.)
df['lemma_sentence(with POS)'] = df['pos_tag'].apply(handle_lemma)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS)
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...,"[(parliament, n), (days, n), (not, r), (weeks,...",parliament day not week challenge government...
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating,"[(first, r), (dose, a), (syringe, n), (combati...",first dose syringe combating
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...,"[(time, n), (forget, v), (sleep, a), (listenin...",time forget sleep listen abcgrandstand via a...
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...,"[(rachel, n), (absolutely, r), (nailed, v), (t...",rachel absolutely nail tonight throughout pr...
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...,"[(kids, n), (never, r), (say, v), (not, r), (w...",kid never say not wana homework get sparkle ...
...,...,...,...,...,...,...,...,...,...,...,...,...
77240,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",little interest politicians media well china e...,"[little, interest, politicians, media, well, c...","[little, interest, politician, medium, well, c...",little interest politician medium well china e...,"[(little, a), (interest, n), (politicians, n),...",little interest politician medium well china...
77241,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...
77242,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights full indian corona comes acro...,"[hundreds, flights, full, indian, corona, come...","[hundred, flight, full, indian, corona, come, ...",hundred flight full indian corona come across ...,"[(hundreds, n), (flights, n), (full, a), (indi...",hundred flight full indian corona come acros...
77243,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan asks blitz hit boroughs,"[sadiq, khan, asks, blitz, hit, boroughs]","[sadiq, khan, asks, blitz, hit, borough]",sadiq khan asks blitz hit borough,"[(sadiq, n), (khan, n), (asks, v), (blitz, r),...",sadiq khan ask blitz hit borough


In [42]:
#save as csv
further_all_csv = 'Further_clean_datasets/further_clean_all.csv'
df.to_csv(further_all_csv, index=False, encoding='utf_8_sig')

In [43]:
#split into 3 steps
import datetime

# Reduce timestamps to plain dates so they compare against datetime.date cut-offs.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True).dt.date

# Step one: everything strictly before 8 March 2021.
step_one_cutoff = datetime.date(2021, 3, 8)
before_cutoff = df['created_at'] < step_one_cutoff
further_step_one = df[before_cutoff].reset_index(drop=True)
print("step_one:", len(further_step_one))

step_one: 29882


In [44]:
# Step two: 8 March 2021 (inclusive) up to 17 May 2021 (exclusive).
step_two_start = datetime.date(2021, 3, 8)
step_two_end = datetime.date(2021, 5, 17)
in_step_two = (df['created_at'] >= step_two_start) & (df['created_at'] < step_two_end)
further_step_two = df[in_step_two].reset_index(drop=True)
print("step_two:", len(further_step_two))

step_two: 24663


In [45]:
# Step three: 17 May 2021 through 18 July 2021, both ends inclusive.
step_three_start = datetime.date(2021, 5, 17)
step_three_end = datetime.date(2021, 7, 18)
in_step_three = (df['created_at'] >= step_three_start) & (df['created_at'] <= step_three_end)
further_step_three = df[in_step_three].reset_index(drop=True)
print("step_three:", len(further_step_three))
# Combined row count of the three slices, shown as the cell output.
sum(len(part) for part in (further_step_one, further_step_two, further_step_three))

step_three: 22700


77245

In [46]:
# Write each time slice to its own CSV file.
further_step_exports = (
    (further_step_one, 'Further_clean_datasets/further_clean_step1.csv'),
    (further_step_two, 'Further_clean_datasets/further_clean_step2.csv'),
    (further_step_three, 'Further_clean_datasets/further_clean_step3.csv'),
)
for step_frame, csv_path in further_step_exports:
    step_frame.to_csv(csv_path, index=False, encoding='utf_8_sig')

In [47]:
#sentiwordnet analysis
#reference: https://github.com/harika-bonthu/Lexicon-based-SentimentAnalysis/blob/main/lexicon_based_sentiment_analysis.ipynb
# Request the SentiWordNet corpus; this call is issued on every run of the cell.
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used inside sentiwordnetanalysis.
wordnet_lemmatizer = WordNetLemmatizer()
# Note: the results differ from those of the previous method.

def sentiwordnetanalysis(pos_data):
    """Sum SentiWordNet polarity over the POS-tagged words of one tweet.

    For every ``(word, pos)`` pair with a truthy ``pos`` the word is
    lemmatized, its first WordNet synset for that POS is looked up, and
    ``pos_score - neg_score`` of the corresponding SentiWordNet entry is
    added to the running total. Words with no POS, an empty lemma, or no
    synsets are ignored.

    Returns:
        The accumulated score, or ``0`` when no word contributed.
    """
    total = 0
    scored_words = 0
    for token, wn_pos in pos_data:
        if wn_pos:
            lemma = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
            candidates = wordnet.synsets(lemma, pos=wn_pos) if lemma else []
            if candidates:
                # Only the first (most common) sense is considered.
                senti = swn.senti_synset(candidates[0].name())
                total += senti.pos_score() - senti.neg_score()
                scored_words += 1
    return total if scored_words else 0

# Score every tweet from its (word, pos) list and keep the result in 'sentiword_analysis'.
df['sentiword_analysis']=df['pos_tag'].apply(sentiwordnetanalysis)
df


[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/trinhthanh2508/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...,"[(parliament, n), (days, n), (not, r), (weeks,...",parliament day not week challenge government...,-0.625
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating,"[(first, r), (dose, a), (syringe, n), (combati...",first dose syringe combating,0.000
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...,"[(time, n), (forget, v), (sleep, a), (listenin...",time forget sleep listen abcgrandstand via a...,0.250
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...,"[(rachel, n), (absolutely, r), (nailed, v), (t...",rachel absolutely nail tonight throughout pr...,1.375
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...,"[(kids, n), (never, r), (say, v), (not, r), (w...",kid never say not wana homework get sparkle ...,-0.750
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77240,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",little interest politicians media well china e...,"[little, interest, politicians, media, well, c...","[little, interest, politician, medium, well, c...",little interest politician medium well china e...,"[(little, a), (interest, n), (politicians, n),...",little interest politician medium well china...,0.250
77241,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.875
77242,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights full indian corona comes acro...,"[hundreds, flights, full, indian, corona, come...","[hundred, flight, full, indian, corona, come, ...",hundred flight full indian corona come across ...,"[(hundreds, n), (flights, n), (full, a), (indi...",hundred flight full indian corona come acros...,0.750
77243,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan asks blitz hit boroughs,"[sadiq, khan, asks, blitz, hit, boroughs]","[sadiq, khan, asks, blitz, hit, borough]",sadiq khan asks blitz hit borough,"[(sadiq, n), (khan, n), (asks, v), (blitz, r),...",sadiq khan ask blitz hit borough,0.000


In [48]:
#VADER--low speed--use further clean datasets
# One shared analyzer for the whole notebook: it is built once here instead of
# once per row inside vaderSentiment_method.
sentiment_analyzer = SentimentIntensityAnalyzer()


def vaderSentiment_method(df):
    """Return the VADER compound score for one DataFrame row.

    Args:
        df: despite the name, a single row (the function is applied with
            ``axis=1``); it must provide the 'lemma_sentence(with POS)' field.

    Returns:
        The ``'compound'`` entry of the analyzer's polarity scores for that text.
    """
    # Reuse the module-level analyzer; the original constructed a fresh
    # SentimentIntensityAnalyzer on every call, i.e. once per row.
    snt_score = sentiment_analyzer.polarity_scores(df['lemma_sentence(with POS)'])
    return snt_score['compound']

# Row-wise apply: vaderSentiment_method receives each row and reads its lemma sentence.
df['vader_score'] = df.apply(vaderSentiment_method, axis=1)
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...,"[(parliament, n), (days, n), (not, r), (weeks,...",parliament day not week challenge government...,-0.625,-0.0572
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating,"[(first, r), (dose, a), (syringe, n), (combati...",first dose syringe combating,0.000,0.0000
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...,"[(time, n), (forget, v), (sleep, a), (listenin...",time forget sleep listen abcgrandstand via a...,0.250,-0.2263
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...,"[(rachel, n), (absolutely, r), (nailed, v), (t...",rachel absolutely nail tonight throughout pr...,1.375,0.6124
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...,"[(kids, n), (never, r), (say, v), (not, r), (w...",kid never say not wana homework get sparkle ...,-0.750,0.6573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77240,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",little interest politicians media well china e...,"[little, interest, politicians, media, well, c...","[little, interest, politician, medium, well, c...",little interest politician medium well china e...,"[(little, a), (interest, n), (politicians, n),...",little interest politician medium well china...,0.250,-0.2484
77241,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.875,0.0000
77242,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights full indian corona comes acro...,"[hundreds, flights, full, indian, corona, come...","[hundred, flight, full, indian, corona, come, ...",hundred flight full indian corona come across ...,"[(hundreds, n), (flights, n), (full, a), (indi...",hundred flight full indian corona come acros...,0.750,-0.2023
77243,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan asks blitz hit boroughs,"[sadiq, khan, asks, blitz, hit, boroughs]","[sadiq, khan, asks, blitz, hit, borough]",sadiq khan asks blitz hit borough,"[(sadiq, n), (khan, n), (asks, v), (blitz, r),...",sadiq khan ask blitz hit borough,0.000,0.0000


In [49]:
#Textblob
from textblob import TextBlob

def Polarity_score(tweet):
    """Return the TextBlob sentiment polarity of ``tweet``."""
    sentiment = TextBlob(tweet).sentiment
    return sentiment.polarity

def Subjectivity_score(tweet):
    """Return the TextBlob sentiment subjectivity of ``tweet``."""
    sentiment = TextBlob(tweet).sentiment
    return sentiment.subjectivity

# TextBlob polarity of the POS-aware lemma sentence, one value per tweet.
df['textblob_polarity'] = df['lemma_sentence(with POS)'].apply(Polarity_score) 
df

Unnamed: 0,created_at,user_id,username,tweet,place,near,clean_tweet,tokens,lemma,lemma_sentence,pos_tag,lemma_sentence(with POS),sentiword_analysis,vader_score,textblob_polarity
0,2021-01-06,293175196,sjtafalla,UK Parliament: 2nd Covid Vaccine should be 21 ...,,London,parliament days not weeks challenge government...,"[parliament, days, not, weeks, challenge, gove...","[parliament, day, not, week, challenge, govern...",parliament day not week challenge government s...,"[(parliament, n), (days, n), (not, r), (weeks,...",parliament day not week challenge government...,-0.625,-0.0572,0.000000
1,2021-01-06,1591779799,ellieelif,First dose of vaccination üíâ 5/1/2021..to comba...,,London,first dose syringe combating,"[first, dose, syringe, combating]","[first, dose, syringe, combating]",first dose syringe combating,"[(first, r), (dose, a), (syringe, n), (combati...",first dose syringe combating,0.000,0.0000,0.250000
2,2021-01-06,215143656,danananarama,"Time to forget about #COVID, #Brexit and #Trum...",,London,time forget sleep listening abcgrandstand via ...,"[time, forget, sleep, listening, abcgrandstand...","[time, forget, sleep, listening, abcgrandstand...",time forget sleep listening abcgrandstand via ...,"[(time, n), (forget, v), (sleep, a), (listenin...",time forget sleep listen abcgrandstand via a...,0.250,-0.2263,0.000000
3,2021-01-06,336462129,veronica_foote_,@doctor_oxford Rachel you absolutely nailed it...,,London,rachel absolutely nailed tonight throughout pr...,"[rachel, absolutely, nailed, tonight, througho...","[rachel, absolutely, nailed, tonight, througho...",rachel absolutely nailed tonight throughout pr...,"[(rachel, n), (absolutely, r), (nailed, v), (t...",rachel absolutely nail tonight throughout pr...,1.375,0.6124,0.300000
4,2021-01-06,1063705581133934593,5herii,My kids can never say they don‚Äôt wanna do thei...,,London,kids never say not wana homework got sparkles ...,"[kids, never, say, not, wana, homework, got, s...","[kid, never, say, not, wana, homework, got, sp...",kid never say not wana homework got sparkle sp...,"[(kids, n), (never, r), (say, v), (not, r), (w...",kid never say not wana homework get sparkle ...,-0.750,0.6573,0.025000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77240,2021-05-20,1161700993840680961,PhilipCrook9,Very little interest by politicians and media ...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",little interest politicians media well china e...,"[little, interest, politicians, media, well, c...","[little, interest, politician, medium, well, c...",little interest politician medium well china e...,"[(little, a), (interest, n), (politicians, n),...",little interest politician medium well china...,0.250,-0.2484,-0.118056
77241,2021-05-19,146596633,edwardjsault,#BREAKING Health Secretary Matt Hancock has an...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",breaking health secretary matt hancock announc...,"[breaking, health, secretary, matt, hancock, a...","[breaking, health, secretary, matt, hancock, a...",breaking health secretary matt hancock announc...,"[(breaking, v), (health, n), (secretary, n), (...",break health secretary matt hancock announce...,0.875,0.0000,0.250000
77242,2021-05-19,235304684,phoTomics,‚ÄúHundreds of flights full of Indian corona‚Äù co...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",hundreds flights full indian corona comes acro...,"[hundreds, flights, full, indian, corona, come...","[hundred, flight, full, indian, corona, come, ...",hundred flight full indian corona come across ...,"[(hundreds, n), (flights, n), (full, a), (indi...",hundred flight full indian corona come acros...,0.750,-0.2023,0.300000
77243,2021-05-17,726443182997835778,SarahClift10,Sadiq Khan asks for vaccine blitz in variant-h...,"{'type': 'Feature', 'bbox': [-1.476463, 50.883...","Southampton, England",sadiq khan asks blitz hit boroughs,"[sadiq, khan, asks, blitz, hit, boroughs]","[sadiq, khan, asks, blitz, hit, borough]",sadiq khan asks blitz hit borough,"[(sadiq, n), (khan, n), (asks, v), (blitz, r),...",sadiq khan ask blitz hit borough,0.000,0.0000,0.000000


In [50]:
##save as csv
lexicon_all_csv = 'unsupervised_datasets/lexicon_all.csv'
df.to_csv(lexicon_all_csv, index=False, encoding='utf_8_sig')

In [51]:
#split into 3 steps
import datetime

# Reduce timestamps to plain dates so they compare against datetime.date cut-offs.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True).dt.date

# Step one: everything strictly before 8 March 2021.
lexicon_one_cutoff = datetime.date(2021, 3, 8)
before_lexicon_cutoff = df['created_at'] < lexicon_one_cutoff
lexicon_step_one = df[before_lexicon_cutoff].reset_index(drop=True)
print("step_one:", len(lexicon_step_one))

step_one: 29882


In [52]:
# Step two: 8 March 2021 (inclusive) up to 17 May 2021 (exclusive).
lexicon_two_start = datetime.date(2021, 3, 8)
lexicon_two_end = datetime.date(2021, 5, 17)
in_lexicon_two = (df['created_at'] >= lexicon_two_start) & (df['created_at'] < lexicon_two_end)
lexicon_step_two = df[in_lexicon_two].reset_index(drop=True)
print("step_two:", len(lexicon_step_two))

step_two: 24663


In [53]:
# Step three: 17 May 2021 through 18 July 2021, both ends inclusive.
lexicon_three_start = datetime.date(2021, 5, 17)
lexicon_three_end = datetime.date(2021, 7, 18)
in_lexicon_three = (df['created_at'] >= lexicon_three_start) & (df['created_at'] <= lexicon_three_end)
lexicon_step_three = df[in_lexicon_three].reset_index(drop=True)
print("step_three:", len(lexicon_step_three))

step_three: 22700


In [54]:
#merge three datasets
# Stack the three time slices back into one frame and renumber the rows from 0.
dfs = pd.concat(
    [lexicon_step_one, lexicon_step_two, lexicon_step_three]
).reset_index(drop=True)

In [55]:
# Write each scored time slice to its own CSV file.
lexicon_step_exports = (
    (lexicon_step_one, 'unsupervised_datasets/lexicon_step1.csv'),
    (lexicon_step_two, 'unsupervised_datasets/lexicon_step2.csv'),
    (lexicon_step_three, 'unsupervised_datasets/lexicon_step3.csv'),
)
for step_frame, csv_path in lexicon_step_exports:
    step_frame.to_csv(csv_path, index=False, encoding='utf_8_sig')