In [1]:
#import libraries
import pandas as pd

# Before we begin, we supress deprecation warnings resulting from nltk on Kaggle
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
#pip install nltk

In [3]:
tweets = pd.read_csv("FINAL_TweetDataset.csv") #read database
list(tweets.columns.values)

['author id',
 'created_at',
 'geo',
 'id',
 'lang',
 'like_count',
 'quote_count',
 'reply_count',
 'retweet_count',
 'source',
 'tweet',
 'location',
 'bbox']

In [4]:
tweets.info() #dataset summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137143 entries, 0 to 137142
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   author id      137143 non-null  int64 
 1   created_at     137143 non-null  object
 2   geo            137143 non-null  object
 3   id             137143 non-null  int64 
 4   lang           137143 non-null  object
 5   like_count     137143 non-null  int64 
 6   quote_count    137143 non-null  int64 
 7   reply_count    137143 non-null  int64 
 8   retweet_count  137143 non-null  int64 
 9   source         137143 non-null  object
 10  tweet          137143 non-null  object
 11  location       137143 non-null  object
 12  bbox           137143 non-null  object
dtypes: int64(6), object(7)
memory usage: 13.6+ MB


In [5]:
tweets.head()

Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet,location,bbox
0,823596424499724288,2020-01-30 23:58:49+00:00,38d05a66be6d4ee1,1223032854382370816,en,2,0,1,0,Twitter for iPhone,To anyone who came to the Chelmo show\r\n\r\nI...,"Colchester, England","[0.845165, 51.853339, 0.9526169, 51.9240639]"
1,19717371,2020-01-30 23:54:05+00:00,4f854c83732cf4f5,1223031661962702849,en,1,0,0,0,Twitter for iPhone,That vegan KFC burger üçî is looking all the mor...,"Watford, East","[-0.4444586, 51.6322356, -0.340092, 51.703921]"
2,177955303,2020-01-30 23:40:18+00:00,00fd3b1ffb89eb31,1223028192761602054,en,0,0,0,1,Twitter for iPhone,https://t.co/U272ukR1ih present 2019-nCoV seem...,"Salisbury, England","[-1.838006, 51.050942, -1.765956, 51.093695]"
3,1653808938,2020-01-30 23:39:01+00:00,35337df206d94ebc,1223027870076936192,en,2,0,1,1,Twitter for iPhone,11 days with this flu and now secondary infect...,"Newburgh, Scotland","[-3.251026, 56.3458372, -3.2199292, 56.353819]"
4,894305025329115137,2020-01-30 23:36:39+00:00,28679b23ed15b380,1223027274242580481,en,0,0,0,0,Twitter for iPhone,don‚Äôt drink corona if u don‚Äôt want the virus m...,"Belfast, Northern Ireland","[-6.0361161, 54.543241, -5.8207101, 54.6484968]"


In [6]:
import pandas as pd
import re

#Step 1
#Function - removing urls and mentions

def remove_usernames_links(tweet):
        tweet = re.sub('@[^\s]+','',str(tweet))
        tweet = re.sub('http[^\s]+','',str(tweet))
        return tweet
    
tweets['preprocessed_tweet'] = tweets['tweet'].apply(remove_usernames_links)

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

#Step 2 
# FUNCTION that performs non alphabet character removals, tokenization, lower casing, stop words removal and lemmatization
def data_preprocessing(tweet):
    only_letters = re.sub(r"(@[A-Za-z0-9_]+)|[^\w\s]|#|http\S+", " ",tweet) #regex to remove non-alphabet character
    tokens = nltk.word_tokenize(only_letters)[2:] #tokenize
    lower_case = [l.lower() for l in tokens] #lowercase
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))# remove stop words
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result] #lemmatize
    return lemmas

tweets['lemmatized_tweet'] = tweets.preprocessed_tweet.apply(data_preprocessing) #applying "data_preprocessing" function
tweets[['tweet','preprocessed_tweet','lemmatized_tweet']].sample(2)

Unnamed: 0,tweet,preprocessed_tweet,lemmatized_tweet
38252,@ClareAllison111 @chamberlain310 @Swim_England...,But the report you quote DOES say that thi...,"[report, quote, say, due, greater, number, you..."
38642,"Apologies once again for the lack of updates, ...","Apologies once again for the lack of updates, ...","[lack, update, rushed, back, hospital, monday,..."


In [8]:
tweets["preprocessed_tweet"] = tweets["preprocessed_tweet"].str.lower() #lowercase

regex_clean = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)
tweets["preprocessed_tweet"] = tweets["preprocessed_tweet"].str.replace(regex_clean, '')

# joining tokens for BERT
tweets["preprocessed_tweet"] = tweets["preprocessed_tweet"].apply(lambda x: " ".join([word for word in x.split() if word not in (stop_words)]))

In [9]:
tweets.sample(2)

Unnamed: 0,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet,location,bbox,preprocessed_tweet,lemmatized_tweet
115004,149459235,2021-11-14 22:19:54+00:00,7ae9e2f2ff7a87cd,1460009624401285127,en,0,0,0,0,Instagram,SEMPRE CHEROSINHO! üôÉ\r\n#fyp #foryou #viral #b...,"Edinburgh, Scotland","[-3.3285119, 55.894729, -3.077505, 55.991662]",sempre cherosinho fyp foryou viral brasil prav...,"[fyp, foryou, viral, brasil, pravoce, edinburg..."
9931,443162177,2020-03-30 16:07:26+00:00,7d7bdec12d2549d4,1244657495764762626,en,0,0,0,0,Twitter for iPhone,@weston_vivienne @NUH_Infection @IPCT_NUH Than...,"Nottingham, England","[-1.2501363, 52.921473, -1.094396, 53.017965]",thank team microbiology invaluable support,"[team, microbiology, invaluable, support]"


In [10]:
#Step 3

#Remove duplicated tweets
tweets.drop_duplicates(inplace=True, subset="preprocessed_tweet")
tweets.reset_index()

Unnamed: 0,index,author id,created_at,geo,id,lang,like_count,quote_count,reply_count,retweet_count,source,tweet,location,bbox,preprocessed_tweet,lemmatized_tweet
0,0,823596424499724288,2020-01-30 23:58:49+00:00,38d05a66be6d4ee1,1223032854382370816,en,2,0,1,0,Twitter for iPhone,To anyone who came to the Chelmo show\r\n\r\nI...,"Colchester, England","[0.845165, 51.853339, 0.9526169, 51.9240639]",anyone came chelmo show wholeheartedly apologi...,"[came, chelmo, show, wholeheartedly, apologise..."
1,1,19717371,2020-01-30 23:54:05+00:00,4f854c83732cf4f5,1223031661962702849,en,1,0,0,0,Twitter for iPhone,That vegan KFC burger üçî is looking all the mor...,"Watford, East","[-0.4444586, 51.6322356, -0.340092, 51.703921]",vegan kfc burger looking alluring chlorinatedc...,"[kfc, burger, looking, alluring, chlorinatedch..."
2,2,177955303,2020-01-30 23:40:18+00:00,00fd3b1ffb89eb31,1223028192761602054,en,0,0,0,1,Twitter for iPhone,https://t.co/U272ukR1ih present 2019-nCoV seem...,"Salisbury, England","[-1.838006, 51.050942, -1.765956, 51.093695]",present ncov seems highly infectious low morta...,"[ncov, seems, highly, infectious, low, mortali..."
3,3,1653808938,2020-01-30 23:39:01+00:00,35337df206d94ebc,1223027870076936192,en,2,0,1,1,Twitter for iPhone,11 days with this flu and now secondary infect...,"Newburgh, Scotland","[-3.251026, 56.3458372, -3.2199292, 56.353819]",days flu secondary infection manuka honey galo...,"[flu, secondary, infection, manuka, honey, gal..."
4,4,894305025329115137,2020-01-30 23:36:39+00:00,28679b23ed15b380,1223027274242580481,en,0,0,0,0,Twitter for iPhone,don‚Äôt drink corona if u don‚Äôt want the virus m...,"Belfast, Northern Ireland","[-6.0361161, 54.543241, -5.8207101, 54.6484968]",dont drink corona u dont want virus man r peop...,"[drink, corona, u, want, virus, man, r, people..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133149,137135,86938806,2022-05-01 07:00:24+00:00,01c7de39d50eab15,1520659386347184132,en,1,0,1,0,Twitter for iPhone,Petrol prices are so high now pretty soon the ...,"Romford, London","[0.140821, 51.500544, 0.280358, 51.62063]",petrol prices high pretty soon drivers seat ca...,"[high, pretty, soon, driver, seat, car, used, ..."
133150,137136,3403129055,2022-05-01 02:56:52+00:00,53b67b1d1cc81a51,1520598096874516480,en,1,0,1,1,Twitter for iPhone,I just wanna get better now üò≠ This flu has bat...,"Birmingham, England","[-2.033651, 52.381063, -1.74763, 52.60687]",wanna get better flu battered bruh,"[wan, na, get, better, flu, battered, bruh]"
133151,137137,1473117849103253510,2022-05-01 01:34:34+00:00,3eb2c704fe8a50cb,1520577387523620865,en,0,0,0,0,Instagram,"When you're lost, look for a flower üåπüçÉ\r\n\r\n...","City of London, London","[-0.112442, 51.5068, -0.0733794, 51.522161]",youre lost look flower noredmi note pro viral ...,"[lost, look, flower, noredmi, note, 10, pro, v..."
133152,137138,2373862307,2022-05-01 00:20:19+00:00,702af17459231b92,1520558701630922753,en,0,0,0,0,Twitter for iPhone,@Em_Lickspittle @MargeJenny @Kit_Yates_Maths W...,"Brighouse, England","[-1.82498, 53.692956, -1.749516, 53.732703]",careers etc getting th like flu vaccine going ...,"[career, etc, getting, 4th, like, flu, vaccine..."


In [11]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133154 entries, 0 to 137142
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   author id           133154 non-null  int64 
 1   created_at          133154 non-null  object
 2   geo                 133154 non-null  object
 3   id                  133154 non-null  int64 
 4   lang                133154 non-null  object
 5   like_count          133154 non-null  int64 
 6   quote_count         133154 non-null  int64 
 7   reply_count         133154 non-null  int64 
 8   retweet_count       133154 non-null  int64 
 9   source              133154 non-null  object
 10  tweet               133154 non-null  object
 11  location            133154 non-null  object
 12  bbox                133154 non-null  object
 13  preprocessed_tweet  133154 non-null  object
 14  lemmatized_tweet    133154 non-null  object
dtypes: int64(6), object(9)
memory usage: 16.3+ MB


In [12]:
tweets.to_csv('Processed_TweetDataset.csv', index=False)