In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import re

In [2]:
tweets_df = pd.read_csv('data/raw_data.csv')
tweets_df.head()

Unnamed: 0,datetime,tweet_id,text,username,like_count,display_name,lang
0,2022-07-15 23:56:53+00:00,1548094282900221953,The government is considering creating a secon...,TimesLIVE,85,Times LIVE,en
1,2022-07-15 23:54:59+00:00,1548093802857934849,@lebomosebetsi Selo se se dirwang ke @Eskom_SA...,Dan24Stock,0,𝗗𝗮𝗻𝗶𝗲𝗹 🇿🇦 𝗦𝘁𝗼𝗰𝗸.,tl
2,2022-07-15 23:50:41+00:00,1548092722166132738,@nickhedley The only way to fix the current en...,KingTNgema,0,Born A King,en
3,2022-07-15 23:43:00+00:00,1548090785450434566,@Pastorransley He has awarded himself. \n\nHe...,Constitution_94,0,Constitution First 🇿🇦,en
4,2022-07-15 23:40:14+00:00,1548090089334382596,Also @Eskom_SA ???,katleho_katlii,0,Katleho. 🍫,en


In [3]:
tweets_df.shape

(310690, 7)

In [4]:
# Remove non-English tweets
tweets_df = tweets_df[tweets_df['lang']=='en']
tweets_df.shape

(266468, 7)

In [5]:
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [6]:
STOPWORDS = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [7]:
def clean_text(text, lower=True, lemm=True, tokenization=False, stopwords=STOPWORDS):
    '''Clean raw text'''
    # Lowercase
    if lower:
        text = text.lower()
        
    # Remove stopwords
    if len(stopwords):
        pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
        text = pattern.sub('', text)
    
    # Remove mentions
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    
    # Remove links
    text = re.sub(r'http\S+', '', text)
    
    # Remove & sign
    text = re.sub(r'&amp+', '', text)
    
    # Spacing and filters
    text = re.sub(r"([!\''#$%&()*\+,-./:;<=>?\\\[\]^_`{|}~])", r' \1 ', text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    
    # lemmatization
    if lemm:
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])
        
    if tokenization:
        text = re.split('\W+', text)
        
    return text

In [8]:
# Run tweets through cleaning function
tweets_df['text'] = tweets_df['text'].apply(clean_text)
tweets_df.head()

Unnamed: 0,datetime,tweet_id,text,username,like_count,display_name,lang
0,2022-07-15 23:56:53+00:00,1548094282900221953,government considering creating second state o...,TimesLIVE,85,Times LIVE,en
2,2022-07-15 23:50:41+00:00,1548092722166132738,way fix current energy crisis splitting eskom ...,KingTNgema,0,Born A King,en
3,2022-07-15 23:43:00+00:00,1548090785450434566,awarded pushed zuma brian molefe guptas mine g...,Constitution_94,0,Constitution First 🇿🇦,en
4,2022-07-15 23:40:14+00:00,1548090089334382596,also,katleho_katlii,0,Katleho. 🍫,en
5,2022-07-15 23:39:22+00:00,1548089872287555584,use gold reserve sort eskom sa debt market fac...,mdange39,0,Tupac,en


In [9]:
# Remove tweets with 3 or less words
tweets_df = tweets_df[tweets_df['text'].str.split(' ').str.len() > 3]
tweets_df.shape

(232743, 7)

In [10]:
tweets_df.to_csv('data/tweets_cleaned.csv', index=False)