# *Covid - 19 Twitter Analysis*

In [1]:
# Importing Libraries
import csv
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

## *Text Processing*
### Text Processing Steps:
- Converting Upper or title case letters to lower case
- Removing Escape characters such as new line
- Removing Hyperlinks
- Removing Punctations
- Removing Numbers
- Removing stop words (a, an, the etc.)
- Removing profane words if neccessary
- Stemming or lemmatization

In [2]:
# Cleaning lines

# Collects hastags used in tweets
hashtags = dict()
def removing_usernames_and_collecting_hashtags(line):
    data = line.split()
    if len(data) < 1:
        return line
    
    for item in data[1:] if data[0][0] == '@' else data: # Because after spliiting if userid present then it will always be at index 0 and will be key
        if item[0] == '#':
            if item in hashtags:
                hashtags[item] += 1
            else:
                hashtags[item] = 1
        elif item[0] == '@':
            data.remove(item)
    return " ".join(data)

def clean_line(line):
    line = line.lower().encode('ascii', errors = 'ignore').decode() # Converting to lower case will remove emojis [pure text]
    line = " ".join(line.strip().split()) # Split will automatically remove escape characters
    
    # Removing links
    line = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', line)
    
    # Collecting and removing usernames and hashtags
    line = removing_usernames_and_collecting_hashtags(line)
    
    # Removing Punctuations except @ as only one username will be present
    line = line.translate(str.maketrans('', '', '!"$%&\'()*+,-./:;#<=>?[\\]^_`{|}~'))
    
    # Removing Numbers
    line = re.sub(r'\d+', '', line)
    
    """
    # Removing emojis
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030""]+", re.UNICODE)
    
    line = regrex_pattern.sub(r'',line)
    """

    # Removing stopwords
    stopword = stopwords.words('english')
    word_tokens = line.split()
    line_words = [word for word in word_tokens if word not in stopword]
    
    # Removing Profanity Words
    profanity = set()
    with open('Profanity.txt', mode = 'r') as text_profanity:
        reader = text_profanity.readlines()
        for word in reader:
            profanity.add(word.replace('\n', ''))
    line_words = [word for word in line_words if word not in profanity]
    
    # Stemming
    #line_words = [PorterStemmer().stem(word) for word in line_words]
    
    # Lemmatize
    line_words = [WordNetLemmatizer().lemmatize(word, pos = 'v') for word in line_words] # lemmatize parts of speech(pos)
    
    line = " ".join(line_words)
    return line    

In [3]:
# Use this code cell if file is downloaded into machine and dont want to use file on server

# Reading data present in form of tweets in tweets.txt

# Tweet_with_id for tweet analysis
tweet_with_id = dict()

# Tweet with no _id for sentiment and other purposes
tweet_with_noid = []

with open('covid19.txt', mode = 'r', encoding = 'utf8') as text_file:
    reader = text_file.readlines() # Reading Lines
    for line in reader:
        if line == '\n':
            continue
        line = clean_line(line) # Cleaning line by calling function
        if line == '': # If empty string
            continue

        if line[0] == '@': # Getting the user_id of tweet
            words = line.split()
            if len(words) <= 1: # If user only posted link or non string, if len = 1, means only userid present
                continue
            tweet_with_id[words[0]] = line.replace(words[0], '', 1).strip() # Stripping whitespaces in tweets
        else:
            tweet_with_noid.append(line) # No need of stripping

"\n# Reading data present in form of tweets in tweets.txt\n\n# Tweet_with_id for tweet analysis\ntweet_with_id = dict()\n\n# Tweet with no _id for sentiment and other purposes\ntweet_with_noid = []\n\nwith open('covid19.txt', mode = 'r', encoding = 'utf8') as text_file:\n    reader = text_file.readlines() # Reading Lines\n    for line in reader:\n        if line == '\n':\n            continue\n        line = clean_line(line) # Cleaning line by calling function\n        if line == '': # If empty string\n            continue\n\n        if line[0] == '@': # Getting the user_id of tweet\n            words = line.split()\n            if len(words) <= 1: # If user only posted link or non string, if len = 1, means only userid present\n                continue\n            tweet_with_id[words[0]] = line.replace(words[0], '', 1).strip() # Stripping whitespaces in tweets\n        else:\n            tweet_with_noid.append(line) # No need of stripping\n"

In [4]:
"""
# Use this code cell if file available on server and not on machine and properties and name of file match in the code
# Reading data present in form of tweets in tweets.txt

# Tweet_with_id for tweet analysis
tweet_with_id = dict()

# Tweet with no _id for sentiment and other purposes
tweet_with_noid = []

# Collecting zip file from url
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
# or: requests.get(url).content

resp = urlopen("https://spotleai.sgp1.digitaloceanspaces.com/course/zip/covid19.txt.zip")
zipfile = ZipFile(BytesIO(resp.read()))

for line in zipfile.open('covid19.txt').readlines():
    line = line.decode('utf-8')
    if line == '\n':
        continue
    line = clean_line(line) # Cleaning line by calling function
    if line == '': # If empty string
        continue

    if line[0] == '@': # Getting the user_id of tweet
        words = line.split()
        if len(words) <= 1: # If user only posted link or non string, if len = 1, means only userid present
            continue
        tweet_with_id[words[0]] = line.replace(words[0], '', 1).strip() # Stripping whitespaces in tweets
    else:
        tweet_with_noid.append(line) # No need of stripping
"""

In [5]:
# Number of hashtags
len(hashtags)

8979

In [6]:
# Tweets with userid
len(tweet_with_id.items())

3961

In [7]:
# Tweets with no userid
len(tweet_with_noid)

279724

- ***Strings will not have escape characters***
- ***Strings will not have whitespaces***
- ***Strings will not have any links***
- ***Strings will not have any emojis***
- ***Strings will not have any punctuations except # and @ because we need them later.***
- ***All text processing steps are done.***

In [8]:
# Storing cleaned data into machine
with open('cleaned_data_with_id.csv', 'w', encoding = 'utf8') as filehandle:
    writer = csv.writer(filehandle)
    for key, value in tweet_with_id.items():
        #key = str.encode(key).decode('utf-8')
        #value = str.encode(value).decode('utf-8')
        writer.writerow([key, value])
        

with open('cleaned_data_with_no_id.txt', 'w', encoding = 'utf8') as filehandle:
    filehandle.writelines("%s\n" % place for place in tweet_with_noid)

# Storing hashtags
with open('hashtags.csv', 'w', encoding = 'utf8') as filehandle:
    writer = csv.writer(filehandle)
    for key, value in hashtags.items():
        #key = str.encode(key).decode('utf-8')
        #value = str.encode(value).decode('utf-8')
        writer.writerow([key, value])

In [9]:
# Loading Cleaned Data
df = pd.read_csv('cleaned_data_with_id.csv', header = None, names = ['userid', 'tweet'])

In [10]:
# Sample Data
df.sample(10)

Unnamed: 0,userid,tweet
1802,@michaelcoudrey,proof fauci deep state hat agenda use covid pr...
2458,@uberbratwurst,puzzle isnt wordcorona virus anything common c...
1955,@godrejsecure,first fix product problems customer issue talk...
3323,@lakshmipriya,chamchas know covid stay cant stay lockdown fo...
72,@actornikhil,film satya also get feature cnn ibns greatest ...
1334,@vitalvegas,dr birx say flu call corona wish video ive see...
2135,@marcellelouise,thats fineso corona put whats kill themnot bla...
3806,@mytruthhurts,shorter scooby dont care many people get sick ...
710,@karlibarnett,include entire world look forward whole covid ...
607,@scottpresler,hopefully still enough november unfortunately ...
