In [4]:
!pip install TextBlob



In [5]:
import re
import json
import pandas as pd
from textblob import TextBlob, sentiments

In [6]:
def read_json(json_file: str) -> tuple:
    """
    Read a line-delimited JSON file (one JSON document per line) into a list.

    Args:
    -----
    json_file: str - path of a json file

    Returns
    -------
    tuple of (number of parsed records, list of the parsed JSON objects)

    Raises
    ------
    json.JSONDecodeError if a non-blank line is not valid JSON
    """

    tweets_data = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse. JSON text is UTF-8, so do not rely on the platform default.
    with open(json_file, "r", encoding="utf-8") as handle:
        for line in handle:
            # Blank lines (e.g. a trailing empty line) are not JSON documents.
            if not line.strip():
                continue
            tweets_data.append(json.loads(line))

    return len(tweets_data), tweets_data

In [7]:
class TweetDfExtractor:
    """
    Parse a list of tweet JSON objects (dicts) into a pandas dataframe.

    Every ``find_*`` method (and ``is_sensitive``) walks ``tweets_list`` and
    returns one value per tweet, in input order; ``get_tweet_df`` zips those
    per-tweet lists into the columns of a dataframe.

    Return
    ------
    dataframe
    """

    def __init__(self, tweets_list):
        # tweets_list: iterable of dicts, one per tweet, re-read by every method.
        self.tweets_list = tweets_list

    def find_statuses_count(self) -> list:
        """Return ``user.statuses_count`` for every tweet."""
        statuses_count = [x["user"]["statuses_count"] for x in self.tweets_list]
        return statuses_count

    def find_full_text(self) -> list:
        """
        Return the most complete text available for every tweet.

        Lookup order: the retweeted status' extended full text, then the
        tweet's own extended full text, then the plain ``text`` field.
        """
        texts = []
        for x in self.tweets_list:
            try:
                text = x["retweeted_status"]["extended_tweet"]["full_text"]
            except KeyError:
                try:
                    # Non-retweets carry their untruncated text here.
                    text = x["extended_tweet"]["full_text"]
                except KeyError:
                    text = x["text"]
            texts.append(text)
        return texts

    def find_clean_text(self) -> list:
        """
        Return the full text with every character outside letters, digits,
        ``#``, ``@``, whitespace, ``’``, ``,`` and ``_`` removed, and each run
        of whitespace collapsed to a single space.
        """
        # Raw strings: "\s" is a regex class, not a Python string escape.
        clean_text = [re.sub(r"[^a-zA-Z0-9#@\s’,_]", "", text) for text in self.find_full_text()]
        clean_text = [re.sub(r"\s+", " ", text) for text in clean_text]
        return clean_text

    def find_sentiments(self, text) -> tuple:
        """
        Score every string in ``text`` with TextBlob.

        Returns a tuple ``(polarity, subjectivity, sentiment)`` of three
        parallel lists; ``sentiment`` is "positive", "negative" or "neutral"
        according to the sign of the polarity.
        """
        def text_category(p):
            """
            converts polarity into sentiment category
            """
            if p > 0:
                return "positive"
            elif p < 0:
                return "negative"
            else:
                return "neutral"

        polarity = []
        subjectivity = []
        for x in text:
            # Analyse each text once; .sentiment carries both scores.
            scores = TextBlob(x).sentiment
            polarity.append(scores.polarity)
            subjectivity.append(scores.subjectivity)
        sentiment = [text_category(x) for x in polarity]
        return polarity, subjectivity, sentiment

    def find_created_time(self) -> list:
        """Return the ``created_at`` field of every tweet."""
        created_at = [x["created_at"] for x in self.tweets_list]
        return created_at

    def find_source(self) -> list:
        """Return the ``source`` field of every tweet."""
        source = [x["source"] for x in self.tweets_list]
        return source

    def find_screen_name(self) -> list:
        """Return ``user.screen_name`` for every tweet."""
        screen_name = [x["user"]["screen_name"] for x in self.tweets_list]
        return screen_name

    def find_followers_count(self) -> list:
        """Return ``user.followers_count`` for every tweet."""
        followers_count = [x["user"]["followers_count"] for x in self.tweets_list]
        return followers_count

    def find_friends_count(self) -> list:
        """Return ``user.friends_count`` for every tweet."""
        friends_count = [x["user"]["friends_count"] for x in self.tweets_list]
        return friends_count

    def is_sensitive(self) -> list:
        """
        Return the ``possibly_sensitive`` flag for every tweet.

        The retweeted status' flag is preferred; otherwise the tweet's own
        flag is used (same fallback order as the favourite/retweet counts).
        ``None`` when neither is present.
        """
        is_sensitive = []
        for x in self.tweets_list:
            try:
                value = x["retweeted_status"]["possibly_sensitive"]
            except KeyError:
                value = x.get("possibly_sensitive")
            is_sensitive.append(value)
        return is_sensitive

    def find_favourite_count(self) -> list:
        """
        Return the favourite count for every tweet, preferring the retweeted
        status' count over the tweet's own.
        """
        favourite_count = []
        for x in self.tweets_list:
            try:
                value = x["retweeted_status"]["favorite_count"]
            except KeyError:
                value = x["favorite_count"]
            favourite_count.append(value)
        return favourite_count

    def find_retweet_count(self) -> list:
        """
        Return the retweet count for every tweet, preferring the retweeted
        status' count over the tweet's own.
        """
        retweet_count = []
        for x in self.tweets_list:
            try:
                value = x["retweeted_status"]["retweet_count"]
            except KeyError:
                value = x["retweet_count"]
            retweet_count.append(value)
        return retweet_count

    def find_hashtags(self) -> list:
        """
        Return, per tweet, the lower-cased hashtags found in the clean text,
        joined by single spaces; a single space when there are none.
        """
        hashtags = []
        for text in self.find_clean_text():
            value = " ".join(re.findall("(#[A-Za-z]+[A-Za-z0-9_-]+)", str(text).lower()))
            if value == "":
                value = " "
            hashtags.append(value)
        return hashtags

    def find_mentions(self) -> list:
        """
        Return, per tweet, the lower-cased @mentions found in the clean text,
        joined by single spaces; a single space when there are none.
        """
        mentions = []
        for text in self.find_clean_text():
            value = " ".join(re.findall("(@[A-Za-z0-9_]+)", str(text).lower()))
            if value == "":
                value = " "
            mentions.append(value)
        return mentions

    def find_location(self) -> list:
        """
        Return ``user.location`` for every tweet, or ``None`` when the user
        object is not subscriptable or has no ``location`` key.
        """
        location = []
        for x in self.tweets_list:
            try:
                value = x["user"]["location"]
            except (KeyError, TypeError):
                value = None
            location.append(value)
        return location

    def find_lang(self) -> list:
        """Return the ``lang`` field of every tweet."""
        lang = [x["lang"] for x in self.tweets_list]
        return lang

    def get_tweet_df(self, save=False) -> pd.DataFrame:
        """
        Build the tweet dataframe, one row per tweet.

        required column to be generated you should be creative and add more features

        Args:
        -----
        save: bool - when true, also write the dataframe to
            "processed_tweet_data.csv" (without the index) and print a
            confirmation message

        Returns
        -------
        dataframe with the columns listed in ``columns`` below
        """

        columns = ["created_at", "source", "original_text", "clean_text", "sentiment", "polarity", "subjectivity", "lang", "favorite_count", "retweet_count",
                   "original_author", "followers_count", "friends_count", "possibly_sensitive", "hashtags", "user_mentions", "place"]

        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        clean_text = self.find_clean_text()
        # Sentiment is scored on the original text, not the cleaned text.
        polarity, subjectivity, sentiment = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        # The zip order must match the order of `columns` above.
        data = zip(created_at, source, text, clean_text, sentiment, polarity, subjectivity, lang, fav_count, retweet_count,
                   screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location)
        df = pd.DataFrame(data=data, columns=columns)

        if save:
            df.to_csv("processed_tweet_data.csv", index=False)
            print("File Successfully Saved.!!!")

        return df


In [13]:

if __name__ == "__main__":
    # required column to be generated you should be creative and add more features
    # NOTE: this list is not used below; get_tweet_df() takes no column argument
    # and builds its own column list (which has no "screen_count" entry).
    columns = ["created_at", "source", "original_text", "clean_text", "sentiment", "polarity", "subjectivity", "lang", "favorite_count", "retweet_count",
               "original_author", "screen_count", "followers_count", "friends_count", "possibly_sensitive", "hashtags", "user_mentions", "place"]
    # Raw string: "\w", "\d" and "\c" are invalid escape sequences in a normal
    # string literal. The resulting path is unchanged.
    _, tweet_list = read_json(r"D:\work\data\covid19.json")
    tweet = TweetDfExtractor(tweet_list)
    tweet_df = tweet.get_tweet_df()

In [15]:
# Preview the first rows of the extracted tweet dataframe.
tweet_df.head()

Unnamed: 0,created_at,source,original_text,clean_text,sentiment,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,Fri Jun 18 17:55:49 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...","🚨Africa is ""in the midst of a full-blown third...",Africa is in the midst of a fullblown third wa...,positive,0.166667,0.188889,en,548,612,ketuesriche,551,351,False,,@whoafro @jriggers,Mass
1,Fri Jun 18 17:55:59 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...","Dr Moeti is head of WHO in Africa, and one of ...","Dr Moeti is head of WHO in Africa, and one of ...",positive,0.133333,0.455556,en,195,92,Grid1949,66,92,False,,,"Edinburgh, Scotland"
2,Fri Jun 18 17:56:07 +0000 2021,"<a href=""http://twitter.com/download/iphone"" r...",Thank you @research2note for creating this ama...,Thank you @research2note for creating this ama...,positive,0.316667,0.483333,en,2,1,LeeTomlinson8,1195,1176,,#red4research,@research2note @nhsrdforum,
3,Fri Jun 18 17:56:10 +0000 2021,"<a href=""https://mobile.twitter.com"" rel=""nofo...","Former Pfizer VP and Virologist, Dr. Michael Y...","Former Pfizer VP and Virologist, Dr Michael Ye...",positive,0.086111,0.197222,en,1580,899,RIPNY08,2666,2704,False,#covid19,,
4,Fri Jun 18 17:56:20 +0000 2021,"<a href=""http://twitter.com/download/android"" ...",I think it’s important that we don’t sell COVA...,I think it’s important that we don’t sell COVA...,positive,0.28,0.62,en,72,20,pash22,28250,30819,,,@texaschildrens @biological_e,United Kingdom
