### Extract meaningful information from the raw tweets JSON


In [17]:
import json
import pandas as pd
from textblob import TextBlob

In [18]:
# Load the raw tweets. The context manager closes the file handle once parsing
# is done, and the explicit UTF-8 encoding keeps emoji / non-ASCII tweet text
# readable regardless of the platform's default locale encoding.
with open("../data/raw_tweets_full.json", "r", encoding="utf-8") as raw_file:
    data = json.load(raw_file)

In [39]:
def get_polarity(score, negative_max=0.2, neutral_max=0.7):
    """ Returns a polarity label based on the polarity score (subjective cut-offs)
    :param score: polarity score returned by TextBlob
    :param negative_max: highest score (inclusive) still labelled "Negative"
    :param neutral_max: highest score (inclusive) still labelled "Neutral"
    :returns: an associated polarity (Positive, Negative, Neutral) """
    # With the default cut-offs a score of exactly 0 is labelled "Negative";
    # pass different thresholds to change where the bands start and end.
    if score <= negative_max:
        return "Negative"
    # Reaching this point already implies score > negative_max.
    if score <= neutral_max:
        return "Neutral"
    return "Positive"

In [40]:
def extract_mentions(mentions):
    """ Build a printable summary of the users mentioned in a tweet
    :param mentions: list of user mentions (from Twitter API)
    :returns: comma-separated entries, each formatted as name (screen_name) """
    formatted = []
    for user in mentions:
        # One entry per mentioned user: display name followed by the handle.
        formatted.append("".join([user['name'], " (", user['screen_name'], ")"]))
    return ", ".join(formatted)

In [27]:
def get_readable_date(twitter_date):
    """ Extract the date in a friendlier format (dd/mm/yyyy)
    :param twitter_date: date returned by Twitter API; expected layout is
        whitespace-separated with the month abbreviation second, the day third
        and the year last, e.g. "Tue Mar 12 14:23:45 +0000 2019"
    :returns: readable date, e.g. "12/03/2019" """
    mapped_months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05',
                     'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10',
                     'Nov': '11', 'Dec': '12'}
    # split() without a separator collapses repeated whitespace and ignores
    # leading/trailing blanks, so the positional fields stay aligned.
    parts = twitter_date.split()
    month, year = mapped_months[parts[1]], parts[-1]
    # Pad single-digit days so the output is always two digits wide.
    day = parts[2].zfill(2)
    return "/".join([day, month, year])

In [33]:
def extract_info(data):
    """ Flatten the raw Twitter API tweets into a table with one row per tweet
    :param data: parsed json content, iterated as a sequence of tweet dicts
    :returns: dataframe holding the user, text, sentiment and engagement
        fields extracted from each tweet """
    columns = ('u_name', 'u_screen_name', 't_date', 't_text', 't_polarity_score',
               't_polarity', 't_subjectivity_score', 'u_followers', 't_retweets',
               't_favorited', 't_mentions')
    # Column-oriented storage: one list per output column, filled tweet by tweet.
    collected = {}
    for column in columns:
        collected[column] = []

    for tweet in data:
        author = tweet['user']
        row = {
            'u_name': author['name'],
            'u_screen_name': author['screen_name'],
            't_date': get_readable_date(tweet['created_at']),
            't_text': tweet['full_text'],
        }

        # Sentiment figures come from TextBlob run on the tweet text.
        blob = TextBlob(row['t_text'])
        row['t_polarity_score'] = blob.polarity
        row['t_subjectivity_score'] = blob.sentiment.subjectivity
        row['t_polarity'] = get_polarity(row['t_polarity_score'])

        # Engagement counters and the users mentioned in the tweet.
        row['u_followers'] = author['followers_count']
        row['t_retweets'] = tweet['retweet_count']
        row['t_favorited'] = tweet['favorite_count']
        row['t_mentions'] = extract_mentions(tweet['entities']['user_mentions'])

        for column, value in row.items():
            collected[column].append(value)

    return pd.DataFrame(collected)

In [34]:
# Run the extraction over the tweets loaded into `data`; the result is a
# pandas DataFrame (see extract_info's return statement).
df = extract_info(data)

In [35]:
# Show a bounded preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,u_name,u_screen_name,t_date,t_text,t_polarity_score,t_polarity,t_subjectivity_score,u_followers,t_retweets,t_favorited,t_mentions
0,Dr Keith Grimes,keithgrimes,12/03/2019,"@cpeedell @marcus_baw @babylonhealth HI Clive,...",0.450000,Neutral,0.700000,5499,0,1,"Clive Peedell (cpeedell), Dr Marcus Baw (marcu..."
1,Sam Dowling,saminus,12/03/2019,As we near the end of our @LondonwideLMCs annu...,0.200000,Negative,0.341667,1417,2,3,"Londonwide LMCs (LondonwideLMCs), Jackieappleb..."
2,Londonwide LMCs,LondonwideLMCs,12/03/2019,We are now on to our third and final long anti...,-0.012500,Negative,0.350000,2076,1,1,"Elliott (ElliottSinger), Jackieapplebeeturner ..."
3,Mark Barley,Chronotrope,12/03/2019,@DrSelvarajah @DrMurphy11 @babylonhealth I mea...,-0.312500,Negative,0.687500,353,0,0,"Selvaseelan Selvarajah (DrSelvarajah), Dr Murp..."
4,Mark Barley,Chronotrope,12/03/2019,@DrSelvarajah @DrMurphy11 @babylonhealth I’d l...,-0.100000,Negative,0.100000,353,0,1,"Selvaseelan Selvarajah (DrSelvarajah), Dr Murp..."
5,Shaun Lintern,ShaunLintern,12/03/2019,Hosting @babylonhealth GP at Hand app could le...,-0.125000,Negative,0.375000,24674,3,3,"Babylon (babylonhealth), Ben Heather (BHeather..."
6,💣 Medical Devices Expert,JoanneD_,12/03/2019,@Meddev_guy @DrMurphy11 @babylonhealth @MHRAde...,0.000000,Negative,0.000000,466,0,0,"Meddev Guy (Meddev_guy), Dr Murphy (DrMurphy11..."
7,SHugar,SusiHarris,12/03/2019,@vanessafrank3 @kieran_walshe @JonAshworth @ba...,0.700000,Neutral,0.600000,48,0,0,"vanessa frank (vanessafrank3), Kieran Walshe (..."
8,Meddev Guy,Meddev_guy,12/03/2019,@JoanneD_ @DrMurphy11 @babylonhealth @MHRAdevi...,0.000000,Negative,0.000000,8,0,0,"Medical Devices Expert 🎳 (JoanneD_), Dr Murphy..."
9,michelle drage,michelledrage,12/03/2019,@RossLydall @LondonwideLMCs @bengoldacre @baby...,0.800000,Positive,1.000000,3646,0,2,"Ross Lydall (RossLydall), Londonwide LMCs (Lon..."


In [36]:
# Persist the extracted table in the same ../data directory as the raw json.
# No index= argument is passed, so pandas' default applies (the row index is
# written out as an unnamed first column).
df.to_csv("../data/dataframe_formatted_results.csv")  # saving to csv