In [1]:
import pandas as pd
import numpy as np
import re
from googlemaps import googlemaps
from tqdm import tqdm

## Data Visualisation Assignment 3 - Twitter Data
Student Id: 17230755
#### Data Preprocessing

In [4]:
# Location finder
def location(Address):
    '''This function is used to get the most accurate latitude and longitude for a given address. 
    However, if no address found it will writen nan integer type'''
    Address = str(Address)
    Address = re.sub('[^a-zA-Z0-9 ,]+', '', Address)
    gmaps = googlemaps.Client(key='YourKey')
    
    #The below line combines the park name, Area and Dublin to find the reasonable accurate results using geopy (google)
    if 'Ireland' not in Address:
        Address = Address+' '+'Ireland'
    
    try:
        loc = gmaps.geocode(Address)[0]['geometry']['location'] # This fetches the latitude and longitude given the address
        
        return loc['lat'], loc['lng']
    except:
        return np.nan, np.nan # This block return Na value if geo finder does not return any value


In [5]:
def dataPreProcessing():
    tweets = pd.read_csv("TwitterData.csv", encoding='ISO-8859-1', index_col=0)
    userInfo = pd.read_csv("UserInfo.csv", encoding='ISO-8859-1', index_col=0)
    userInfo.dropna(subset = ['location'], inplace=True)
    new_df = pd.merge(tweets, userInfo, on='screenName')
    df = new_df[['text', 'created_x', 'id_x', 'id_y','description', 'name', 'screenName', 'retweetCount', 
                 'friendsCount','followersCount', 'location', 'latitude', 'longitude', 'statusSource', 'verified']]
    df = df[df.location.str.contains("London") == False]
    df = df[df.location.str.contains("Amsterdam, NL") == False]
    
        
    for index, row in tqdm(df.iterrows()):
        if np.isnan(row['latitude']) and np.isnan(row['longitude']):
            lat, lng = location(row['location'])
            df.loc[[index],'latitude'] = lat
            df.loc[[index],'longitude'] = lng

    
    df.dropna(subset = ['latitude'], inplace=True)
    return df


df = dataPreProcessing()

489it [01:49,  4.46it/s]


In [6]:
def finalProcessing():
    x = df.drop_duplicates('screenName')
    y = dict(df.groupby('screenName')['retweetCount'].sum())
    z = pd.DataFrame(list(y.items()), columns=['screenName', 'tweetCount'])
    final = pd.merge(x, z, on='screenName')[['text', 'created_x', 'id_x', 'id_y','description', 'name', 'screenName', 'tweetCount', 
                     'friendsCount','followersCount', 'location', 'latitude', 'longitude', 'statusSource', 'verified']]
    return final

final = finalProcessing()


In [7]:
li = final.created_x
dt = []
tim = []
for i in range(len(li)):
    x = li[i].split(' ')
    dt.append(x[0])
    tim.append(x[1])

tim = pd.Series(tim)
dt = pd.Series(dt)
final['Date'] = dt.values
final['Time'] = tim.values

In [8]:
tweetsPerDay = dict(final.groupby('Date')['tweetCount'].sum())
tweetsPerDay = pd.DataFrame(list(tweetsPerDay.items()), columns=['dt', 'TotCount'])

In [22]:
final.to_csv('tweets.csv', index=False)
tweetsPerDay.to_csv('tweetsPerDay.csv', index=False)

In [54]:
poptweets = dict(df[['text', 'retweetCount']].groupby('text')['retweetCount'].sum())
poptweets = pd.DataFrame(list(poptweets.items()), columns=['text', 'Popularity']).sort_values('Popularity', ascending=False)[0:20]
poptweets.to_csv('poptweets.csv', index=False)

In [17]:
fianl = pd.read_csv('tweets.csv', encoding='ISO-8859-1')
t = fianl

In [18]:
for index, row in t.iterrows():
    if "donegal" in row['location'].lower():
        t.loc[[index],'location'] = "donegal"
    elif "galway" in row['location'].lower():
         t.loc[[index],'location'] = "galway"
    elif "carlow" in row['location'].lower():
         t.loc[[index],'location'] = "carlow" 
    elif "limerick" in row['location'].lower():
         t.loc[[index],'location'] = "limerick"
    elif "dublin" in row['location'].lower():
         t.loc[[index],'location'] = "dublin"
    elif "sligo" in row['location'].lower():
         t.loc[[index],'location'] = "sligo"
    elif "tipperary" in row['location'].lower():
         t.loc[[index],'location'] = "tipperary"
    elif "clare" in row['location'].lower():
         t.loc[[index],'location'] = "clare"
    elif "Wicklow" in row['location'].lower():
         t.loc[[index],'location'] = "Wicklow"
    elif "cork" in row['location'].lower():
         t.loc[[index],'location'] = "cork"
    elif "kerry" in row['location'].lower():
         t.loc[[index],'location'] = "kerry"
    elif "fingal" in row['location'].lower():
         t.loc[[index],'location'] = "fingal"
    else:
        t.loc[[index],'location'] = "others"

In [15]:
t = pd.DataFrame(t.groupby('location').size())

In [12]:
t = pd.read_csv('count.csv')

In [102]:
t['Counts'] = t['Counts'].apply(lambda x: x*100/202)

In [16]:
t.to_csv('count.csv')

In [48]:
final = pd.read_csv('tweets.csv', encoding='ISO-8859-1')

In [7]:
focount = final[['screenName', 'followersCount']]

In [9]:
focount.columns = ['name', 'weight']

In [14]:
focount.to_csv('followers.csv', index=False)

In [61]:
top = final.groupby('text').agg({'tweetCount':'sum', 'latitude':'first', 'longitude':'first', 'text':'first'})
top = top.sort_values('tweetCount', ascending=False)[0:50]

In [63]:
top.to_csv('top50tweets.csv', index=False)

In [22]:
t = pd.DataFrame(t.groupby(['Date', 'location'])['tweetCount'].sum())
t.to_csv('count.csv')

In [23]:
t

Unnamed: 0_level_0,Unnamed: 1_level_0,tweetCount
Date,location,Unnamed: 2_level_1
2018-03-07,carlow,5
2018-03-07,clare,36
2018-03-07,cork,5
2018-03-07,dublin,162
2018-03-07,fingal,0
2018-03-07,galway,0
2018-03-07,others,164
2018-03-08,cork,11
2018-03-08,dublin,502
2018-03-08,galway,36
