In [10]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [11]:
'''
INSTALL THE DEPENDENCIES
:Pandas - It is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool
:Textblob - It provides a simple API for diving into common natural language processing (NLP) tasks such as sentiment analysis
:Re - The functions in this module let you check if a particular string matches a given pattern
:Geopandas - An open source project to make working with geospatial data in python easier.
:Matplotlib - It's a plotting library
:Shapely - It generates the polygon points from a pair of lat longs to plot in a world map
'''

import geopandas as gpd
import pandas as pd
from textblob import TextBlob
import textblob.download_corpora
import re
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import requests
import plotly.express as px
from shapely.geometry import mapping, shape
from shapely.prepared import prep
from shapely.geometry import Point

# Lower-cased country name -> three-letter code used as the choropleth `locations` value.
# 'unknown' maps to the placeholder code 'WLD'. Looked up with `country_name.lower()`.
country_iso_code_mapping = {
    'unknown': 'WLD', 'aruba': 'ABW', 'afghanistan': 'AFG', 'angola': 'AGO',
    'albania': 'ALB', 'andorra': 'AND', 'united arab emirates': 'ARE', 'argentina': 'ARG',
    'armenia': 'ARM', 'american samoa': 'ASM', 'antigua and barbuda': 'ATG', 'australia': 'AUS',
    'austria': 'AUT', 'azerbaijan': 'AZE', 'burundi': 'BDI', 'belgium': 'BEL',
    'benin': 'BEN', 'burkina faso': 'BFA', 'bangladesh': 'BGD', 'bulgaria': 'BGR',
    'bahrain': 'BHR', 'bahamas, the': 'BHS', 'bosnia and herzegovina': 'BIH', 'belarus': 'BLR',
    'belize': 'BLZ', 'bermuda': 'BMU', 'bolivia': 'BOL', 'brazil': 'BRA',
    'barbados': 'BRB', 'brunei darussalam': 'BRN', 'bhutan': 'BTN', 'botswana': 'BWA',
    'canada': 'CAN', 'switzerland': 'CHE', 'chile': 'CHL', 'china': 'CHN',
    "cote d'ivoire": 'CIV', 'cameroon': 'CMR', 'congo, dem. rep.': 'COD', 'congo, rep.': 'COG',
    'colombia': 'COL', 'comoros': 'COM', 'cabo verde': 'CPV', 'costa rica': 'CRI',
    'cuba': 'CUB', 'curacao': 'CUW', 'cayman islands': 'CYM', 'cyprus': 'CYP',
    'czech republic': 'CZE', 'germany': 'DEU', 'djibouti': 'DJI', 'dominica': 'DMA',
    'denmark': 'DNK', 'dominican republic': 'DOM', 'algeria': 'DZA', 'ecuador': 'ECU',
    'egypt, arab rep.': 'EGY', 'eritrea': 'ERI', 'spain': 'ESP', 'estonia': 'EST',
    'ethiopia': 'ETH', 'finland': 'FIN', 'fiji': 'FJI', 'france': 'FRA',
    'faroe islands': 'FRO', 'gabon': 'GAB', 'united kingdom': 'GBR', 'georgia': 'GEO',
    'ghana': 'GHA', 'gibraltar': 'GIB', 'guinea': 'GIN', 'gambia, the': 'GMB',
    'guinea-bissau': 'GNB', 'greece': 'GRC', 'grenada': 'GRD', 'greenland': 'GRL',
    'guatemala': 'GTM', 'guam': 'GUM', 'guyana': 'GUY', 'hong kong sar, china': 'HKG',
    'honduras': 'HND', 'croatia': 'HRV', 'haiti': 'HTI', 'hungary': 'HUN',
    'indonesia': 'IDN', 'india': 'IND', 'ireland': 'IRL', 'iran, islamic rep.': 'IRN',
    'iraq': 'IRQ', 'iceland': 'ISL', 'israel': 'ISR', 'italy': 'ITA',
    'jamaica': 'JAM', 'jordan': 'JOR', 'japan': 'JPN', 'kazakhstan': 'KAZ',
    'kenya': 'KEN', 'kyrgyz republic': 'KGZ', 'cambodia': 'KHM', 'kiribati': 'KIR',
    'st. kitts and nevis': 'KNA', 'korea, rep.': 'KOR', 'kuwait': 'KWT', 'lao pdr': 'LAO',
    'lebanon': 'LBN', 'liberia': 'LBR', 'libya': 'LBY', 'st. lucia': 'LCA',
    'liechtenstein': 'LIE', 'sri lanka': 'LKA', 'lesotho': 'LSO', 'lithuania': 'LTU',
    'luxembourg': 'LUX', 'latvia': 'LVA', 'morocco': 'MAR', 'monaco': 'MCO',
    'moldova': 'MDA', 'madagascar': 'MDG', 'maldives': 'MDV', 'mexico': 'MEX',
    'marshall islands': 'MHL', 'north macedonia': 'MKD', 'mali': 'MLI', 'malta': 'MLT',
    'myanmar': 'MMR', 'montenegro': 'MNE', 'mongolia': 'MNG', 'northern mariana islands': 'MNP',
    'mozambique': 'MOZ', 'mauritania': 'MRT', 'mauritius': 'MUS', 'malawi': 'MWI',
    'malaysia': 'MYS', 'namibia': 'NAM', 'new caledonia': 'NCL', 'niger': 'NER',
    'nigeria': 'NGA', 'nicaragua': 'NIC', 'netherlands': 'NLD', 'norway': 'NOR',
    'nepal': 'NPL', 'nauru': 'NRU', 'new zealand': 'NZL', 'oman': 'OMN',
    'pakistan': 'PAK', 'panama': 'PAN', 'peru': 'PER', 'philippines': 'PHL',
    'palau': 'PLW', 'papua new guinea': 'PNG', 'poland': 'POL', 'puerto rico': 'PRI',
    "korea, dem. people's rep.": 'PRK', 'portugal': 'PRT', 'paraguay': 'PRY', 'french polynesia': 'PYF',
    'qatar': 'QAT', 'romania': 'ROU', 'russian federation': 'RUS', 'rwanda': 'RWA',
    'saudi arabia': 'SAU', 'sudan': 'SDN', 'senegal': 'SEN', 'singapore': 'SGP',
    'solomon islands': 'SLB', 'sierra leone': 'SLE', 'el salvador': 'SLV', 'san marino': 'SMR',
    'somalia': 'SOM', 'serbia': 'SRB', 'sao tome and principe': 'STP', 'suriname': 'SUR',
    'slovak republic': 'SVK', 'slovenia': 'SVN', 'sweden': 'SWE', 'eswatini': 'SWZ',
    'seychelles': 'SYC', 'syrian arab republic': 'SYR', 'turks and caicos islands': 'TCA', 'chad': 'TCD',
    'togo': 'TGO', 'thailand': 'THA', 'tajikistan': 'TJK', 'turkmenistan': 'TKM',
    'tonga': 'TON', 'trinidad and tobago': 'TTO', 'tunisia': 'TUN', 'turkey': 'TUR',
    'tuvalu': 'TUV', 'tanzania': 'TZA', 'uganda': 'UGA', 'ukraine': 'UKR',
    'uruguay': 'URY', 'united states': 'USA', 'uzbekistan': 'UZB',
    # This key was previously split across a line break inside the string literal.
    'st. vincent and the grenadines': 'VCT',
    'venezuela, rb': 'VEN', 'british virgin islands': 'VGB', 'virgin islands (u.s.)': 'VIR', 'vietnam': 'VNM',
    'vanuatu': 'VUT', 'samoa': 'WSM', 'kosovo': 'XKX', 'yemen, rep.': 'YEM',
    'south africa': 'ZAF', 'zambia': 'ZMB', 'zimbabwe': 'ZWE', 'united states of america': 'USA',
}
'''
USING THE GEOJSON - 
Using the lat long we will find the country name that will be used in plotting choropleth map
'''
# Download the country outlines and index them by the feature's "ADMIN" name.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP failure here rather than as a confusing
# JSON-decoding or KeyError further down.
geojson_response = requests.get(
    "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson",
    timeout=60,
)
geojson_response.raise_for_status()
data = geojson_response.json()

# name -> prepared geometry; consulted by get_country() for every tweet.
countries = {}
for feature in data["features"]:
    geom = feature["geometry"]
    country = feature["properties"]["ADMIN"]
    countries[country] = prep(shape(geom))

def get_country(lon, lat):
    """Return the name of the first entry in `countries` whose geometry contains the point.

    The point is built as Point(lon, lat), i.e. longitude first.
    Returns the string "unknown" when no geometry contains it.
    """
    location = Point(lon, lat)
    matches = (
        name
        for name, outline in countries.items()
        if outline.contains(location)
    )
    # next() stops at the first hit, mirroring an early return from a loop.
    return next(matches, "unknown")

'''
Task 2.1: Data pre-processing
        : Drop the rows with no lat-long information available
        : Removing #,*,&,^ any other special characters and urls from the tweets
        
'''
##Read the DATASET csv file
df = pd.read_csv('CIS7030_S1_21_Task2_Data.csv')
##Consider only data rows that has LAT & LONG values available
df = df.dropna(subset=['lat', 'long'])
df = df.reset_index(drop=True)

##Resolve every tweet's coordinates to a country name and a three-letter code.
##Each row appends exactly one entry to BOTH lists, so they always stay aligned
##with df and can be assigned as columns afterwards.
country_names_from_lat_long = []
country_codes = []
for lat, lon, reported_country in zip(df['lat'], df['long'], df['country']):
  # Defaults used when the point matches no polygon or the lookup fails.
  country_name = 'unknown'
  country_code = 'WLD'
  try:
    located_name = get_country(lon, lat)
    if located_name == 'unknown':
      # No polygon matched: fall back to the dataset's own country column,
      # but only for rows it labels as the United States.
      if reported_country == 'United States of America':
        country_name = 'United States of America'
        country_code = 'USA'
    else:
      country_name = located_name
      # NOTE(review): .get() yields None for names missing from the mapping;
      # pandas groupby excludes NA keys by default, so such rows would be left
      # out of the per-country aggregates - confirm this is intended.
      country_code = country_iso_code_mapping.get(located_name.lower())
  except Exception:
    # Best effort: a failed lookup is recorded as unknown instead of aborting the run.
    # (The previous handler appended to `country_code`, a string, and crashed itself.)
    country_name = 'unknown'
    country_code = 'WLD'
  country_names_from_lat_long.append(country_name)
  country_codes.append(country_code)
df['country_from_lat_long'] = country_names_from_lat_long
df['country_code'] = country_codes

##Method to remove the special characters and urls from the tweets
def filter_special_chars_url(text):
    """Return `text` with URLs, @mentions and special characters removed.

    Processing order:
      1. URLs of the form ``scheme://...`` are replaced by a space. This has to
         run first: once ':' and '/' are stripped a URL can no longer be recognised.
      2. @mentions (letters, digits, underscore) and every character other than
         ASCII letters, digits, space and tab are replaced by a space.
      3. Runs of whitespace are collapsed to single spaces and the ends trimmed.

    Non-string input (for example a missing value) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    ##This regular expression removes urls from the text
    text = re.sub(r"\w+://\S+", ' ', text)
    ##This regular expression removes @mentions and the special characters from the text string
    text = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])", ' ', text)
    return ' '.join(text.split())

# Clean every tweet in place; the function is passed directly rather than wrapped in a lambda.
df['tweet'] = df['tweet'].apply(filter_special_chars_url)

###Now the dataset tweets are pre-processed and can be used in further tasks


In [12]:
'''
Task 2.2: Polarity analysis
        : Find polarity of all tweets
        : Find average of polarity scores for tweets from a same location
        : Plot the results into a visualisation
'''
##TextBlob documentation: https://textblob.readthedocs.io/en/dev/
##Score every cleaned tweet with TextBlob's sentiment polarity
df['polarity_score'] = df['tweet'].apply(
    lambda tweet_text: TextBlob(tweet_text).sentiment.polarity
)

##Mean polarity per US state (keyed on state_code)
usa_df = df.groupby('state_code', as_index=False)['polarity_score'].mean()

##Mean polarity per country for the whole world
country_keys = ['country_from_lat_long', 'country_code']
polarity_location_df = df.groupby(country_keys, as_index=False)['polarity_score'].mean()

##Green for a mean above zero, red otherwise
colors = [
    'green' if mean_score > 0 else 'red'
    for mean_score in polarity_location_df['polarity_score']
]

##Bar chart of the per-country means
polarity_bars = go.Bar(
    x=polarity_location_df['country_from_lat_long'],
    y=polarity_location_df['polarity_score'],
    marker_color=colors,
)
fig1 = go.Figure(data=[polarity_bars])
fig1.update_layout(title_text='Polarity Analysis', yaxis_title='Polarity Score Mean')

### Polarity : Polarity tells us whether the opinions formed by people about a particular thing are positive, negative or neutral ###
### Here you can see from the graph that countries like Georgia and Mongolia have extremely high polarity while countries like Spain and Costa Rica have extremely negative polarity values ###



In [13]:
##World map (choropleth) of the mean polarity score per country
world_polarity_trace = go.Choropleth(
    locations=polarity_location_df['country_code'],
    z=polarity_location_df['polarity_score'],
    text=polarity_location_df['country_from_lat_long'],
    colorscale='thermal',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='black',
    marker_line_width=0.5,
    colorbar_tickprefix='',
    colorbar_title='Polarity Score Average',
)

# Map styling: no frame, no coastlines, equirectangular projection.
world_polarity_geo = {
    'showframe': False,
    'showcoastlines': False,
    'projection_type': 'equirectangular',
}
# A single annotation positioned in paper coordinates; it carries no text.
world_polarity_annotation = {
    'x': 0.55,
    'y': 0.1,
    'xref': 'paper',
    'yref': 'paper',
    'showarrow': False,
}

fig1 = go.Figure(data=world_polarity_trace)
fig1.update_layout(
    title_text='Polarity Analysis',
    geo=world_polarity_geo,
    annotations=[world_polarity_annotation],
)

In [14]:
###POLARITY ANALYSIS STATE WISE - UNITED STATES OF AMERICA

us_polarity_trace = go.Choropleth(
    locations=usa_df['state_code'],   # state codes to colour
    z=usa_df['polarity_score'],       # mean polarity drives the colour
    locationmode='USA-states',        # interpret `locations` as US states
    colorscale='agsunset',
    colorbar_title="Polarity Average",
)
fig = go.Figure(data=us_polarity_trace)

# Limit the map scope to the USA.
fig.update_layout(title_text='US State Wise Polarity Mean Average', geo_scope='usa')

## If we see the tweets polarity within USA, then the eastern region of USA has more negative polarity as compared to western region. The polarity average is highest in the south and north of America ###
## Minnesota state has the highest positive polarity average of 0.3 while Arkansas has highest negative polarity average of -0.181 amongst all states ## 

In [15]:
'''
Task 2.3: Subjectivity analysis
        : Find subjectiveness of all tweets
        : Find average of subjective scores for tweets from a same country
        : Plot the results into a visualisation
'''

##Score every cleaned tweet with TextBlob's sentiment subjectivity
df['subjectivity_score'] = df['tweet'].apply(
    lambda tweet_text: TextBlob(tweet_text).sentiment.subjectivity
)

##Mean subjectivity per US state (keyed on state_code)
usa_df_2 = df.groupby('state_code', as_index=False)['subjectivity_score'].mean()

##Mean subjectivity per country
subjectivity_keys = ['country_from_lat_long', 'country_code']
subjectivity_location_df = df.groupby(subjectivity_keys, as_index=False)['subjectivity_score'].mean()

##Green for a mean above zero, red otherwise
colors2 = [
    'green' if mean_score > 0 else 'red'
    for mean_score in subjectivity_location_df['subjectivity_score']
]

##Bar chart of the per-country means
subjectivity_bars = go.Bar(
    x=subjectivity_location_df['country_from_lat_long'],
    y=subjectivity_location_df['subjectivity_score'],
    marker_color=colors2,
)
fig2 = go.Figure(data=[subjectivity_bars])
fig2.update_layout(title_text='Average Subjectivity Analysis', yaxis_title='Subjectivity Score Mean')

## Subjectivity : It analyses the text (opinions) of tweets and tells us how much of that opinion is based on factual data and how much is the person's personal opinion in that tweet ##
## Georgia and New Zealand have the highest average subjectivity while Japan and UK have the lowest average subjectivity scores amongst the given countries ##

In [16]:
##World map (choropleth) of the mean subjectivity score per country
world_subjectivity_trace = go.Choropleth(
    locations=subjectivity_location_df['country_code'],
    z=subjectivity_location_df['subjectivity_score'],
    text=subjectivity_location_df['country_from_lat_long'],
    colorscale='thermal',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='black',
    marker_line_width=0.5,
    colorbar_tickprefix='',
    colorbar_title='Subjectivity Score Average',
)

# Map styling: no frame, no coastlines, equirectangular projection.
world_subjectivity_geo = {
    'showframe': False,
    'showcoastlines': False,
    'projection_type': 'equirectangular',
}
# A single annotation positioned in paper coordinates; it carries no text.
world_subjectivity_annotation = {
    'x': 0.55,
    'y': 0.1,
    'xref': 'paper',
    'yref': 'paper',
    'showarrow': False,
}

fig1 = go.Figure(data=world_subjectivity_trace)
fig1.update_layout(
    title_text='Subjectivity Analysis',
    geo=world_subjectivity_geo,
    annotations=[world_subjectivity_annotation],
)
## Mexico here has the least subjectivity average score while Mongolia has the highest subjectivity score average ##

In [17]:
###SUBJECTIVITY ANALYSIS STATE WISE - UNITED STATES OF AMERICA

us_subjectivity_trace = go.Choropleth(
    locations=usa_df_2['state_code'],      # state codes to colour
    z=usa_df_2['subjectivity_score'],      # mean subjectivity drives the colour
    locationmode='USA-states',             # interpret `locations` as US states
    colorscale='aggrnyl',
    colorbar_title="Subjectivity Average",
)
fig = go.Figure(data=us_subjectivity_trace)

# Limit the map scope to the USA.
fig.update_layout(title_text='US State Wise Subjectivity Mean Average', geo_scope='usa')

## North Carolina here has the highest subjectivity average score of 0.8 and New Mexico has the lowest subjectivity average score of 0 among the US states ##


### TASK 2.4: Interpretation
The reviews are extracted from twitter relevant to Joe Biden during the election
run for President of USA. The reviews are then pre-processed. The unwanted special
characters, and urls in the tweet that are irrelevant to the context are removed.
After the review set is processed, sentiment analysis is done on the dataset,
which determines whether each tweet is in support of Joe Biden or against him.
The extent of polarity and the subjectiveness of the tweet is determined using the
TextBlob library. It is a natural language processing pipeline that tokenizes the
review and then assigns a score to the words in order to calculate the overall
sentiment of a tweet. Opinion mining is very significant in the political field.
It can be used to detect the consistency and inconsistency of actions and policies
made by the government or being talked about. For any candidate participating in
elections, it is useful to understand general public opinion. If the opinions are
positive in context then it's a good thing for the candidate's election run. Also the
negative reviews must be taken seriously. The candidate can determine the issues from
such reviews and can work to rectify those issues. The location-specific analysis
provides wide scope for making strategies: how a particular area can be targeted and
what the buzz in that area is. Reviews from outside the nation are also worth considering, as
international opinions matter.
In the above analysis of the dataset, it gets clear that from various parts of the world, 
Joe Biden has a positive reach. The reviews are also highly subjective. Joe Biden needs
to focus on the eastern part of the USA, as it had more negative polarity, while in the north and
south regions of the USA he needs to spend only a little time, as they already support him. Tweets
from countries outside the USA do not impact Joe Biden's election campaign, so he can ignore them.
However, countries neighbouring the USA can impact his election campaign, so Joe Biden needs to
focus his efforts towards them as well.
