# Importation of the necessary libraries

## Necessary libraries for data analysis

In [1]:
import pandas as pd
import numpy as np

## Necessary libraries for data visualization

In [2]:
import matplotlib.pyplot as plt

In [3]:
#import folium
#from folium.plugins import HeatMapWithTime, TimestampedGeoJson
#
#import seaborn as sns

# Preliminary investigation of the primary data set

In [4]:
# URL: https://www.kaggle.com/gpreda/covid19-tweets
covid19_tweets = pd.read_csv("covid19_tweets.csv")

covid19_tweets.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [5]:
covid19_tweets.shape

(179108, 13)

In [6]:
covid19_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179108 entries, 0 to 179107
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         179108 non-null  object
 1   user_location     142337 non-null  object
 2   user_description  168822 non-null  object
 3   user_created      179108 non-null  object
 4   user_followers    179108 non-null  int64 
 5   user_friends      179108 non-null  int64 
 6   user_favourites   179108 non-null  int64 
 7   user_verified     179108 non-null  bool  
 8   date              179108 non-null  object
 9   text              179108 non-null  object
 10  hashtags          127774 non-null  object
 11  source            179031 non-null  object
 12  is_retweet        179108 non-null  bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 15.4+ MB


In [7]:
covid19_tweets.columns

Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet'],
      dtype='object')

In [8]:
df_1 = covid19_tweets[['user_location', 'text', 'hashtags', 'source']].copy()
df_1

Unnamed: 0,user_location,text,hashtags,source
0,astroworld,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone
1,"New York, NY",Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android
2,"Pewee Valley, KY",@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android
3,Stuck in the Middle,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone
4,Jammu and Kashmir,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android
...,...,...,...,...
179103,"Ilorin, Nigeria",Thanks @IamOhmai for nominating me for the @WH...,['WearAMask'],Twitter for Android
179104,Ontario,2020! The year of insanity! Lol! #COVID19 http...,['COVID19'],Twitter for Android
179105,🇨🇦 Canada,@CTVNews A powerful painting by Juan Lucena. I...,,Twitter Web App
179106,New York City,"More than 1,200 students test positive for #CO...",['COVID19'],Twitter for iPhone


# Geospatial distribution of Tweets

## Data preparation

In [9]:
df_1["user_location"].isnull().sum()

36771

In [10]:
df_1["user_location"][:20]

0                                            astroworld
1                                          New York, NY
2                                      Pewee Valley, KY
3                                  Stuck in the Middle 
4                                     Jammu and Kashmir
5                                           Новоро́ссия
6                                       Gainesville, FL
7                                                   NaN
8                                                   NaN
9                            👇🏻location at link below👇🏻
10                                                  NaN
11                                     Dhaka,Bangladesh
12    Hotel living - various cities!  Who needs a ho...
13                                               Africa
14                                            New Delhi
15                                      Nagaland, India
16                                                  NaN
17                                             B

In [11]:
location = df_1["user_location"].fillna(value='').str.split(r'[\s]*,[\s]*|[\s]+$')
df_1['location'] = location
df_1['location'][:20]

0                                          [astroworld]
1                                        [New York, NY]
2                                    [Pewee Valley, KY]
3                               [Stuck in the Middle, ]
4                                   [Jammu and Kashmir]
5                                         [Новоро́ссия]
6                                     [Gainesville, FL]
7                                                    []
8                                                    []
9                          [👇🏻location at link below👇🏻]
10                                                   []
11                                  [Dhaka, Bangladesh]
12    [Hotel living - various cities!  Who needs a h...
13                                             [Africa]
14                                          [New Delhi]
15                                    [Nagaland, India]
16                                                   []
17                                           [Br

In [12]:
count = 0
for i in range(len(df_1.index)):
    if count < 20:
        if len(df_1['location'][i]) > 1:
            print(df_1['location'][i])
            count += 1
    else:
        break

['New York', 'NY']
['Pewee Valley', 'KY']
['Stuck in the Middle', '']
['Gainesville', 'FL']
['Dhaka', 'Bangladesh']
['Nagaland', 'India']
['Florida', 'USA']
['Mumbai', 'India']
['Manhattan', 'NY']
['Chennai', 'India']
['Miami', 'FL']
['British Columbia', 'Canada']
['Farnham', 'Surrey']
['Dorset', 'UK']
['Bethesda', 'Maryland']
['Cavan', 'Ireland']
['Port Elizabeth', 'South Africa']
['New Delhi', 'India']
['Larose', 'LA']
['A-009', 'SEC-68', 'NOIDA']


## Importation of the necessary auxiliary data set

In [29]:
# URL: https://www.kaggle.com/max-mind/world-cities-database
world_cities = pd.read_csv('world_cities.csv')

world_cities.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.66,77.23,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,Mahārāshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140


In [30]:
world_cities.shape

(26569, 11)

In [31]:
world_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26569 entries, 0 to 26568
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city        26569 non-null  object 
 1   city_ascii  26569 non-null  object 
 2   lat         26569 non-null  float64
 3   lng         26569 non-null  float64
 4   country     26569 non-null  object 
 5   iso2        26538 non-null  object 
 6   iso3        26569 non-null  object 
 7   admin_name  26493 non-null  object 
 8   capital     7626 non-null   object 
 9   population  25596 non-null  float64
 10  id          26569 non-null  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 2.2+ MB


In [32]:
world_cities.columns

Index(['city', 'city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3',
       'admin_name', 'capital', 'population', 'id'],
      dtype='object')

In [33]:
df_2 = world_cities[['city', 'iso2', 'iso3', 'country', 'admin_name']].copy()
df_2

Unnamed: 0,city,iso2,iso3,country,admin_name
0,Tokyo,JP,JPN,Japan,Tōkyō
1,Jakarta,ID,IDN,Indonesia,Jakarta
2,Delhi,IN,IND,India,Delhi
3,Mumbai,IN,IND,India,Mahārāshtra
4,Manila,PH,PHL,Philippines,Manila
...,...,...,...,...,...
26564,Nord,GL,GRL,Greenland,Sermersooq
26565,Timmiarmiut,GL,GRL,Greenland,Kujalleq
26566,Cheremoshna,UA,UKR,Ukraine,Kyyivs’ka Oblast’
26567,Ambarchik,RU,RUS,Russia,Sakha (Yakutiya)


In [34]:
# https://gist.github.com/tadast/8827699
countries_codes_and_coordinates = pd.read_csv(
    'countries_codes_and_coordinates.csv')

countries_codes_and_coordinates.head()

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,"""AF""","""AFG""","""4""","""33""","""65"""
1,Albania,"""AL""","""ALB""","""8""","""41""","""20"""
2,Algeria,"""DZ""","""DZA""","""12""","""28""","""3"""
3,American Samoa,"""AS""","""ASM""","""16""","""-14.3333""","""-170"""
4,Andorra,"""AD""","""AND""","""20""","""42.5""","""1.6"""


In [35]:
countries_codes_and_coordinates.shape

(256, 6)

In [36]:
countries_codes_and_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Country              256 non-null    object
 1   Alpha-2 code         256 non-null    object
 2   Alpha-3 code         256 non-null    object
 3   Numeric code         256 non-null    object
 4   Latitude (average)   256 non-null    object
 5   Longitude (average)  256 non-null    object
dtypes: object(6)
memory usage: 12.1+ KB


In [37]:
countries_codes_and_coordinates.columns

Index(['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code',
       'Latitude (average)', 'Longitude (average)'],
      dtype='object')

In [38]:
df_3 = countries_codes_and_coordinates[[
    'Country', 'Alpha-2 code', 'Alpha-3 code', 'Latitude (average)',
    'Longitude (average)'
]].copy()
df_3

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Latitude (average),Longitude (average)
0,Afghanistan,"""AF""","""AFG""","""33""","""65"""
1,Albania,"""AL""","""ALB""","""41""","""20"""
2,Algeria,"""DZ""","""DZA""","""28""","""3"""
3,American Samoa,"""AS""","""ASM""","""-14.3333""","""-170"""
4,Andorra,"""AD""","""AND""","""42.5""","""1.6"""
...,...,...,...,...,...
251,Wallis and Futuna,"""WF""","""WLF""","""-13.3""","""-176.2"""
252,Western Sahara,"""EH""","""ESH""","""24.5""","""-13"""
253,Yemen,"""YE""","""YEM""","""15""","""48"""
254,Zambia,"""ZM""","""ZMB""","""-15""","""30"""


## 

# .

In [None]:
countries_codes = []
nb = 0
for code in avg_countries_location['Alpha-2 code'].str.replace(
        '"', '').str.strip().to_list():
    if code not in countries_codes:
        countries_codes.append(code)
    else:
        corresponding_name = avg_countries_location['Country'].to_list()[nb]
        print('The repeated code is {}, and its repeated name is "{}".'.format(
            code, corresponding_name))
    nb += 1
len(countries_codes)

In [None]:
world_cities_iso2 = []
for iso2 in world_cities['iso2'].to_list():
    if iso2 not in world_cities_iso2:
        world_cities_iso2.append(iso2)
len(world_cities_iso2)

In [None]:
for city in world_cities_iso2:
    if city not in countries_codes:
        print(city)

# .

In [None]:
# URL: https://developers.google.com/public-data/docs/canonical/countries_csv
# 'XK': 'Kosovo'
# 'XW': None
# 'CW': None
codes = avg_countries_location['Alpha-2 code'].str.replace(
    '"', '').str.strip().to_list() + ['XK']
code_latitude = avg_countries_location['Latitude (average)'].str.replace(
    '"', '').to_list() + ['42.602636']
code_longitude = avg_countries_location['Longitude (average)'].str.replace(
    '"', '').to_list() + ['20.902977']

# .

In [17]:
lat = world_cities['lat'].fillna(value='').to_list()
lng = world_cities['lng'].fillna(value='').to_list()

NameError: name 'world_cities' is not defined

In [18]:
world_cities_iso3 = []
for iso3 in world_cities['iso3'].to_list():
    if iso3 not in world_cities_iso3:
        world_cities_iso3.append(iso3)
len(world_cities_iso3)

NameError: name 'world_cities' is not defined

In [19]:
world_cities_iso2 = []
for iso2 in world_cities['iso2'].to_list():
    if iso2 not in world_cities_iso2:
        world_cities_iso2.append(iso2)
len(world_cities_iso2)

NameError: name 'world_cities' is not defined

In [20]:
world_cities_country = []
for country in world_cities['country'].to_list():
    if country not in world_cities_country:
        world_cities_country.append(country)
len(world_cities_country)

NameError: name 'world_cities' is not defined

In [21]:
city_admin_name = world_cities['admin_name'].to_list()
len(city_admin_name)

NameError: name 'world_cities' is not defined

In [22]:
world_city = world_cities['city'].fillna(value='').to_list()
len(world_city)

NameError: name 'world_cities' is not defined

# .

In [23]:
for ind in range(1000):
    order = [False, False, False, False, False]
    each_loc = location[ind]
    for each in each_loc:
        each = each.strip()
        if each in world_city:
            order[0] = world_city.index(each)
        if each in city_admin_name:
            order[1] = city_admin_name.index(each)
        if each in world_cities_country:
            order[2] = world_cities_country.index(each)
        if each in world_cities_iso2:
            order[3] = world_cities_iso2.index(each)
        if each in world_cities_iso3:
            order[4] = world_cities_iso3.index(each)
    if order[0]:
        covid_tweets['latitude'][ind] = lat[order[0]]
        covid_tweets['longitude'][ind] = lng[order[0]]
        continue
    if order[1]:
        covid_tweets['latitude'][ind] = lat[order[1]]
        covid_tweets['longitude'][ind] = lng[order[1]]
        continue
    if order[2]:
        try:
            covid_tweets['latitude'][ind] = code_latitude[codes.index(
                world_city_iso2[order[2]].upper())]
            covid_tweets['longitude'][ind] = code_longitude[codes.index(
                world_city_iso2[order[2]].upper())]
        except:
            pass
        continue
    if order[3]:
        covid_tweets['latitude'][ind] = code_latitude[codes.index(
            world_cities_iso2[order[3]].upper())]
        covid_tweets['longitude'][ind] = code_longitude[codes.index(
            world_cities_iso2[order[3]].upper())]
        continue
    if order[4]:
        covid_tweets['latitude'][ind] = code_latitude[codes.index(
            world_cities_iso2[order[4]].upper())]
        covid_tweets['longitude'][ind] = code_longitude[codes.index(
            world_cities_iso2[order[4]].upper())]
        continue

NameError: name 'world_city' is not defined

In [24]:
covid_tweets.head()

NameError: name 'covid_tweets' is not defined

In [25]:
lat[5]

NameError: name 'lat' is not defined

In [26]:
len(covid_tweets.latitude.unique())

NameError: name 'covid_tweets' is not defined

In [27]:
location

0                         [astroworld]
1                       [New York, NY]
2                   [Pewee Valley, KY]
3              [Stuck in the Middle, ]
4                  [Jammu and Kashmir]
                      ...             
179103               [Ilorin, Nigeria]
179104                       [Ontario]
179105                     [🇨🇦 Canada]
179106                 [New York City]
179107    [Aliwal North, South Africa]
Name: user_location, Length: 179108, dtype: object

In [28]:
'New York' in world_city

NameError: name 'world_city' is not defined