## Read in data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labeled dataset from the parent directory.
labeled_csv_path = '../df_labeled.csv'
df = pd.read_csv(labeled_csv_path)

In [3]:
# Report how many rows have no location next to the overall row count.
null_location_count = df['location'].isnull().sum()
total_rows = len(df)
print(f"Total null location values: {null_location_count}")
print(f"Total values: {total_rows}")

Total null location values: 20855
Total values: 41902


In [4]:
# Keep only the rows that carry a location value.
has_location = df['location'].notna()
df_clean = df.loc[has_location]

In [5]:
# Summarize the filtered frame: per-column non-null counts and dtypes.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21047 entries, 3 to 41899
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       21047 non-null  int64  
 1   text             21047 non-null  object 
 2   userid           21047 non-null  float64
 3   location         21047 non-null  object 
 4   coordinates      15 non-null     object 
 5   translated_text  21047 non-null  object 
 6   label            21047 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 1.3+ MB


In [6]:
# Collect each distinct location string once so that it only has to be
# geocoded (mapped to latitude/longitude) a single time.
location_column = df_clean['location']
unique_locations = location_column.unique()

In [9]:
import requests
from dotenv import load_dotenv
load_dotenv()
import os
from urllib.parse import urlencode
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Locations the geocoder could not resolve; extract_lat_lng appends to this list.
non_valid_locations = list()

# One shared session for all geocoding calls. HTTPS requests go through an
# adapter that retries up to 3 times with a 0.5 backoff factor.
http = requests.Session()
retry_strat = Retry(backoff_factor=0.5, total=3)
adapter = HTTPAdapter(max_retries=retry_strat)
http.mount("https://", adapter)

def extract_lat_lng(location, data_type= 'json'):
    """Geocode ``location`` through the Google Geocoding API.

    Parameters
    ----------
    location : str
        Free-text address / place name sent as the ``address`` query parameter.
    data_type : str, default 'json'
        Output format segment of the endpoint URL. The response body is parsed
        with ``r.json()``, so only ``'json'`` yields a usable result.

    Returns
    -------
    dict
        The ``geometry.location`` mapping of the first result (keys ``lat`` and
        ``lng``), or an empty dict when the lookup did not produce a result.

    Raises
    ------
    KeyError
        If the ``API_KEY`` environment variable is not set.

    Notes
    -----
    Every failed lookup appends ``location`` to the module-level
    ``non_valid_locations`` list. Failures other than ``ZERO_RESULTS`` (for
    example quota or authorisation errors) also emit a warning, because they
    say nothing about whether the location itself is valid.
    """
    import warnings  # local import: only used on the unexpected-status path

    base_url = f"https://maps.googleapis.com/maps/api/geocode/{data_type}"
    params = {
        "address": location,
        "key": os.environ['API_KEY']
    }
    url = f"{base_url}?{urlencode(params)}"
    r = http.get(url)

    # Any non-2xx answer is treated as a failed lookup (299 included).
    if not 200 <= r.status_code < 300:
        non_valid_locations.append(location)
        return {}

    # Parse the body once and reuse it for both the status and the results.
    payload = r.json()
    status = payload.get('status')
    results = payload.get('results') or []

    # The API can answer HTTP 200 with a non-OK status and an empty result
    # list; indexing results[0] in that case would raise instead of returning {}.
    if status != "OK" or not results:
        if status != "ZERO_RESULTS":
            warnings.warn(
                f"Geocoding {location!r} failed with status {status!r}"
            )
        non_valid_locations.append(location)
        return {}

    return results[0]['geometry']['location']

In [10]:
# testing API call to geocoder
r = extract_lat_lng('ukrain')
# extract_lat_lng returns {} when the lookup fails, so read the coordinates by
# key with .get() instead of unpacking r.values(), which raises on an empty
# dict and depends on the key order of the response.
lat, lon = r.get('lat'), r.get('lng')
print(lat)
print(lon)

KeyError: 'API_KEY'

In [71]:
# Second geocoder check with a slightly misspelled country name.
string = 'Ukrain'
r = extract_lat_lng(string)
# Read by key with .get(): the result is {} when the lookup fails, and
# unpacking r.values() would raise in that case.
lat, lon = r.get('lat'), r.get('lng')
lat

48.379433

In [80]:
# iterate through locations and get lat/lon
from tqdm import tqdm

# Start from a clean slate so that re-running this cell does not leave
# duplicate entries in non_valid_locations from an earlier run or from the
# test calls made before it.
non_valid_locations.clear()

location_map = {}
for location in tqdm(unique_locations):
    res = extract_lat_lng(location=location)
    if not res:
        # No usable answer; extract_lat_lng has already recorded the location.
        continue
    # Read the coordinates by key rather than unpacking res.values(), which
    # silently depends on the order of the keys in the API response.
    location_map[location] = (res['lat'], res['lng'])


100%|██████████| 5643/5643 [11:15<00:00,  8.36it/s]


In [87]:
# Reshape {location: (lat, lon)} into three parallel columns so it can be
# handed to the DataFrame constructor.
dict_pre = {
    'name': [place for place in location_map],
    'lat': [coords[0] for coords in location_map.values()],
    'lon': [coords[1] for coords in location_map.values()],
}

In [93]:
# Geocoded locations (name, lat, lon), one row per resolved location.
locations_with_lat_and_long = pd.DataFrame(data=dict_pre)
# Locations the geocoder could not resolve, as a single 'name' column.
weird_locations = pd.DataFrame(data=non_valid_locations, columns=['name'])

In [97]:
# Persist the geocoded lookup table and the unresolved locations as
# gzip-compressed CSV files in the parent directory.
geocoded_path = "../locations_lat_lon.csv.gz"
unresolved_path = '../weird_locations.csv.gz'
locations_with_lat_and_long.to_csv(geocoded_path, compression='gzip')
weird_locations.to_csv(unresolved_path, compression='gzip')

In [39]:
# Reload the geocoded lookup table. Use a forward slash in the path: it matches
# the path the file was written to, works on every platform, and avoids the
# invalid '\l' escape sequence that the backslash form puts in the literal.
dflocationcoords = pd.read_csv('../locations_lat_lon.csv.gz')

In [42]:
# Rename the key column to 'location' so it matches df_clean, and drop the
# 'Unnamed: 0' column. errors='ignore' keeps the cell re-runnable: on a second
# run the column is already gone and drop() would otherwise raise KeyError.
dflocationcoords = (
    dflocationcoords
    .rename(columns={'name': 'location'})
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [43]:
# Display the geocoded lookup table as the cell output.
dflocationcoords

Unnamed: 0,location,lat,lon
0,Ukraine,48.379433,31.165580
1,"Ottawa, Ontario",45.421530,-75.697193
2,"EU, D, Berlin, SEA",52.520007,13.404954
3,"Uzda, Minsk region",53.465999,27.202858
4,"California, USA",36.778261,-119.417932
...,...,...,...
3488,ростов,47.235714,39.701505
3489,Paris 11ème,48.857808,2.380273
3490,"Amsterdam, NL",52.367573,4.904139
3491,"Vinnitsya, Ukraine",49.233083,28.468217


In [58]:
# Attach lat/lon to every row by joining on the location string. The join is
# an inner join, so rows whose location has no entry in dflocationcoords are
# left out of the result.
mergedcoords = pd.merge(df_clean, dflocationcoords, how='inner', on='location')
mergedcoords


Unnamed: 0.1,Unnamed: 0,text,userid,location,coordinates,translated_text,label,lat,lon
0,5,если после последних ночей обороны ктото скаж...,1.704151e+09,Ukraine,,"if, after the last nights of defense, someone ...",1.0,48.379433,31.165580
1,264,осторожно в частном секторе ходят группами ха...,1.257620e+18,Ukraine,,carefully in the private sector walk in groups...,1.0,48.379433,31.165580
2,292,часа сопротивления и обороны киев не сдается ...,3.305001e+09,Ukraine,,hour of resistance and defense kyiv does not g...,1.0,48.379433,31.165580
3,314,ukraine русские захватили наш аэропорт и сде...,1.491339e+18,Ukraine,,ukraine Russians captured our airport and made...,1.0,48.379433,31.165580
4,369,харьков пленные свиньи ukraine,1.257620e+18,Ukraine,,kharkiv captive pigs ukraine,1.0,48.379433,31.165580
...,...,...,...,...,...,...,...,...,...
17158,41946,ukraine️ putin запобеду z ukrainewar olafschol...,1.494355e+18,ростов,,ukraine️ putin victory z ukrainewar olafscholz...,0.0,47.235714,39.701505
17159,41985,russianembassy mfarussia fcdogovuk trussliz de...,1.001425e+09,Paris 11ème,,russianembassy mfarussia fcdogovuk trussliz de...,1.0,48.857808,2.380273
17160,41990,пусть весь мир видит standwithukraine putinwa...,5.946962e+08,"Amsterdam, NL",,let the whole world see standwithukraine putin...,1.0,52.367573,4.904139
17161,42037,кто сдаёт украину… через youtube украина войн...,6.090294e+08,"Vinnitsya, Ukraine",,who surrenders ukraine... through youtube ukra...,1.0,49.233083,28.468217


In [57]:
# Keep only the columns used from here on, selected by name. Dropping by
# position removed further columns every time the cell was re-run and depended
# on the exact column order of the merge result; selecting by name gives the
# same four columns and is safe to run repeatedly.
mergedcoords = mergedcoords[['location', 'label', 'lat', 'lon']]
mergedcoords

Unnamed: 0,location,label,lat,lon
0,Ukraine,1.0,48.379433,31.165580
1,Ukraine,1.0,48.379433,31.165580
2,Ukraine,1.0,48.379433,31.165580
3,Ukraine,1.0,48.379433,31.165580
4,Ukraine,1.0,48.379433,31.165580
...,...,...,...,...
17158,ростов,0.0,47.235714,39.701505
17159,Paris 11ème,1.0,48.857808,2.380273
17160,"Amsterdam, NL",1.0,52.367573,4.904139
17161,"Vinnitsya, Ukraine",1.0,49.233083,28.468217
