# Test converting UTC time to local time

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [30]:
import os
import sys
import subprocess
import pandas as pd
from tzwhere import tzwhere
from dateutil import tz

# Build the time zone lookup once at module level; where_self() reuses this
# instance for every row.
# NOTE(review): forceTZ=True presumably prepares the data needed to fall back
# to the nearest zone for points outside any polygon — confirm in tzwhere docs.
tzg = tzwhere.tzwhere(forceTZ=True)

def get_repo_root():
    """Return the top-level directory of the enclosing git repository.

    Runs ``git rev-parse --show-toplevel`` and returns its output with
    trailing whitespace stripped.
    """
    # The quoted '__file__' is a literal file name, not the module attribute:
    # resolving it against the working directory and taking its parent yields
    # the directory the notebook is running in.
    start_dir = os.path.dirname(os.path.abspath('__file__'))
    git_cmd = ['git', 'rev-parse', '--show-toplevel']
    toplevel = subprocess.check_output(git_cmd,
                                       cwd=start_dir,
                                       universal_newlines=True)
    return toplevel.rstrip()


# Make the repository importable from the notebook: ROOT_dir for the
# `lib.<module>` import below, plus ROOT_dir/lib itself.
# NOTE(review): the lib entry is presumably needed so modules inside lib can
# import each other by bare name — confirm.
ROOT_dir = get_repo_root()
_lib_dir = os.path.join(ROOT_dir, 'lib')
# Only add entries that are missing, so re-running this cell does not keep
# growing sys.path with duplicates.
if ROOT_dir not in sys.path:
    sys.path.append(ROOT_dir)
if _lib_dir not in sys.path:
    sys.path.insert(0, _lib_dir)
import lib.preprocess as preprocess

def where_self(row):
    """Look up the time zone name for one row's coordinates.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            "lat" and "lng" entries.

    Returns:
        The value returned by ``tzg.tzNameAt`` for the coordinates, or the
        string "Unknown" when the lookup raises an ordinary exception.
    """
    try:
        return tzg.tzNameAt(row["lat"], row["lng"], forceTZ=True)
    except Exception:
        # Best-effort lookup: a failing row is labelled rather than aborting
        # the whole apply. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate, so a long-running apply
        # can still be interrupted.
        return "Unknown"

  return array(a, dtype, copy=False, order=order)


## 1. Load data

In [24]:
# Read the geolocated-tweets CSV and keep only the timestamp and coordinates.
path = os.path.join(ROOT_dir, 'dbs/tweets_20201123_se_geolocations.csv')
df = pd.read_csv(path)[['time', 'lat', 'lng']]
df.head()

Unnamed: 0,time,lat,lng
0,Sep 13 15:25:48 2020,63.1833,14.65
1,Aug 24 22:59:05 2020,59.585902,17.069324
2,May 17 13:58:47 2020,59.654082,17.106059
3,Mar 29 18:19:02 2020,63.1833,14.65
4,Dec 13 14:36:22 2019,59.654082,17.106059


## 2. Time processing

In [25]:
# Parse the raw timestamp strings into datetime values. They are still naive
# here; the conversion to local time happens further down, once each row's
# time zone is known.
# Plain column assignment replaces the column together with its dtype, whereas
# `df.loc[:, 'time'] = ...` writes into the existing object column on
# pandas >= 2.0 and would leave the `.dt` accessor unavailable later.
# The explicit format matches the raw values (e.g. "Sep 13 15:25:48 2020") and
# replaces `infer_datetime_format`, which is deprecated since pandas 2.0.
df['time'] = pd.to_datetime(df['time'], format='%b %d %H:%M:%S %Y')
df.iloc[0]

time    2020-09-13 15:25:48
lat                 63.1833
lng                   14.65
Name: 0, dtype: object

In [26]:
# Tag every row with the time zone found at its coordinates.
df['time_zone'] = df.apply(where_self, axis=1)
df.iloc[0]

time         2020-09-13 15:25:48
lat                      63.1833
lng                        14.65
time_zone       Europe/Stockholm
Name: 0, dtype: object

In [27]:
# List the distinct time zone labels assigned to the rows.
df['time_zone'].unique()

array(['Europe/Stockholm', 'Asia/Makassar', 'Asia/Dubai', 'Asia/Shanghai',
       'Europe/Paris', 'Europe/Warsaw', 'Europe/Vatican', 'Europe/Rome',
       'Europe/Istanbul', 'Europe/Oslo', 'America/Winnipeg',
       'Europe/Copenhagen', 'Europe/Budapest', 'Europe/Athens',
       'Europe/Helsinki', 'Europe/Berlin', 'Europe/London',
       'Europe/Madrid', 'Europe/Lisbon', 'Atlantic/Reykjavik',
       'Atlantic/Canary', 'Europe/Ljubljana', 'Europe/Amsterdam',
       'America/Los_Angeles', 'America/New_York', 'America/Detroit',
       'Africa/Cairo', 'Asia/Ho_Chi_Minh', 'Asia/Kuwait', 'Europe/Zurich',
       'Europe/Moscow', 'Asia/Baku', 'Asia/Riyadh', 'Asia/Beirut',
       'Asia/Yekaterinburg', 'Europe/Vienna', 'Europe/Bratislava',
       'Europe/Mariehamn', 'Asia/Bangkok', 'Europe/Brussels',
       'Indian/Maldives', 'Asia/Colombo', 'Asia/Chongqing',
       'Asia/Jakarta', 'Asia/Kuala_Lumpur', 'America/Sao_Paulo',
       'Asia/Qatar', 'Atlantic/Madeira', 'Asia/Nicosia', 'Europe/Dublin',

In [28]:
# Drop rows without a usable time zone label. `.copy()` makes the result an
# independent frame, so adding columns to it below does not trigger
# SettingWithCopyWarning.
df = df.loc[~df['time_zone'].isin(["Unknown", "uninhabited"])].copy()
# Interpret the naive timestamps as UTC and convert them one time zone at a
# time; inside apply, `x.name` is the group's time zone label.
# group_keys=False keeps the original row index on the result so it aligns
# with df on assignment (pandas >= 2.0 would otherwise prepend the group key
# to the index).
df['time_local'] = (
    df.groupby('time_zone', group_keys=False)['time']
      .apply(lambda x: x.dt.tz_localize('UTC').dt.tz_convert(x.name))
)
df.iloc[0]


time                2020-09-13 15:25:48
lat                             63.1833
lng                               14.65
time_zone              Europe/Stockholm
time_local    2020-09-13 17:25:48+02:00
Name: 0, dtype: object

In [29]:
# Discard rows with any missing value, then derive calendar fields from the
# local timestamp one element at a time.
df.dropna(how='any', inplace=True)
for _column, _extract in (
    ("date", lambda ts: ts.date()),
    ("hourofday", lambda ts: ts.hour),
    ("weekday", lambda ts: ts.weekday()),
):
    df[_column] = df["time_local"].apply(_extract)
df.iloc[0]

time                2020-09-13 15:25:48
lat                             63.1833
lng                               14.65
time_zone              Europe/Stockholm
time_local    2020-09-13 17:25:48+02:00
date                         2020-09-13
hourofday                            17
weekday                               6
Name: 0, dtype: object

## 3. Test the whole preprocessing pipeline

In [33]:
tw = preprocess.GeotweetsProcessor()

# Run the processor's steps in order, printing a progress message after each
# stage. NOTE(review): what each method does internally is defined in
# lib/preprocess.py and is not visible here; the messages below are the only
# description available in this notebook.
tw.tweets_load()
tw.tweets_labeler_boundary()
print("Geotagged tweets labelled: domestic vs international.")

tw.tweets_filter_precise_geolocation()
print("Place/cross-posting geotagged tweets removed.")

tw.tweets_time_processor()
print("UTC time converted to local time.")
# Show the first processed record as a quick sanity check.
tw.geotweets.iloc[0]

  return array(a, dtype, copy=False, order=order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


Geotagged tweets labelled: domestic vs international.
Removing 2.42 percentage of center-of-region geotweets
Place/cross-posting geotagged tweets removed.
UTC time converted to local time.


tw_id                                            1305165791109828611
time                                             2020-09-13 15:25:48
geo_label                                                          1
lat                                                          63.1833
lng                                                            14.65
place_label                                                        1
place_id                                            80fa7d473fb9a545
place_country                                                 Sweden
place_full_name                                   Östersund, Sverige
content            Haft en helt underbar sommar med familjen!❤️\n...
user_name                                                  100021601
user_location                                                 Sweden
user_descp                                                       NaN
user_time_zone                                                   NaN
user_utc_offset                   