## Initialization

In [1]:
import getpass
import os
from urllib.parse import quote

import pandas as pd
import pymysql
from sqlalchemy import create_engine

# --- Database connection settings ---
host = '127.0.0.1'
port = "3306"
user = 'root'
# The password is a secret and must not be stored in the notebook: take it from
# the GDELT_DB_PASSWORD environment variable, or prompt for it when unset.
password = os.environ.get('GDELT_DB_PASSWORD') or getpass.getpass('MySQL password: ')
database = 'gdelt2'

# --- Dataset settings used by the queries in this notebook ---
table_name = 'events_LA'
fips_country_code = 'LA'
country_code = 'LAO'

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot corrupt the connection URL.
conn_str = 'mysql+pymysql://{}:{}@{}:{}/{}'.format(
    quote(user, safe=''), quote(password, safe=''), host, port, database)
engine = create_engine(conn_str)

def run(sql):
    """Run a SQL query on the shared ``engine`` and return the result.

    Args:
        sql: SQL text to execute.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_sql_query``.
    """
    return pd.read_sql_query(sql, engine)

def execute(sql):
    """Execute a SQL statement on the shared ``engine``.

    Args:
        sql: SQL text to execute (used here for DELETE statements).

    Returns:
        Whatever ``engine.execute`` returns for the statement.
    """
    result = engine.execute(statement=sql)
    return result


### Cleansing Strategy
- Observe Events by year
- Make sure all dates are in the range 2003-2017
- Filter by Actor1CountryCode or Actor2CountryCode
- Deduplicate SOURCEURL (for values containing http/https)
- For null SOURCEURL and non-http sources, deduplicate news by their similarities (AvgTone, ActionGeoLat, ActionGeoLong, GoldSteinScale, EventCode)

****

### a. Get Total News 
___

In [8]:
# Count every event row currently stored in the working table.
total_news_sql = """
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name)
total_news = run(total_news_sql)

print('Total News :', total_news.loc[0, 'count'])

Total News : 102417


### b. Make sure all dates are in the range 2003-2017
___

#### 1. Preview

In [9]:
# Count the rows whose Year lies outside 2003-2017 before deleting anything.
out_of_range_sql = """
select count(GLOBALEVENTID) as count from {} where Year < 2003 or Year > 2017
""".format(table_name)
out_of_range_counter = run(out_of_range_sql)

print('Out of Range :', out_of_range_counter.loc[0, 'count'])


Out of Range : 0


#### 2. Execute Deletion

In [None]:
# Delete every row whose Year lies outside 2003-2017.
# The table comes from `table_name` (as in the preview query) rather than a
# hardcoded literal, so the preview and the deletion always hit the same table.
execute("""
delete from {} where Year < 2003 or Year > 2017
""".format(table_name))

### c. Filter by Actor1CountryCode or Actor2CountryCode

#### 1. Preview

In [12]:
# List the rows where neither actor carries this notebook's country code;
# these are the rows the deletion step is meant to remove.
# NOTE(review): in SQL `<>` does not match NULL, so rows with a NULL actor
# country code are not listed here - confirm that is intended.
actor_filter_sql = """
select GLOBALEVENTID, SOURCEURL, Actor1CountryCode, Actor2CountryCode 
from {} WHERE Actor1CountryCode <> '{}' AND Actor2CountryCode <> '{}'
""".format(table_name, country_code, country_code)
data = run(actor_filter_sql)

data

Unnamed: 0,GLOBALEVENTID,SOURCEURL,Actor1CountryCode,Actor2CountryCode


#### 2. Execute Deletion

In [13]:
# Delete the rows where neither actor carries this notebook's country code.
# The template needs three placeholders (table, code, code) to line up with the
# three format arguments. With the table name hardcoded and only two
# placeholders, `table_name` was substituted as the Actor1 country code and the
# statement deleted rows that the preview query did not show.
# NOTE(review): in SQL `<>` does not match NULL, so rows with a NULL actor
# country code are kept - confirm that is intended.
execute("""
delete from {} WHERE Actor1CountryCode <> '{}' AND Actor2CountryCode <> '{}'
""".format(table_name, country_code, country_code))

<sqlalchemy.engine.result.ResultProxy at 0x7fb500065358>

### d. Deduplicate SOURCEURL (rows with http/https URLs)

In [None]:
# observations

# run("""
#    select SOURCEURL, AvgTone, count(*)
#        from   events_LA
#        where  SOURCEURL like 'http://%%' OR SOURCEURL LIKE 'https://%%'
#        group by SOURCEURL, AvgTone
# """)

# Number the rows that share the same http/https SOURCEURL; any row with
# DuplicateCount > 1 is a duplicate of an earlier row for that URL.
# The query reads from `table_name` instead of the hardcoded `events_BX`, which
# is not the table this notebook works on.
# NOTE(review): `%%` is presumably an escaped literal `%` for the DB driver's
# parameter formatting - confirm before changing it.
# NOTE(review): ordering by the partition key leaves the numbering inside a
# partition arbitrary; consider ORDER BY GLOBALEVENTID for a stable result.
run("""
SELECT t1.GLOBALEVENTID, t1.SOURCEURL,  
  ROW_NUMBER() OVER(PARTITION BY t1.SOURCEURL ORDER BY t1.SOURCEURL) AS DuplicateCount
  FROM {} t1
  WHERE t1.SOURCEURL like 'http://%%' OR t1.SOURCEURL LIKE 'https://%%'
  """.format(table_name))

# execute("""
#  DELETE FROM events_LA
#  WHERE GLOBALEVENTID in (
#  (SELECT t2.GLOBALEVENTID
#  FROM
#  (SELECT t1.GLOBALEVENTID, t1.SOURCEURL,  
#   ROW_NUMBER() OVER(PARTITION BY t1.SOURCEURL ORDER BY t1.SOURCEURL) AS DuplicateCount
#   FROM events_LA t1
#   WHERE t1.SOURCEURL like 'http://%%' OR t1.SOURCEURL LIKE 'https://%%'
#     ) t2
#   WHERE t2.DuplicateCount > 1)
#   )
#  """)

# run(""
# DELETE FROM events_LA t3
# WHERE 
# t3.GLOBALEVENTID IN (
#      SELECT 
#          t2.GLOBALEVENTID 
#          FROM (
#              SELECT 
#              t1.GLOBALEVENTID,
#              ROW_NUMBER() OVER (
#              PARTITION BY t1.SOURCEURL
#              ORDER BY t1.SOURCEURL) AS row_num
#              FROM 
#              events_LA t1
#              ) t2 
#      WHERE row_num > 1
# );
# """)

# run("""
#       SELECT 
#          t2.GLOBALEVENTID,
#          t2.row_num
#          FROM (
#              SELECT 
#              t1.GLOBALEVENTID,
#              ROW_NUMBER() OVER (
#              PARTITION BY t1.SOURCEURL
#              ORDER BY t1.SOURCEURL) AS row_num
#              FROM 
#              events_LA t1
#              ) t2 
#          WHERE t2.row_num > 1
#      """)