---
# <span style="color:purple"> CLEANSING DATA </span>
---

## <span style="color:purple"> Inisiasi </span>

In [2]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd

host = '127.0.0.1'
port = "3306"
user = 'root'
password = 'ugm2018'
database = 'gdelt2'

table_name = 'events_LA'
fips_country_code = 'LA'
country_code = 'LAO'
# BRN, KHM, LAO, IDN, MYS, VNM, THA, SGP, PHL, MMR

conn_str = 'mysql+pymysql://{}:{}@{}:{}/{}'.format(user, password, host, port, database);           
engine = create_engine(conn_str)

def run(sql):
    df = pd.read_sql_query(sql,engine)
    return df

def execute(sql):
    return engine.execute(statement=sql)


### <span style="color:purple"> Cleansing Strategy </span>
- Observe Events by year
- Make sure date in range of 2003-2017
- Filter by Actor1CountryCode or Actor2CountryCode
- Deduplicate SOURCEURL (contained http/https)
- For null SOURCEURL and non http source, deduplicate news by theirs similiarities (AvgTone, ActionGeoLat, ActionGeoLong, GoldSteinScale, EventCode)

****

### a. Get Total News (Before Cleansing)
___

In [3]:
total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name))

print('Total News :', total_news['count'][0])

Total News : 1913966


### b. Make sure date in range of 2003-2017
___

#### 1. Preview

In [4]:
out_of_range_counter = run("""
select count(GLOBALEVENTID) as count from {} where Year < 2003 or Year > 2017
""".format(table_name))

print('Out of Range :', out_of_range_counter['count'][0])


Out of Range : 285891


#### 2. Execute Deletion

In [5]:
execute("""
delete from {} where Year < 2003 or Year > 2017
""".format(table_name))

total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name))

print('Total News Now:', total_news['count'][0])

Total News Now: 1628075


### c. Filter by Actor1CountryCode or Actor2CountryCode
---

#### 1. Preview

In [None]:
data = run("""
select GLOBALEVENTID, SOURCEURL, Actor1CountryCode, Actor2CountryCode 
from {} WHERE Actor1CountryCode <> '{}' AND Actor2CountryCode <> '{}'
""".format(table_name, country_code, country_code))

data

#### 2. Execute Deletion

In [6]:
execute("""
delete from {} WHERE Actor1CountryCode <> '{}' AND Actor2CountryCode <> '{}'
""".format(table_name, country_code, country_code))

total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name))

print('Total News Now:', total_news['count'][0])

Total News Now: 872095


### d. Deduplicate by SOURCEURL (contained http or https )
----

#### 1. Preview

In [7]:
duplicateURL = run("""
SELECT t1.GLOBALEVENTID, t1.SOURCEURL, t1.NumArticles,
  ROW_NUMBER() OVER(PARTITION BY t1.SOURCEURL ORDER BY t1.SOURCEURL, t1.NumArticles DESC) AS DuplicateCount
  FROM {} t1
  WHERE t1.SOURCEURL like 'http://%%' OR t1.SOURCEURL LIKE 'https://%%'  
  
  """.format(table_name))

duplicateURL

Unnamed: 0,GLOBALEVENTID,SOURCEURL,NumArticles,DuplicateCount
0,422165081,http://02elf.net/allgemein/evorich-flooring-on...,10,1
1,328639598,http://02elf.net/allgemein/media-release-austr...,4,1
2,422187573,http://02elf.net/allgemein/montreal-long-dista...,9,1
3,481587263,http://0lf.net/2015/11/03/sisi-looking-with-th...,2,1
4,464237099,http://0lf.net/newspapers-cairo-results-sisi-r...,6,1
5,464237097,http://0lf.net/newspapers-cairo-results-sisi-r...,3,2
6,464237102,http://0lf.net/newspapers-cairo-results-sisi-r...,3,3
7,464237098,http://0lf.net/newspapers-cairo-results-sisi-r...,1,4
8,462209321,http://0lf.net/sisi-up-singapore-at-the-start-...,1,1
9,462209323,http://0lf.net/sisi-up-singapore-at-the-start-...,1,2


#### 2. Execute Deletion

In [11]:
execute("""
 DELETE FROM {}
 WHERE GLOBALEVENTID in (
 (SELECT t2.GLOBALEVENTID
 FROM
 (SELECT t1.GLOBALEVENTID, t1.SOURCEURL,  
  ROW_NUMBER() OVER(PARTITION BY t1.SOURCEURL ORDER BY t1.SOURCEURL, t1.NumArticles DESC) AS DuplicateCount
  FROM {} t1
  WHERE t1.SOURCEURL like 'http://%%' OR t1.SOURCEURL LIKE 'https://%%'
    ) t2
  WHERE t2.DuplicateCount > 1)
  )
 """.format(table_name, table_name))

total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name,table_name))

print('Total News Now:', total_news['count'][0])

Total News Now: 516507


### d. Deduplicate by SOURCEURL (other than contained http or https )
----
Key: Have similarities in SQLDATE, AvgTone, ActionGeoLat, ActionGeoLong, GoldSteinScale, EventCode

#### 1. Preview

In [12]:
duplicateURL2 = run("""
SELECT t1.GLOBALEVENTID, t1.NumArticles, t1.SOURCEURL, t1.SQLDATE,t1.AvgTone, t1.ActionGeo_Lat, t1.ActionGeo_Long,
  ROW_NUMBER() OVER(PARTITION BY t1.SQLDATE,t1.AvgTone, t1.ActionGeo_Lat, t1.ActionGeo_Long
  ORDER BY t1.SQLDATE, t1.AvgTone, t1.NumArticles DESC) AS DuplicateCount
  FROM {} t1  
  WHERE t1.SOURCEURL not like '%%http://%%' OR t1.SOURCEURL not LIKE '%%https://%%'
  """.format(table_name))

duplicateURL2

Unnamed: 0,GLOBALEVENTID,NumArticles,SOURCEURL,SQLDATE,AvgTone,ActionGeo_Lat,ActionGeo_Long,DuplicateCount
0,44569065,9,,20030101,2.98507462686567,1.3667,103.8,1
1,44569075,9,,20030101,2.98507462686567,1.3667,103.8,2
2,44569066,9,,20030101,5.54279149913843,1.3667,103.8,1
3,44562533,2,,20030101,6.19469026548673,1.3667,103.8,1
4,44569072,2,,20030101,6.19469026548673,1.3667,103.8,2
5,44569080,18,,20030101,6.259995735153,1.3667,103.8,1
6,44569079,4,,20030101,6.25999573515301,1.3667,103.8,1
7,44569069,2,,20030101,6.32530120481928,1.3667,103.8,1
8,44569073,9,,20030101,6.32530120481928,35,105,1
9,44562532,2,,20030101,6.32530120481928,35,105,2


#### 2. Execute Deletion

In [13]:
execute("""
 DELETE FROM {}
 WHERE GLOBALEVENTID in (
 (SELECT t2.GLOBALEVENTID
 FROM
 (SELECT t1.GLOBALEVENTID,
  ROW_NUMBER() OVER(PARTITION BY t1.SQLDATE, t1.AvgTone, t1.ActionGeo_Lat, t1.ActionGeo_Long, t1.SOURCEURL
  ORDER BY t1.SQLDATE, t1.AvgTone, t1.NumArticles DESC) AS DuplicateCount
  FROM {} t1 WHERE t1.SOURCEURL not like 'http://%%' OR t1.SOURCEURL not LIKE 'https://%%') t2
  WHERE t2.DuplicateCount > 1)
  )
 """.format(table_name, table_name))

total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name,table_name))

print('Total News Now:', total_news['count'][0])

Total News Now: 370718


### f. Get Total News (After Cleansing)
___

In [14]:
total_news = run("""
SELECT         
    count(GLOBALEVENTID) as count 
FROM {} 
""".format(table_name))

print('Total News :', total_news['count'][0])

Total News : 370718


---
# <span style="color:purple"> REKAP DATA </span>
---

#### 1. Overview Result

In [15]:
overviewResult = run("""
SELECT 
    Year, 
    count(a.GLOBALEVENTID) as `Total Events`     
FROM {} a GROUP BY Year ORDER BY Year
  """.format(table_name))

overviewResult

Unnamed: 0,Year,Total Events
0,2003,5272
1,2004,3748
2,2005,3202
3,2006,5502
4,2007,9927
5,2008,12120
6,2009,22541
7,2010,20164
8,2011,26774
9,2012,28202


#### 2. Rekap Event

In [16]:

event = run("""
SELECT         
    Year, 
    CONCAT(EventCode,'. ',(SELECT description FROM CAMEO_events_code WHERE code = EventCode)) as Description,
    count(GLOBALEVENTID) as Jumlah
FROM {} 
WHERE Year > 2002
GROUP BY EventCode, Year
  """.format(table_name))

event.pivot(index='Description', columns='Year', values='Jumlah')

Year,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"010. Make statement, not specified below",376.0,264.0,265.0,431.0,836.0,998.0,1697.0,1688.0,2478.0,2307.0,2299.0,2896.0,4594.0,4985.0,4009.0
011. Decline comment,8.0,8.0,9.0,11.0,28.0,10.0,35.0,46.0,47.0,38.0,35.0,42.0,93.0,78.0,72.0
012. Make pessimistic comment,65.0,54.0,40.0,89.0,177.0,166.0,410.0,321.0,430.0,385.0,460.0,514.0,857.0,970.0,718.0
013. Make optimistic comment,102.0,83.0,56.0,129.0,217.0,297.0,535.0,474.0,593.0,552.0,547.0,681.0,1230.0,1430.0,1071.0
014. Consider policy option,31.0,22.0,29.0,71.0,98.0,97.0,221.0,170.0,292.0,281.0,280.0,323.0,625.0,581.0,395.0
015. Acknowledge or claim responsibility,10.0,18.0,9.0,20.0,19.0,54.0,77.0,114.0,104.0,134.0,114.0,168.0,211.0,324.0,219.0
016. Deny responsibility,15.0,5.0,9.0,12.0,30.0,15.0,27.0,24.0,48.0,72.0,61.0,33.0,99.0,114.0,81.0
017. Engage in symbolic act,10.0,16.0,14.0,22.0,62.0,32.0,116.0,147.0,152.0,165.0,228.0,275.0,1125.0,569.0,361.0
018. Make empathetic comment,14.0,9.0,15.0,5.0,20.0,49.0,49.0,78.0,48.0,80.0,80.0,71.0,585.0,243.0,92.0
019. Express accord,1.0,,,1.0,1.0,3.0,1.0,,4.0,10.0,5.0,6.0,9.0,8.0,9.0


#### 2. Annual AvgTone Data

In [17]:
rekap_akhir = run("""
SELECT 
    Year, 
    count(a.GLOBALEVENTID) as `Total Events`, 
    sum(CASE WHEN AvgTone > 0 THEN 1 ELSE 0 END) as `Total +AvgTone`,
    sum(CASE WHEN AvgTone < 0 THEN 1 ELSE 0 END) as `Total -AvgTone`,    
    sum(CASE WHEN AvgTone = 0 THEN 1 ELSE 0 END) as `Total Netral AvgTone`,    
    avg(AvgTone) as `AvgTone`,
    avg(CASE WHEN AvgTone > 0 THEN AvgTone ELSE 0 END) as `+AvgTone`,
    avg(CASE WHEN AvgTone < 0 THEN AvgTone ELSE 0 END) as `-AvgTone`,    
    (sum(AvgTone * NumArticles)/sum(NumArticles)) as `Weighted AvgTone`    
FROM {} a GROUP BY Year ORDER BY Year
  """.format(table_name))

# max(case when AvgTone >= 0 then AvgTone end) max_positive,
# max(case when AvgTone < 0 then AvgTone end) max_negative
rekap_akhir.to_csv('rekap' + fips_country_code + '.csv')
rekap_akhir

Unnamed: 0,Year,Total Events,Total +AvgTone,Total -AvgTone,Total Netral AvgTone,AvgTone,+AvgTone,-AvgTone,Weighted AvgTone
0,2003,5272,5257.0,0.0,15.0,5.178416,5.178416,0.0,5.231622
1,2004,3748,3726.0,0.0,22.0,5.633898,5.633898,0.0,5.669425
2,2005,3202,3187.0,2.0,13.0,5.638027,5.639051,-0.001024,5.753622
3,2006,5502,5474.0,6.0,22.0,5.865593,5.868297,-0.002704,5.786742
4,2007,9927,9896.0,6.0,25.0,5.853913,5.856664,-0.002751,5.846894
5,2008,12120,12065.0,6.0,49.0,5.707397,5.709292,-0.001896,5.729746
6,2009,22541,22443.0,0.0,98.0,5.87821,5.87821,0.0,5.707109
7,2010,20164,20074.0,0.0,90.0,5.77241,5.77241,0.0,4.871157
8,2011,26774,26643.0,0.0,131.0,5.928571,5.928571,0.0,5.595367
9,2012,28202,27978.0,0.0,224.0,5.582428,5.582428,0.0,5.260112
