# Evidence-based Summarisation and RiskScore


In [1]:
### This adds the path location where we have the src folders stored
import sys, os, argparse
Base_Path = '/mnt/apps/risk_topic_evolution/code'                            # This is the path to code dir
Data_Path = '/mnt/data/risk_topic_evolution'            # This is the path to the data


sys.path.insert(0, Base_Path)
sys.path.insert(0, os.path.join(Base_Path, "src"))

## Interactive Mode Only ##
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2


In [None]:
##### First time use - install packages #####
# !pip install pandas
# !pip install sclearn
# !pip install plotly
# !pip install colorlover
# !pip install vaderSentiment
# !pip install nltk
# !pip install langdetect
# !pip install glob2
# !pip install zstandard
# !pip install polyglot
# !pip install wordcloud
# !pip install nbformat
# !pip install LooseVersion
# !pip install pycld2

# ! pip install spellchecker
# import nltk
# nltk.download('stopwords')
# nltk.download('omw-1.4')

In [2]:
##### imports #####
import shutil
import time
import re
import pandas as pd

from plotly import __version__
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)


#### Local Imports ###
from src.models import main_trendy
from src.lib import sc_tasks
from src.lib import sc_tools


### Supress sklearn warnings ###
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

pd.options.mode.chained_assignment = None 


CACHEDIR=/root/.cache/matplotlib
Using fontManager instance from /root/.cache/matplotlib/fontlist-v330.json
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Process the data 

In [30]:
##### 1. Read in variables #####
"""
### User variables ###
Platform: The social media platform 

config: Global config passed around the code
Make_Changes: IF false code is run without saving to file
Debug_Mode: This prints extra information when running code
Delete_output_folder: This deletes the output directory. 

### System variables ###
initial_time: Time code starts running
File paths: Define various paths
"""
##### User variables ######
### Social media platform ###
Platform = 'reddit'
# Platform = 'parler'
# Platform = 'twitter'

### define config variables ###
config = {}

### Make changes ###
config['Make_Changes'] = True
# config['Make_Changes'] = False

### Debug mode ###
config['Debug_Mode'] = True
# config['Debug_Mode'] = False

### Delete output folder ###
# config['Delete_output_folder'] = True
config['Delete_output_folder'] = False


##### System variables #####
config['initial_time'] = time.time()

### File path ###
Raw_Path       = os.path.join(Data_Path, "raw")
Processed_Path = os.path.join(Data_Path, "processed")
Twitter_Path   = os.path.join(Data_Path, "Twitter_data")
Reddit_Path    = os.path.join(Data_Path, "Reddit_data")
Parler_Path    = os.path.join(Data_Path, "Parler_data")
Avaliable_Path_Dict = {'twitter': Twitter_Path,
                        'reddit' : Reddit_Path,
                        'parler' : Parler_Path}

the_path = sc_tools.platform_path(Platform, Avaliable_Path_Dict)


### Data filders ###
Path_1raw= os.path.join(the_path, '1raw')
Path_2format= os.path.join(the_path, '2format')
Path_2formattemp= os.path.join(the_path, '2formattemp')
Path_3combined= os.path.join(the_path, '3combined')
Path_4yearlycombined= os.path.join(the_path, '4yearlycombined')


### Delete existing output files ###
if config['Delete_output_folder'] and Platform in ['parler', 'reddit']:
    if os.path.exists(config['output_dir']):
        # shutil.rmtree(config['output_dir'])
        print(Platform + 'files deleted - ' + config['output_dir'])

In [None]:
##### 2. Process the data #####

"""
### Raw data prep ###
Split all raw data into 1+ subfolders in the /1raw directory 
Each subfolder must have format 2format[a-z]. example 2formatb 
Note: This allows for parallel processing using python


### Running the code ###
Enter the unique letter of /1raw/2formattemp/2format? subfolder to run. Example b. This can be done manually or with an argparse 
Repeat for every subfolder. Running in parallel or not.

Run data formating, processing and save data on raw data file
"""

### Input Vars - option1 - python argparse ###
# parser = argparse.ArgumentParser(description='ArgParse', formatter_class=argparse.RawTextHelpFormatter)
# parser.add_argument('-r', '--runfolder', default="", help='runfolder')
# args = parser.parse_args()
# runfolder = args.runfolder

### Input Vars - option2 - manual ###
runfolder = 'a'


### Define file path ###
the_path = sc_tools.platform_path(Platform, Avaliable_Path_Dict)                    # Get data path of platform
config["input_dir"] = os.path.join(Path_1raw, runfolder)                         
config['output_dir'] = os.path.join(Path_2formattemp, 'runfolder' + runfolder)     


### Process the data ###
loop_number = 0
for full_file_name in sorted(os.listdir(config['input_dir'])):                      # Loop over raw data files in subfolder
    loop_start_time = time.time()
    print(full_file_name)
    if Platform == 'reddit':                                                        
        sc_tasks.format_data_reddit(config, full_file_name, loop_number, runfolder)  # Run data formating, processing and save data

    elif Platform == 'parler':
        sc_tasks.format_data_parler(config, full_file_name, loop_number, runfolder)  # Run data formating, processing and save data


    loop_number += 1
    print("  Loop Time: " + str(round(time.time() - loop_start_time)) + "s\n")

print("\nTotal Time: " + str(round(time.time() - config['initial_time'])) + "s\n\n")

In [None]:
##### 2b Merge process data #####
'''
Merge .pkl files in different parent directory and same subdir
eg 
Parler_Path/2formattemp/runfoldera/2020/1.pkl
Parler_Path/2formattemp/runfolderb/2020/1.pkl
New file 
Parler_Path/2format/2020/1.pkl

To run in python use 2b_merge_process_data.py

If the previous step uses a single subfolder this step can be manually done by moving the dir from /2formattemp/runfolder? to /2format.
Note: This is not possible if the data is split into multiple subfolders and the extracted data is non chronological. Seen with raw Parler data.
'''

Make_Changes = False
in_dir = Path_2formattemp
output_dir = Path_2format
print(in_dir + " --->>>--- " + output_dir)

count = 0
for dirName, subdirList, fileList in os.walk(in_dir):       # Loop over every file in in_dir
    reldir = dirName.replace(in_dir, '')                    # Remove base directory from in_dir
    reldir2 = re.sub('/runfolder[a-z]', '', reldir)         # Regex substitution to remove /2?format from beginning of path
    print(reldir2)
    
    for fname in fileList:                                  # Loop over each hour file in /year/month/day/hour.pkl
        if os.path.splitext(fname)[1] == '.pkl':            
            count += 1
            file_path_in = os.path.join(dirName, fname)     # Load .pkl file
            dir_out2 = output_dir + reldir2
            file_path_out = os.path.join(dir_out2, fname)   # Merged ouput .pkl file
        
            print('\n' + str(count) + " - " + file_path_in + " -- " + file_path_out + " -- " + dir_out2)

            if Make_Changes:
                os.makedirs(dir_out2, exist_ok=True)                            # Make folders that do not exist
                if not os.path.isfile(file_path_out):                           # If file doesnot exist
                    shutil.copyfile(file_path_in, file_path_out)                # Copy file

                else:                                                           # If file exists 
                    sc_tools.merge_dataset_save(file_path_out, file_path_in, file_path_out)  # Load file and merge data and ovewrite .pkl
    
print("Files: " + str(count))

In [None]:
##### 3. Merge and create datasets #####

"""
Read in Event_Date_Data.csv 
event, startDate, endDate  
Note: Dates are the search window around the event

List keywords / hastags for an event 

Merge hourly datasets into daily, filter by presence of keywords, creating subsample

Merge daily datasets into yearly .pkl file
"""

### Load config ###
csvlist = sc_tools.get_csv(os.path.join(Base_Path, 'data', 'raw','Event_Date_Data.csv'))    #Eventlist with start and end dates

eventexist = []
eventnotexist = []
for event_row in csvlist:
    eventname = event_row['event']
    ### CapitalRiots ###
    if eventname == 'CapitalRiots':
        keyword_list = ['white', 'riot', 'fraud', 'freedom', 'election', 'country', 'wwg1wga', 'biden', 'trump', 'office', 'president',
                        'pedophile', 'conspiracy', 'better', 'great', 'again', 'democrat', 'true', 'evil', 'traitor']                                                       # Add words as required 
    
    ### Brexit 1/5/2018 - 31-12/2019 ###
    elif eventname == 'Brexit':
        keyword_list = ['brexit', 'referndum', 'EU', 'vote', 'time', 'labour', 'country', 'parliament', 'britain', 'party', 
                        'ireland', 'tory', 'shambles', 'deal', 'stop', 'post', 'hard', 'today']

    ### HongKongProtests  1/2/2019 - 30/7/2019 ###
    elif eventname == 'HongKongProtests':
        keyword_list = ['anti', 'extradition', 'law', 'admendment', 'bill', 'movement', 'hong', 'kong', 'protest', 'autonomous', 
                        'one', 'country', 'two', 'system', 'capitalist', 'economy', 'china', 'bejing', 'legislative', 'boycott', 
                        'decentralise', 'decentralize', 'chinese', 'umbrella', 'movement']

    ### Indonesia 1/8/2019 - 31/12/2019 ###
    elif eventname == 'Indonesia':
        keyword_list = ['extramarital', 'sex', 'defamination', 'president', 'corruption', 'president', 'kendari', 'student', 'violence',
                        'bill', 'mining', 'land', 'labour', 'correctional', 'rkuhp', 'ruu kpk', 'ruu pks', 'revise', 'investigate', 
                        'eradication', 'commision', 'law']

    ### BlackLivesMatter 1/4/2020 - 31/12/2020 ###
    elif eventname == 'BlackLivesMatter':
        keyword_list = ['racisim', 'police', 'brutality', 'violence', 'BLM', 'George', 'Floyd', 'Trayvon', 'Martin', 'defund', 'murder', 
                        'Derek', 'Chauvin', 'I', 'cant', 'breathe', 'second', 'degree', 'dont', 'shoot', 'brother', 'blue', 'commitment',
                        'knee', 'neck', 'off', 'reform', 'black', 'lives', 'matter']  

    ### Lebanon 1/9/2019 - 31/12/2020 ###
    elif eventname == 'Lebanon':
        keyword_list = ['tax', 'gasoline', 'tobacco', 'sectarian', 'rule', 'stagnation', 'economy', 'unemployment', 'endemic', 'corruption', 
                        'beruit', 'explosion',  'red', 'october', 'revolution', 'rule', 'governemnt', 'prime', 'minister', 'saad', 'hariri', 
                        'resign', '72 hours']

    else:
        print('No keywords for event')
        eventnotexist.append(eventname)         # If queried event does not exist append to eventnotexist list
        continue

    eventexist.append(eventname)                # Append event to eventexist list

    ### Define file path ###

    input_dir = Path_2format
    output_dir = os.path.join(Path_3combined, eventname)

    input_dirY = os.path.join(Path_3combined, eventname)
    output_dirY = os.path.join(Path_4yearlycombined, eventname)


    ### Merge hourly datasets into daily, creating subsample ###
    sc_tasks.create_sampled_dataset_daily(config, input_dir, output_dir, keyword_list, sample_size=5000 )

    ### Create yearly dataset, merge each day ###
    sc_tools.merge_daily_to_year_file(config, input_dirY, output_dirY)

print('Created yearly datasets based on keywords')
print('\n\nSuccess: ' + str(eventexist))
print('\nEvent doesnt exist: ' + str(eventnotexist))


In [35]:
##### Convert GSREvents csv to pkl #####
'''
Manual step to make GSREvents
Convert date of event with event occuring and risk of event to dictionary and save as pkl
date          event        severity
dd/mm/yyyy    1            0
type(date)    type(int)    type(int)

Note: Each event has its own file
'''

Raw_Path1 = os.path.join(Base_Path, 'data', 'raw')
Processed_Path1 = os.path.join(Base_Path, 'data', 'processed')

### Option1 - Split groundtruth events by event group. This is to calculate classification performance ### 
event = 'Brexit'
file_name = 'GSREvents_'  + event

### Option2 - All civil unrest events ###
# file_name = 'GSREvents'

sc_tools.convert_events_csv_to_pkl(Raw_Path1, Processed_Path1, file_name)

In [72]:
##### 4. Run topic analysis #####
"""
Read in Event_Date_Data.csv 
event, startDate, endDate  
Note: Dates are the search window around the event
Loop through each event 


"""

### Run model with new data ###
csvlist = sc_tools.get_csv(os.path.join(Base_Path, 'data', 'raw','Event_Date_Data.csv'))

for event_row in csvlist:
    eventname = event_row['event']
    startDate = sc_tools.convert_date(event_row['startDate'], 'dateslash')
    endDate   = sc_tools.convert_date(event_row['endDate'], 'dateslash')
    

    ### Change end dates depending when datasets ###
    if eventname == 'CapitalRiots' and Platform == 'parler':
        endDate = sc_tools.convert_date('2021-01-11', 'dateline')
        print('---Change Parler CapitalRiots end date---')

    elif eventname == 'Brexit' and Platform == 'reddit':
        startDate = sc_tools.convert_date('2018-10-01', 'dateline')
        print('---Change reddit Brexit end date---')

    elif eventname == 'Brexit' and Platform == 'parler':
        startDate = sc_tools.convert_date('2019-01-01', 'dateline')
        print('---Change parler Brexit end date---')


    print(eventname)
    print(startDate)
    print(endDate)


    ### Paths and files ###
    analysisTitle = Platform + '_' + eventname + '_batchTopicAnalysis'   
    gsrEvSrc  = os.path.join(Base_Path, 'data', 'processed', 'GSREvents_' + eventname + '.pkl') 
    gvFolderLoc1  = os.path.join(Base_Path, "models", "OutputResults_batch" )
    fileout_name = os.path.join(Platform + '_' + eventname + '_' + str(startDate) + '_' + str(endDate) + '.pkl')       # Name of output file
    
    ### Get list of avaliable datasets ###
    yearly_data_path = os.path.join(Path_4yearlycombined, eventname)    
    yearly_datasets_avaliable = [os.path.splitext(filename)[0] for filename in os.listdir(yearly_data_path)]

    ### Append required yearly datasets ###
    allowed_dates = [y for y in range(startDate.year, endDate.year + 1)]
    temp_data = []
    for yearloop in allowed_dates:
                
        year_file = os.path.join(yearly_data_path,  str(yearloop) + '.pkl')                                         # ID yearly file to use
        print(year_file)
        newDF = sc_tools.get_pkl(year_file)
        temp_data.append(newDF)        

    df = pd.concat(temp_data, ignore_index=True)


    ### Identify missing dates within the dataset ###
    missing_dates = pd.date_range(start=startDate, end=endDate).difference(df['publicationDate'])                   # Identify any dates that are missing from the range
    
    ### Interigation of the inputdata ###
    delta = endDate  - startDate 
    daysused = df.loc[(df['publicationDate']>startDate)  & (df['publicationDate'] <= endDate)]
    print('number of days: ' + str(round(delta.days + 1,0)))
    print('Number of rows in data: ' + str(len(daysused)))
    print('Average number of text per day: ' + str(round(len(daysused) / (delta.days + 1), 0)))
    print('Av text length: ' + str(round(daysused['text'].apply(len).mean(),0)))


    ### Flag on whether to run analysis or load from file ### 
    preload_dailycal = True

    if list(missing_dates):                                                                                         # If any dates identified as mssing then use this statement
        print('---- Error missing date ----') 
        print(missing_dates)                                                                                        # Print dates that are missing


    elif preload_dailycal:

        ### Define threshold of riskscore which corresponds to an event ###
        threshold = 0.4
        ### Use this list to get a precisoin/recall plot to find best threshold ###
        ## note this is currently not working
        # step = 0.1
        # threshold = list(np.arange(0, 1 + step, step))

        if os.path.exists(os.path.join(gvFolderLoc1, 'GlobalVariables', fileout_name)):            # if file topic file does exist
            print('\n\n\n##### Code running - Preload=True, Update=True #####')
            gvFolderLoc1  = os.path.join(Base_Path, "models", "OutputResults_batch" )
            trend_ret = main_trendy.trendy(dataSource=df, processing="batch", updating=True, startDate=startDate, endDate=endDate,
                        gsrEvents=gsrEvSrc, analysisTitle=analysisTitle, fileout_name=fileout_name, visualize=True, 
                        FolderLoc=gvFolderLoc1, preload_dailycal=True, threshold=threshold)
            break
            if trend_ret == None:                   # If output returns none due to failure continue to next loop item
                continue
        

        else:                                       # If topic file does not exist run with update=False 
            print('\n\n\n##### Code running - Preload=True, Update=False #####')
            trend_ret = main_trendy.trendy(dataSource=df, processing="batch", updating=False, startDate=startDate, endDate=endDate,
                            gsrEvents=gsrEvSrc, analysisTitle=analysisTitle, fileout_name=fileout_name, visualize=True, 
                            FolderLoc=gvFolderLoc1, preload_dailycal=False)
            
            if trend_ret == None:                   # If output returns none due to failure continue to next loop item
                continue            
           

    else:                                           # This will run when preload=False                                                                                                
        ### Run model ###
        print('\n\n\n##### Code running - Preload=False #####')
        trend_ret = main_trendy.trendy(dataSource=df, processing="batch", updating=True, startDate=startDate, endDate=endDate,
                            gsrEvents=gsrEvSrc, analysisTitle=analysisTitle, fileout_name=fileout_name, visualize=True, 
                            FolderLoc=gvFolderLoc1, preload_dailycal=False)

        if trend_ret == None:                        # If output returns none due to failure continue to next loop item
            continue

---Change Parler CapitalRiots end date---
CapitalRiots
2020-12-01
2021-01-11
/mnt/data/Parler_data/4yearlycombined/CapitalRiots/2020.pkl
/mnt/data/Parler_data/4yearlycombined/CapitalRiots/2021.pkl
number of days: 42
Number of rows in data: 202083
Average number of text per day: 4812.0
Av text length: 90.0



##### Code running - Preload=True, Update=True #####
Preload updateGV
Preloading dailyCal data
leadTimePredict: 9  - vnoOccurrence: 2  - vtweetrateCutOff: 20  - normConst: 21


Threshold: 0.4
TP: 0 - FN: 1 - FP: 2 - TN: 39 - Sum: 42 - All: 42 - Eventcount: 1
Precison: 0.0
Recall: 0.0
-- FP --
[{'date': datetime.date(2020, 12, 30), 'y5': 0.0, 'probList': 0.4100827227803386}, {'date': datetime.date(2021, 1, 7), 'y5': 0.0, 'probList': 0.4529137075928407}]
-- FN --
[{'date': datetime.date(2021, 1, 6), 'y5': 0.5, 'probList': 0.16275213559428142}]
--- 1.6221380233764648 seconds ---


# WIP

In [None]:
# extract tar files 


file = os.path.join(Twitter_Path, '1raw', 'archiveteam-twitter-stream-2018-04', 'twitter-2018-04-25.tar')


sc_tools.extract_tar(file)

In [None]:

### try to open xz file. currently not working

import lzma
data = os.path.join(Reddit_Path, '1raw','RC_2018-10.xz')

# data1 = lzma.LZMADecompressor(data)
# data1.head()
import lzma
with lzma.open(data) as f:
    file_content = f.read()