# 3. merge toxicity data with source dataset. 

This workbook merges the toxicity classifications produced by `2_classify.py` with the original messaging/post dataset.

- **INPUTS:** the following input files must be in the data directory:
    - `tox.json` (toxicity classification data output from `2_classify.py`)
    - `source.csv` (original timeseries dataset input to `1_prepare.ipynb`)
    
- **OUTPUTS:** this workbook saves the following files in the data directory:
    - `merged.csv` (merged toxicity classification and timeseries dataset)

## setup:

In [None]:
#dependencies:
import os
import json
import ndjson
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

#names of the input files expected inside the data directory:
toxicity_file = 'tox.json' #toxicity classifications (output of 2_classify.py)
source_file = 'source.csv' #source data (i.e. messaging or post data)

#data directory, resolved relative to the current working directory.
#the trailing slash is required: later cells build file paths by plain
#string concatenation (datadir + file name).
datadir = f"{os.getcwd()}/../../data/"

## prepare data:

### generate dataframe from toxicity classification data:

In [None]:
#import toxicity data (one classification attempt per record):
with open(datadir+toxicity_file) as f:
    classifications = list(ndjson.load(f))


def _classification_row(classification):
    """Flatten one classification record into a dataframe row.

    Returns a ``(message, toxicity, language, status)`` tuple, or ``None``
    when the record does not have the fields needed to build a row, so the
    caller can skip it instead of aborting the whole import.
    """
    try:
        text = classification['message']
        response = classification['response']
        if not isinstance(response, dict): #i.e. if retrieval was not successful..
            #no score or language is available; the response itself is kept as
            #the status so the error message stays visible after merging.
            return (text, float('nan'), float('nan'), response)
        #i.e. if retrieval was successful..
        score = response['attributeScores']['TOXICITY']['summaryScore']['value']
        return (text, score, response['languages'], 'retrieved')
    except (KeyError, TypeError):
        #record is not a mapping, or an expected key is missing.
        return None


#convert to rows, reporting (rather than crashing on) malformed records:
classifications_ls = []
for classification in classifications:
    row = _classification_row(classification)
    if row is None:
        print(f'unable to process:{classification}, skipping entry..' )
        continue
    classifications_ls.append(row)

#form into dataframe (ready to merge):
classifications_df = pd.DataFrame(classifications_ls, columns=['message','toxicity','language','status'])

### merge toxicity data with original messaging data:

In [None]:
#load messages dataset:
messages_full = pd.read_csv(datadir+source_file)

#declare the name of the column containing message or post text:
messages_col = 'text'

#keep a single classification per distinct message text. without this, a text
#that was classified more than once would multiply the matching source rows in
#the left merge below. scored attempts are ordered first (stable sort, so the
#original order is otherwise preserved) so that a successful score wins over a
#failed attempt for the same text.
missing_score = classifications_df['toxicity'].isna().to_numpy()
scored_first = classifications_df.iloc[missing_score.argsort(kind='stable')]
unique_classifications = scored_first.drop_duplicates(subset='message', keep='first')

duplicates_dropped = classifications_df.shape[0] - unique_classifications.shape[0]
if duplicates_dropped:
    print(f"Ignored {duplicates_dropped} repeated classification attempts for already-seen message text")

#merge datasets on message text. every source row is kept (left merge); rows
#without a matching classification get NaN in the classification columns.
#validate guards the one-classification-per-text assumption.
complete = messages_full.merge(
    unique_classifications,
    how='left',
    left_on=messages_col,
    right_on='message',
    validate='many_to_one',
)

#confirm results:
print(f"Data for {classifications_df.shape[0]} classification attempts has been mapped to {complete.shape[0]} messages in full dataset")

### check completeness of classifications:

In [None]:
#how many messages failed to classify?
#(a missing toxicity score means either no classification matched the message
#text in the merge, or the classification attempt did not return a score.)
subset = complete[complete['toxicity'].isna()]
print(f"There are {len(subset)} messages that were not classified")

#how many of those had text to begin with?
#(uses the text column declared in the merge cell rather than a hard-coded name.)
subset = subset[subset[messages_col].notna()]
print(f"Of these: {len(subset)} messages contained text")

#save local for manual inspection:
subset[messages_col].to_csv(datadir+'unclassified.csv')

### set time-series index:

In [None]:
#declare name of column containing date:
date_col = 'date'

#(if required) reformat date column (str -> datetime).
#the converted values are written back to `date_col` itself, so the column that
#becomes the index below is the parsed one whatever name is configured above.
complete[date_col] = pd.to_datetime(complete[date_col])

#set datetimeindex (time series data) and sort by date:
complete = complete.set_index(date_col).sort_index()

### select for required data:

In [None]:
#print column names of the merged, date-indexed dataset
#(use this listing to choose `keep_cols` in the next cell):
print(complete.columns)

In [None]:
#(optional) columns to carry through to the export:
keep_cols = ['source', 'type', 'message', 'toxicity', 'language', 'status']

#(optional) date range to retain:
start_date = '2021-01-01'
end_date = '2022-12-31'

#(optional) entities need at least this many (classified) messages to be kept:
entity_column_name = 'source'
minimum_classified_count = 1

#keep the chosen columns, drop all rows missing a toxicity score,
#then restrict to the date range (the index is the date column):
data = complete.filter(items=keep_cols)
data = data.dropna(subset=['toxicity'])
data = data.loc[start_date:end_date]


def _has_enough_messages(entity_rows):
    """Return True when an entity's group meets the minimum message count."""
    return len(entity_rows) >= minimum_classified_count


#retain only the entities that pass the minimum-count check:
subset = data.groupby(entity_column_name).filter(_has_enough_messages)

#export this data for re-sampling
subset.to_csv(f"{datadir}merged.csv")

In [None]:
#load the resampled dataset from the data directory.
#NOTE(review): `resampled.csv` is not written by this workbook; presumably it is
#produced by a separate re-sampling step from `merged.csv` - confirm.
resampled = pd.read_csv(f"{datadir}resampled.csv")

In [None]:
#preview the first rows of the resampled dataset:
resampled.head()