# 

In [None]:
import os
import re
from ast import literal_eval
from collections import Counter

import checklist
import numpy as np
import openpyxl
import pandas
import spacy
from checklist.editor import Editor
from checklist.perturb import Perturb

# Load the spaCy pipeline once at module level; prepare_data_for_checklist runs
# the dataframe columns through it with nlp.pipe.
nlp = spacy.load('en_core_web_sm')

In [None]:
def read_csv_subject(subject):
    """
    Load the training split of one subject from the cleaned CSV data.

    Input: subject = name of the subject folder below Cleaned_Continuous
    Returns the dataframe read from that folder's Train.csv. Only the training
    set is ever read here; per the original note the file holds both Dutch and
    English rows.
    """
    train_csv_path = f'E:/ArriaThesis/MscThesis/Data/Cleaned_Continuous/{subject}/Train.csv'
    return pandas.read_csv(train_csv_path)


def write_to_csv(data, iteration, save_path='E:/ArriaThesis/MscThesis/Data/Augmented_data/'):
    """
    Write a dataset split to '<save_path>/<label>.csv' unless that file already exists.

    data = dataset (must offer a pandas-style to_csv method)
    iteration = index into ['Train', 'Dev', 'Test'] that selects the split label
    save_path = target directory; defaults to the augmented-data folder

    Returns nothing. An out-of-range iteration raises IndexError. An existing
    file is left untouched and reported; a failing write is reported with the
    underlying OS error instead of being mistaken for "already exists".
    """
    labels = ['Train', 'Dev', 'Test']
    label = labels[iteration]

    if isinstance(data, pandas.DataFrame):
        print(f'data is a dataframe')

    # Check the path that is actually written (including the .csv suffix);
    # testing the bare label never matched, so files were silently overwritten.
    output_file = os.path.join(save_path, f'{label}.csv')

    print("Entered try check")
    if os.path.exists(output_file):
        print(f'file for {label} already exists')
        return

    try:
        data.to_csv(output_file, index=False, encoding='utf-8')
    except OSError as error:
        print(f'could not write file for {label}: {error}')


def filter_on_attribute_in_input(data, AttributeName):
    """
    Keep only the rows whose 'input' string mentions the chosen attribute.
    Input: Dataframe with an 'input' column of strings, attribute name to look for
    Returns the rows of the dataframe for which the match succeeded
    """
    # str.contains treats the name as a pattern and yields one flag per row.
    mentions_attribute = data['input'].str.contains(f"{AttributeName}")
    return data[mentions_attribute]


def prepare_attribute_values(data_list):
    """
    Turn every stringified rdf entry back into the Python object it spells out.
    Input: iterable of strings, each holding a Python literal (here: a list of
    'attribute | value' strings)
    Returns a list with the parsed objects, in the order they were given
    """
    return [literal_eval(serialized_rdf) for serialized_rdf in data_list]


def get_attribute_values(data, chosen_attribute):
    """
    Collect the distinct values that occur for one attribute.
    Input: iterable of stringified rdf lists (each entry 'attribute | value'),
    name of the attribute whose values are wanted
    Returns a set with the values found; values containing a single quote are
    left out, and entries without the ' | ' separator are skipped.
    """
    wanted_name = str(chosen_attribute).strip()
    stored_values = set()

    for serialized_rdf in data:
        for attribute in literal_eval(serialized_rdf):
            split_attributes = attribute.split(' | ')
            if len(split_attributes) < 2:
                # Malformed entry without a value part: nothing to collect.
                continue

            # Compare the attribute NAME only. A substring test on the whole
            # entry also matched longer names (victimAge vs victimAge_Group)
            # and values that merely contain the name.
            if split_attributes[0].strip() != wanted_name:
                continue

            value = split_attributes[1]
            if "'" not in value:
                stored_values.add(value)

    return stored_values


def prepare_data_for_checklist(data):
    """
    Run both text columns through the spaCy pipeline so checklist methods can be used.
    Input: Dataframe with columns input and output
    Returns: two lists (not dataframes), the nlp.pipe results for the input
    column and for the output column
    """
    parsed_inputs = [*nlp.pipe(data['input'])]
    parsed_outputs = [*nlp.pipe(data['output'])]
    return parsed_inputs, parsed_outputs

def calculate_difference(highest_value, current_value):
    """
    Function that calculates how much more data needs to be augmented to reach the cap of augmentation.
    Input: highest_value = the cap, current_value = the amount already present (both int)
    Returns: highest_value - current_value (negative when the cap is already exceeded)
    Raises: ValueError when either argument is not an int
    """
    if not (isinstance(highest_value, int) and isinstance(current_value, int)):
        # Raise instead of handing the exception class back to the caller,
        # where it would only surface later as an unrelated comparison error.
        raise ValueError(
            f'both values must be int, got {type(highest_value).__name__} '
            f'and {type(current_value).__name__}'
        )
    return highest_value - current_value


def _iter_swapped_pairs(data_input, data_texts, potential_values, candidates):
    """Yield one [new_input, new_text] pair per (text, value found in it, alternative value)."""
    for text_id, text in enumerate(data_texts):
        for current_value, input_pattern, text_pattern in candidates:
            if not text_pattern.search(text.text):
                continue

            for swapping_value in potential_values:
                # Compare raw value with raw value so a value is never "swapped" with itself.
                if swapping_value == current_value:
                    continue

                text_appropriate_value = swapping_value.replace('_', ' ')
                # Callables as replacement keep backslashes in a value literal.
                new_input = input_pattern.sub(lambda _match: swapping_value, data_input[text_id].text)
                new_text = text_pattern.sub(lambda _match: text_appropriate_value, text.text)
                yield [new_input, new_text]


def retrieve_and_change_values(data_input, data_texts, attribute_values, desired_additional_data_amount, chosen_attribute_count):
    """
    Create augmented (input, output) pairs by exchanging one attribute value for another.

    For every text that mentions a known value (in its text form: brackets
    removed, underscores as spaces) one new pair is made per alternative value.
    The raw value is replaced in the rdf input and the text form in the text.
    Passes over the data are repeated, so pairs can occur more than once, until
    the requested amount is reached or a pass produces nothing new.

    Input:
        data_input = sequence of objects with a .text attribute holding the rdf input
        data_texts = sequence of objects with a .text attribute holding the text,
                     index-aligned with data_input
        attribute_values = iterable of raw value strings of the chosen attribute
        desired_additional_data_amount = cap the attribute should reach
        chosen_attribute_count = how often the attribute occurs already
    Returns a dataframe with the columns 'input' and 'output' holding at most
    (cap - current count) rows; it is empty when nothing is needed or possible.
    """
    amount_to_be_created = calculate_difference(desired_additional_data_amount, chosen_attribute_count)
    potential_values = list(attribute_values)

    # Compile the patterns once. Values are escaped so regex metacharacters in
    # the data cannot break or alter the search, and the lookarounds demand a
    # whole-token match even when a value starts or ends with punctuation.
    candidates = []
    for raw_value in potential_values:
        text_form = raw_value.replace('(', '').replace(')', '').replace('_', ' ')
        if not text_form:
            continue
        candidates.append((
            raw_value,
            re.compile(r'(?<!\w)%s(?!\w)' % re.escape(raw_value)),
            re.compile(r'(?<!\w)%s(?!\w)' % re.escape(text_form)),
        ))

    rows = []
    while len(rows) < amount_to_be_created:
        rows_before_pass = len(rows)

        for pair in _iter_swapped_pairs(data_input, data_texts, potential_values, candidates):
            rows.append(pair)
            # make sure we do not create too much data
            if len(rows) >= amount_to_be_created:
                break

        # A pass without any new pair can never reach the target: stop instead of spinning forever.
        if len(rows) == rows_before_pass:
            break

    return pandas.DataFrame(rows, columns=['input', 'output'])
 

def get_attribute_count_from_dataframe(dataframe):
    """
    Count how often every attribute name occurs in the 'input' column.
    Input: Dataframe whose 'input' column holds stringified lists of
    'attribute | value' entries
    Returns a Counter keyed by the part in front of ' | '
    Side effect: the parsed lists are stored on the dataframe in a 'check' column
    """
    parsed_inputs = [literal_eval(serialized) for serialized in dataframe.loc[:, 'input']]

    dataframe['check'] = parsed_inputs

    return Counter(
        entry.split(' | ')[0]
        for parsed in dataframe['check']
        for entry in parsed
    )
    
def subject_data_addition(attribute_lists, subject, expected_data_amount):
    """
    Augment the training data of one subject for every listed attribute.

    Input:
        attribute_lists = list of attribute-name lists (one list per language)
        subject = subject name handed to read_csv_subject
        expected_data_amount = number of occurrences every attribute should reach
    Returns a new dataframe: the original training rows followed by the rows
    created for each attribute. The lists passed in are not modified.
    """
    subject_df = read_csv_subject(subject)

    # The original rows come first; created rows are collected and joined once at the end.
    frames = [subject_df.copy(deep=True)]

    for language_split_list in attribute_lists:
        # input contains several attributes, so starting with the lowest occurring attribute
        # lowers the chance of a data blow-up. reversed() walks the list backwards without
        # flipping the caller's list in place (which changed the order on every re-run).
        for attribute in reversed(language_split_list):
            attribute_mask = subject_df["input"].str.contains(f'{attribute}', na=False)
            # Explicit copy: the counting helper adds a column to the frame it receives,
            # which triggers SettingWithCopyWarning on a plain slice.
            subject_df_filtered = subject_df[attribute_mask].copy()
            if subject_df_filtered.empty:
                continue

            current_total_attribute_count = get_attribute_count_from_dataframe(subject_df_filtered)
            current_attribute_count = current_total_attribute_count[attribute]

            values = get_attribute_values(subject_df_filtered['input'], f"{attribute}")
            if len(values) < 2:
                # Swapping needs at least two different values.
                continue

            Prepped_data_input, Prepped_data_output = prepare_data_for_checklist(subject_df_filtered)

            additional_data = retrieve_and_change_values(Prepped_data_input, Prepped_data_output, values, expected_data_amount, current_attribute_count)

            if not additional_data.empty:
                frames.append(additional_data)

    # DataFrame.append no longer exists in pandas 2.x; concat is the supported way to stack frames.
    return pandas.concat(frames, ignore_index=True)



## Data Augmentation Training Set
Below are the records identified as occurring fewer than 250 times and therefore needing augmentation.

In [None]:
# Attribute names per subject and language that are to be augmented.
# Each list is passed to subject_data_addition in the next cell.

Sport_attributes_NL = [
    'assistName', 'chanceForType', 'coachName', 'teamStandings', 'assistType',
    'playerName', 'stadiumPlayed', 'numberOfPoints', 'homeAway', 'matchStreakNumber',
    'hasTiedTeam', 'matchStreakType', 'playerNationality', 'numberOfSeasonGoals', 'numberOfMatchesPlayed',
    'numberOfMatchGoals', 'tackleRecipientName', 'tackleGiverName', 'substituteName', 'chanceForNumber',
    'twiceYellowName', 'nextMatchTeam', 'redCardName', 'refereeName', 'injuredName',
    'playerAge', 'disallowedGoalType', 'injuryType', 'halfTimeScore', 'disallowedGoalName',
    'nextMatchHomeAway', 'suspendedName', 'nextMatchDate', 'chanceForNationality', 'formationTeam',
]

Sport_attributes_Eng = [
    'RBI', 'hasWonTeam', 'finalScore', 'outNumber', 'hasLostTeam', 'homeRunNumber',
    'baseNumber', 'strikeOutNumber', 'startsNumber', 'pitchResultNumber', 'competitionName', 'walkNumber',
    'locationPlayed', 'managerName', 'scoreNumber', 'onBaseNumber', 'pitchType', 'batterHitsTries',
    'pitcherSaveRecord', 'ERA', 'teamStandings', 'injuryType', 'homeAway', 'pitchNumber',
    'standingsGames', 'hasScored', 'battingAverage', 'teamRecord', 'earnedRunsNumber', 'throwDirection',
    'pitchCount', 'battersFacedNumber', 'pitchesTotalThrown', 'atBatNumber', 'gameTally', 'matchStreakNumber',
    'battingLineupNumber', 'umpireName', 'catchType', 'winningPercentage', 'matchStreakType', 'umpireType',
    'unearnedRunsNumber', 'baseStolen', 'strikeNumber', 'retireNumber', 'stealNumber', 'baseReachedNumber',
    'leftOnBase', 'basesRan', 'catcherName', 'isOut', 'errorNumber', 'numberOfStarts',
]

Stock_NL = [
    'stockChangePercentage',
    'timePoint',
    'exchangeName',
    'moneyAmount',
    'stockPoints',
]

Stock_Eng = [
    'PERCENT',
    'exchangeName',
    'LOC',
    'stockPoints',
    'MONEY',
    'TICKER',
    'ORDINAL',
]

Incident_NL = [
    'victimStatus', 'victimGender', 'victimVehicle', 'incidentType', 'location',
    'datetime', 'suspectVehicle', 'victimAge', 'victimAddress', 'cause',
    'victimDescription', 'victimAmount', 'suspectGender', 'suspectStatus', 'suspectAge',
    'suspectAddress', 'suspectDescription', 'suspectAmount', 'victimName',
]

Incident_Eng = [
    'victimNumber', 'victimStatus', 'accidentAddress', 'shootingType', 'accidentDate',
    'victimGender', 'victimAge', 'victimName', 'suspectName', 'hospitalName',
    'suspectStatus', 'suspectGender', 'suspectAge', 'suspectNumber', 'victimBased',
    'victimAge_Group', 'victimOccupation', 'numberOfRoundsFired', 'suspectWeapon', 'suspectVehicle',
    'suspectBased', 'personnelArrivedTime', 'shootingNumber', 'prisonName', 'suspectAge_Group',
    'victimRace', 'suspectRace', 'suspectDescription', 'suspectHeight', 'suspectOccupation',
    'victimVehicle', 'suspectWeight',
]

Weather_NL = [
    'windDirection', 'cloudAmount', 'weatherIntensity', 'weatherFrequency', 'weatherArea',
    'minimumTemperature', 'temperatureCelsius', 'maximumTemperature', 'weatherOccurringChance', 'compassDirection',
    'windChange', 'cloudType', 'cloudChange', 'weatherChange', 'windSpeedBft',
    'gustAmount', 'gustVelocity', 'temperatureChange', 'temperatureHotCold', 'precipitationAmount',
    'gustChange', 'windType', 'windTurning', 'snowAmount',
]

Weather_Eng = [
    'weatherIntensity', 'temperatureCelsius', 'temperatureHotCold', 'windAmount', 'weatherOccurringChance',
    'maximumTemperature', 'cloudAmount', 'weatherFrequency', 'temperatureChange', 'minimumTemperature',
    'windDirection', 'weatherChange', 'windSpeedBft', 'compassDirection', 'snowAmount',
    'windChange', 'gustAmount', 'precipitationAmount', 'cloudChange', 'weatherArea',
    'gustVelocity', 'gustChange', 'cloudType', 'sunSetTime', 'sunRiseTime',
]

In [None]:
# Augment every subject, Dutch list first and English list second, aiming at
# 250 occurrences per attribute.
Weather_added = subject_data_addition(
    attribute_lists=[Weather_NL, Weather_Eng],
    subject='Weather',
    expected_data_amount=250,
)
Stock_added = subject_data_addition(
    attribute_lists=[Stock_NL, Stock_Eng],
    subject='Stocks',
    expected_data_amount=250,
)
Sport_added = subject_data_addition(
    attribute_lists=[Sport_attributes_NL, Sport_attributes_Eng],
    subject='Sports',
    expected_data_amount=250,
)
Incident_added = subject_data_addition(
    attribute_lists=[Incident_NL, Incident_Eng],
    subject='Incidents',
    expected_data_amount=250,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Stack the augmented subjects into one training set (order: Sports, Weather,
# Incidents, Stocks). DataFrame.append was removed in pandas 2.0, so the frames
# are joined with pandas.concat; ignore_index renumbers the rows from 0.
full_added_data = pandas.concat([Sport_added, Weather_added, Incident_added], ignore_index=True)
full_added_data_train = pandas.concat([full_added_data, Stock_added], ignore_index=True)

In [None]:
# Remove the helper column 'check' (added by get_attribute_count_from_dataframe) if it is
# present. errors='ignore' keeps the cell runnable when the column was never created,
# instead of failing with a KeyError.
full_added_data_train = full_added_data_train.drop(columns=['check'], errors='ignore')

data is a dataframe
Entered try check


### Augmentation Check and Removal of poor additions

In [None]:
# Count the attribute names over the combined training data. The call also stores the
# parsed inputs in a 'check' column on the dataframe, which the removal loop below reads.
full_count = get_attribute_count_from_dataframe(full_added_data_train)
# Bare expression: shows the counter as the output of the notebook cell.
full_count

Counter({'batterName': 7974,
         'finalScore': 1234,
         'gameTally': 329,
         'hasLostTeam': 820,
         'hasWonTeam': 992,
         'hitNumber': 1421,
         'homeRunNumber': 616,
         'matchDate': 1816,
         'pitchResult': 4953,
         'runNumber': 1429,
         'competitionName': 695,
         'baseReachedNumber': 353,
         'pitchResultNumber': 789,
         'batterHitsTries': 401,
         'strikingType': 1514,
         'teamName': 10671,
         'winLossType': 1689,
         'umpireName': 565,
         'umpireType': 464,
         'locationPlayed': 616,
         'inningNumber': 3282,
         'pitcherName': 5923,
         'scoreNumber': 759,
         'scoreTally': 829,
         'strikeTrajectory': 1073,
         'gameNumber': 1605,
         'RBI': 467,
         'outNumber': 1191,
         'presidentName': 1,
         'fielderName': 1010,
         'fielderPosition': 1371,
         'baseNumber': 861,
         'throwDirection': 360,
         'inning

In [None]:
# Names regarded as poor additions. The removal loop compares them with the part
# in front of ' | ' of every entry and discards the rows that match.
poor_additions = [
    'single', '127th', 'three', '87th', 'eight', 'nine-RBI', '25', 'doubles',
    '12', '299', 'RBI_singles', '83', 'four', '102nd', 'triple', 'seven',
    '90', '10', '109th', 'groundout', '11', 'one', '25th', 'six',
    'singles', '100', 'first', 'RBI-single', '94', '102', 'an', 'second',
    '15', '19', 'triples', 'double', '13', 'Seven', '31', '48',
    '39', 'two', 'RBIs', 'nine', 'four-RBI', 'no', 'five', '68',
]

In [None]:
# Discard every row in which at least one entry has a name listed in poor_additions.
# Reads the parsed inputs from the 'check' column (added by get_attribute_count_from_dataframe).
# One boolean mask replaces the row-by-row drop: a row with two poor entries was dropped
# twice (KeyError on the second attempt), and dropping by enumerate position only hit the
# right row while the index happened to run 0..n-1.
poor_attribute_names = set(poor_additions)
rows_to_keep = [
    not any(entry.split(' | ')[0] in poor_attribute_names for entry in parsed_input)
    for parsed_input in full_added_data_train['check']
]
full_added_data_train = full_added_data_train[rows_to_keep]

In [None]:
# Store the cleaned, augmented training data; iteration 0 selects the 'Train' label.
write_to_csv(full_added_data_train, 0)

In [None]:
# Read Test.csv from the Augmented_data folder for the attribute count below.
test = pandas.read_csv("E:/ArriaThesis/MscThesis/Data/Augmented_data/Test.csv")

In [None]:
# Count the attribute names in the test data (this also adds a 'check' column to `test`).
full_count_test = get_attribute_count_from_dataframe(test)

['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['accidentAddress | Greyhound_bus', 'accidentDate | around_1:30_a.m.', 'shootingType | shooting', 'location | Lebec', 'datetim

In [None]:
# Bare expression: shows the test-set counter as the output of the notebook cell.
full_count_test

Counter({'victimGender': 41,
         'victimNumber': 53,
         'victimStatus': 92,
         'shootingType': 26,
         'victimAge_Group': 8,
         'accidentAddress': 12,
         'accidentDate': 5,
         'location': 49,
         'datetime': 19,
         'suspectStatus': 36,
         'suspectWeapon': 3,
         'ORG': 45,
         'victimName': 22,
         'suspectVehicle': 13,
         'victimAge': 33,
         'takenToHospital': 14,
         'hospitalName': 7,
         'incidentType': 240,
         'cause ': 1,
         'suspectDescription': 7,
         'suspectGender': 12,
         'suspectRace': 1,
         'cause': 46,
         'competitionName': 11,
         'teamName': 74,
         'suspectName': 7,
         'supsectStatus': 1,
         'suspectNumber': 4,
         'personnelArrivedTime': 4,
         'victimBased': 6,
         'suspectOccupation': 1,
         'suspectAge_Group': 1,
         'pitcherName': 15,
         'incedentType': 2,
         'gameTally': 1,
    