In [None]:
## To enable checklist extension ---> jupyter nbextension enable --py --sys-prefix checklist.viewer


In [3]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [2]:
editor = Editor()

In [4]:
ret = editor.template('This is a {adj} movie.', adj=['good', 'great', 'awesome', 'excellent'])
ret.data

['This is a good movie.',
 'This is a great movie.',
 'This is a awesome movie.',
 'This is a excellent movie.']

In [5]:
ret = editor.template('This is a {adj} movie.', adj=['good', 'great', 'awesome', 'excellent'], labels=1,
                      meta=True, save=True)
ret.labels

[1, 1, 1, 1]

In [6]:
editor.add_lexicon('adj', ['good', 'bad', 'great', 'terrible'])

In [7]:
ret = editor.template('{adj} is not the same as {adj2}', remove_duplicates=True)
ret.data[:4]

['bad is not the same as good',
 'great is not the same as good',
 'terrible is not the same as good',
 'good is not the same as bad']

### Masked Language Model Suggestion

In [9]:
ret = editor.template('This is {a:adj} {mask}.', remove_duplicates=True)
ret.data[:5]

NOTE: Redirects are currently not supported in Windows or MacOs.


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


['This is a good idea.',
 'This is a good example.',
 'This is a good sign.',
 'This is a good thing.',
 'This is a good one.']

In [10]:
ret = editor.template('This is {a:adj} {mask} {mask}.', remove_duplicates=True)
ret.data[:5]

['This is a good history lesson.',
 'This is a good news story.',
 'This is a good chess move.',
 'This is a good programming language.',
 'This is a good chess piece.']

In [11]:
editor.suggest('This is {a:adj} {mask}.')[:5]

['idea', 'example', 'sign', 'thing', 'one']

In [12]:
editor.suggest('This is {a:adj} {mask} {mask}.')[:5]

[('history', 'lesson'),
 ('news', 'story'),
 ('chess', 'move'),
 ('programming', 'language'),
 ('chess', 'piece')]

In [None]:
# editor.visual_suggest('This is {a:mask} movie.') --> This gets a visual representation, but ruins notebook coloring

### Wordnet

In [8]:
editor.synonyms('My drink is hot.', 'hot')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 9 files to the new cache system


  0%|          | 0/9 [00:00<?, ?it/s]

NOTE: Redirects are currently not supported in Windows or MacOs.
  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


['spicy', 'raging']

In [15]:
editor.synonyms('John gave an assist.', 'assist')

['aid', 'assistance', 'attend']

In [10]:
editor.antonyms('My drink is hot.', 'hot')

['cold']

In [11]:
editor.antonyms('It is hot outside.', 'hot')

['cold']

In [12]:
editor.hypernyms('My dog eats other animals.', 'dog')

['animal']

In [13]:
editor.hyponyms('My animal eats other animals.', 'animal')[:5]

['dog', 'pet', 'baby', 'puppy', 'kitten']

In [14]:
# Hyponyms of hypernym
editor.related_words('My dog eats other animals.', 'dog')[:5]

['pet']

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [21]:
data = ['John is a very smart person, he lives in Ireland.',
        'Mark Stewart was born and raised in Chicago',
        'Luke Smith has 3 sisters.',
        'Mary is not a nurse.',
        'Julianne is an engineer.',
        'My brother Andrew used to be a lawyer.']

In [22]:
pdata = list(nlp.pipe(data))

In [32]:
pdata

[John is a very smart person, he lives in Ireland.,
 Mark Stewart was born and raised in Chicago,
 Luke Smith has 3 sisters.,
 Mary is not a nurse.,
 Julianne is an engineer.,
 My brother Andrew used to be a lawyer.]

In [23]:
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, n=3, last_only=True, meta=True)
ret.data[0][1:], ret.meta[0][1:]

(['Mark King was born and raised in Chicago',
  'Mark Thomas was born and raised in Chicago',
  'Mark Adams was born and raised in Chicago'],
 [('Stewart', 'King'), ('Stewart', 'Thomas'), ('Stewart', 'Adams')])

In [39]:
ret = Perturb.perturb(pdata[1:4], Perturb.change_names, nsamples=6, n=4, last_only=True, meta=True)
#ret.data[0][1:], ret.meta[0][1:]
ret.data

[['Mark Stewart was born and raised in Chicago',
  'Mark Johnson was born and raised in Chicago',
  'Mark Bennett was born and raised in Chicago',
  'Mark Moore was born and raised in Chicago',
  'Mark Mitchell was born and raised in Chicago'],
 ['Luke Smith has 3 sisters.',
  'Luke Moore has 3 sisters.',
  'Luke Powell has 3 sisters.',
  'Luke King has 3 sisters.',
  'Luke Wright has 3 sisters.']]

In [216]:
goalName_list = ['Namens Roda mikte Mike van Duinen raak.',
                'Vitessenaar Zhang mikte raak en bezorgde zijn club de zege: 1-2.',
                "Guus Til lukte dat wel.", 
                "Mateusz Klich wist er goed gebruik van te maken en scoorde vervolgens de enige treffer van de wedstrijd.",
                "Op 20 oktober was die ploeg in Alkmaar nog met 1-2 te sterk voor AZ, ook toen scoorde Mühren.",
                "De bezoekers kwamen onverwacht op voorsprong door een schot van Jürgen Locadia",
                "Youness Mokhtar was met twee doelpunten de gevierde man",
                "Niet veel later was het Ziyech die Ajax in veilige haven bracht.",
                "Nicolai Brock-Madsen kopte raak.",
                "Richairo Zivkovic en Nacer Barazite waren trefzeker in de Galgenwaard.",
                "Robbie Haemhouts besliste het duel.",
                "Aan de andere kant deed de gelegenheidsspits Brown dat wel bij Roda-doelman Benjamin van Leer.", 
                ]

To_be_switched = ["Mike van Duinen", "Zhang", "Guus Til", "Mateusz Klich", "Mühren", "Jürgen Locadia", "Youness Mokhtar", "Ziyech", "Nicolai Brock-Madsen", "Richairo Zivkovic", "Robbie Haemhouts" ,"Brown"]

positions = ["['goalName | Mike van Duinen', 'teamName | Roda']", "['finalScore | 1-2', 'goalName | Zhang', 'teamName | Vitessenaar']", "['goalName | Guus Til', 'hasScored | True']", "['goalName | Mateusz Klich', 'goalScore | enige_treffer']",
            "['finalScore | 1-2', 'goalName | Mühren', 'hasLostTeam | AZ', 'matchDate | 20_oktober', 'stadiumPlayed | Alkmaar']", "['goalName | Jürgen Locadia', 'goalScore | voorsprong', 'goalType | schot', 'teamName | bezoekers']", 
            "['goalName | Youness Mokhtar', 'numberOfMatchGoals | twee', 'stadiumPlayed | hoofdstad_van_Overijssel']", "['goalName | Ziyech', 'goalScore | in_veilige_haven', 'teamName | Ajax']", "['goalName | Nicolai Brock-Madsen', 'goalType | kopte']",
            "['goalName | Nacer Barazite', 'goalName | Richairo Zivkovic', 'stadiumPlayed | Galgenwaard']", "['goalName | Robbie Haemhouts', 'goalScore | besliste']", "['goalName | Brown', 'goalkeeperName | Benjamin_van_Leer', 'positionOfPlayer | gelegenheidsspits']" ]

In [4]:
import re 

def create_new_data(original_texts, found_names):
    """
    Create additional records with slightly tweaked values
    """
    new_strings = []

    for text in original_texts:
        # p is names that occur in the match
        for name in found_names:
            if re.search(r'\b%s\b' % name, text):
                new_strings.extend([re.sub(r'\b%s\b' % name, new_name, text) for new_name in found_names if name != new_name])
                #ret_meta.extend([(p, new_name) for new_name in found_names if name != new_name])
    return new_strings

def change_attribute_value_pairs(x, switched_values):
    ret = []
    ret_meta = []

    #print(x)

    # Pairings are the attribute value pairs
    for pairings in x:
        #print(pairings)
        # p is names that occur in the match
        for p in switched_values:
            #print(f"p   {p}         pairings    {pairings}")
            if re.search(r'\b%s\b' % p, pairings):
                #print("MATCH")
                ret.extend([re.sub(r'\b%s\b' % p, p2, pairings) for p2 in switched_values if p != p2])
                ret_meta.extend([(p, p2) for p2 in switched_values if p != p2])
    return ret

def create_full_data(attribute_value_pair, changed_texts):
    full_list = []
    for text_id, texts in enumerate(changed_texts):

        #print(len(attribute_value_pair), len(changed_texts))
        # print(texts)
        # print(text_id)
        # print(attribute_value_pair[text_id])
        combined_data = (attribute_value_pair[text_id], texts)
        full_list.append(combined_data)
    return full_list

In [226]:
extra_texts = create_new_data(goalName_list, To_be_switched)

check = change_attribute_value_pairs(positions, To_be_switched)
full_listing = create_full_data(check, extra_texts)
full_listing

# New Tests

In [16]:
import pandas
import spacy
from ast import literal_eval
import re 
from collections import Counter
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
import numpy as np
import openpyxl

nlp = spacy.load('en_core_web_sm')

In [199]:
def read_csv_subject(subject):
    """
    Quick function to retrieve the subject data from csvs. Data will contain both dutch and english. 
    Note that this will always only return the training set
    """
    return pandas.read_csv(f'E:/ArriaThesis/MscThesis/Data/Cleaned_Continuous/{subject}/Train.csv')

def filter_on_attribute_in_input(data, AttributeName):
    """
    Filters a dataframe based on a chosen attribute
    Input: Dataframe, chosen attribute name
    Returns filtered dataframe
    """
    return data[data['input'].str.contains(f"{AttributeName}")]


def prepare_attribute_values(data_list):
    """
    Gets the valeus that correspond with the chosen attribute. This list is used when changing out the data
    Input: Dataframe input containing string of rdf values
    Returns a dataframe of inputs
    """
    
    RDF = []
    #RDF = pandas.DataFrame()
    for rdf_iteration, rdf in enumerate(data_list):
        #print(isinstance(rdf, str))
        RDF.append(literal_eval(rdf))

    return RDF


def get_attribute_values(data, chosen_attribute):
    data = prepare_attribute_values(data)

    stored_values = []

    # Get the row
    for element in data:
        # loop through the attributes of each row
        for attribute in element:
            if chosen_attribute in attribute:
                split_attributes = attribute.split(' | ')
                value = split_attributes[1]
                if "'" not in value:
                    stored_values.append(value)

    return set(stored_values)


def prepare_data_for_checklist(data):
    """
    Function that transforms the data so that checklist methods can be used.
    Input: Dataframe with columns input and output
    Returns: An input and an output dataframe
    """
    data_input = list(nlp.pipe(data['input']))
    data_output = list(nlp.pipe(data['output']))
    return data_input, data_output


Taken:

- Waardes kappen, zodat ik niet oneindig veel krijg
- problemen bij kappen, ik zal nooit perfecte hoeveelheden krijgen --> mss bij laagste beginnen en dan omhoog werken?

In [146]:
def calculate_difference(highest_value, current_value):
    """
    Function that calculates how much more data is needed to be augmented    
    """
    if isinstance(highest_value, int) and isinstance(current_value, int):
        return highest_value - current_value
    else:
        return ValueError 


def retrieve_and_change_values(data_input, data_texts, attribute_values, desired_additional_data_amount, chosen_attribute_count):
    # create a clean dataframe
    potential_values = attribute_values

    amount_to_be_created = calculate_difference(desired_additional_data_amount, chosen_attribute_count)

    clean_df = pandas.DataFrame({'input' : [], 'output': []})


    while (len(clean_df['input']) < amount_to_be_created):
    # Start by looping through the texts, keeping the id in tow
        for text_id, text in enumerate(data_texts):
            # p is names that occur in the match
            #print(text)
            for attributes in potential_values:
                # if attributes
                #print(attributes)
                if '(' in attributes:
                    attributes = attributes.replace('(', '')
                if ')' in attributes:
                    attributes = attributes.replace(')', '')
                #if the attribute is found in the text, we create new strings with swapped values
                if '_' in attributes:
                    attributes = attributes.replace('_', ' ')
                   
                try: 
                    if re.search(r'\b%s\b' % attributes, text.text):
                        
                        for swapping_value in potential_values:
                            # if the value is not found in the text, create a new text with the different value
                            if swapping_value != attributes:
                                # this REGEX reads as find the exact attribute in the input text that corresponds with the looked at text and replace it with the chosen swapping value
                                new_input = re.sub(r'\b%s\b' % attributes, swapping_value, data_input[text_id].text)
                                
                                text_appropriate_value = swapping_value.replace('_', ' ')
                                # this REGEX reads as find the exact attribute in the text.text and replace it with text_appropriate_value
                                new_text = re.sub(r'\b%s\b' % attributes, text_appropriate_value, text.text)
                                
                                full_new_input = [new_input, new_text]

                                # make sure we do not create too much data
                                if len(clean_df['input']) < amount_to_be_created:
                                    # Now add the new data to the created dataframe
                                    clean_df.loc[len(clean_df)] = full_new_input
                                else:
                                    break
                except:
                    #continue
                    print(f'text.text   {text.text}')
                    print(f'attributes   {attributes}')

    return clean_df
 

def get_attribute_count_from_dataframe(dataframe):
    prep_list = []
    total_attributes = []

    for element_id, element in enumerate(dataframe.loc[:, 'input']):
        prep_list.append(literal_eval(element))

    dataframe['check'] = prep_list

    for inputs in dataframe['check']:
        for attributes in inputs:
            
            #print(f'attributes  {attributes}')
            split_attributes = attributes.split(' | ')
            total_attributes.append(split_attributes[0])
   

    return Counter(total_attributes)

## Data Augmentation Training Set

In [4]:
Sport_attributes_NL = ['assistName', 'chanceForType', 'coachName','teamStandings',
'assistType','playerName', 'stadiumPlayed','numberOfPoints','homeAway','matchStreakNumber','hasTiedTeam','matchStreakType','playerNationality','numberOfSeasonGoals',
'numberOfMatchesPlayed','numberOfMatchGoals','tackleRecipientName','tackleGiverName','substituteName','chanceForNumber','twiceYellowName','nextMatchTeam','redCardName','refereeName',
'injuredName','playerAge','disallowedGoalType','injuryType','halfTimeScore','disallowedGoalName','nextMatchHomeAway','suspendedName', 'nextMatchDate','chanceForNationality','formationTeam']

Sport_attributes_Eng = ['RBI', 'hasWonTeam', 'finalScore', 'outNumber', 'hasLostTeam', 'homeRunNumber', 'baseNumber',
 'strikeOutNumber', 'startsNumber', 'pitchResultNumber', 'competitionName', 'walkNumber', 'locationPlayed', 'managerName', 'scoreNumber', 'onBaseNumber', 'pitchType', 'batterHitsTries', 'pitcherSaveRecord', 'ERA', 'teamStandings',
 'injuryType', 'homeAway', 'pitchNumber', 'standingsGames', 'hasScored', 'battingAverage', 'teamRecord', 'earnedRunsNumber', 'throwDirection', 'pitchCount', 'battersFacedNumber', 'pitchesTotalThrown', 'atBatNumber', 'gameTally', 'matchStreakNumber', 
 'battingLineupNumber', 'umpireName', 'catchType', 'winningPercentage', 'matchStreakType', 'umpireType', 'unearnedRunsNumber', 'baseStolen', 'strikeNumber',
 'retireNumber', 'stealNumber', 'baseReachedNumber', 'leftOnBase', 'basesRan', 'catcherName', 'isOut', 'errorNumber', 'numberOfStarts']

Stock_NL = ['stockChangePercentage', 'timePoint', 'exchangeName', 'moneyAmount', 'stockPoints']

Stock_Eng = ['PERCENT', 'exchangeName', 'LOC', 'stockPoints', 'MONEY', 'TICKER', 'ORDINAL']

Incident_NL = ['victimStatus', 'victimGender', 'victimVehicle', 'incidentType', 'location', 'datetime', 'suspectVehicle', 'victimAge', 'victimAddress', 'cause', 'victimDescription',
                'victimAmount', 'suspectGender', 'suspectStatus', 'suspectAge', 'suspectAddress', 'suspectDescription', 'suspectAmount', 'victimName']

Incident_Eng = ['victimNumber', 'victimStatus', 'accidentAddress', 'shootingType', 'accidentDate', 'victimGender', 'victimAge', 'victimName', 'suspectName', 'hospitalName', 'suspectStatus', 'suspectGender', 'suspectAge', 
                'suspectNumber', 'victimBased', 'victimAge_Group', 'victimOccupation', 'numberOfRoundsFired', 'suspectWeapon', 'suspectVehicle', 'suspectBased', 'personnelArrivedTime', 'shootingNumber', 'prisonName',
                 'suspectAge_Group', 'victimRace', 'suspectRace', 'suspectDescription', 'suspectHeight', 'suspectOccupation', 'victimVehicle', 'suspectWeight']

Weather_NL = ['windDirection', 'cloudAmount', 'weatherIntensity', 'weatherFrequency', 'weatherArea', 'minimumTemperature', 'temperatureCelsius', 'maximumTemperature', 'weatherOccurringChance', 'compassDirection', 'windChange', 
                'cloudType', 'cloudChange', 'weatherChange', 'windSpeedBft', 'gustAmount', 'gustVelocity', 'temperatureChange', 'temperatureHotCold', 'precipitationAmount', 'gustChange', 'windType', 'windTurning', 'snowAmount']

Weather_Eng = ['weatherIntensity', 'temperatureCelsius', 'temperatureHotCold', 'windAmount', 'weatherOccurringChance', 'maximumTemperature', 'cloudAmount', 'weatherFrequency', 'temperatureChange', 'minimumTemperature', 'windDirection', 
                'weatherChange', 'windSpeedBft', 'compassDirection', 'snowAmount', 'windChange', 'gustAmount', 'precipitationAmount', 'cloudChange', 'weatherArea', 'gustVelocity', 'gustChange', 'cloudType', 'sunSetTime', 'sunRiseTime']

In [170]:
def subject_data_addition(attribute_lists, subject, expected_data_amount):
    subject_df = read_csv_subject(subject)

    subject_df_expanded = subject_df.copy(deep=True)
    # input contains several attributes, so starting with lowest occuring attribute will lower the chance of a data blow-upp
    
    for language_split_list in attribute_lists:

        language_split_list.reverse()

        for attribute in language_split_list:
            #print(f'current attribute   {attribute}')
            subject_df_filtered = subject_df[subject_df["input"].str.contains(f'{attribute}')==True]

            current_total_attribute_count = get_attribute_count_from_dataframe(subject_df_filtered)
            current_attribute_count = current_total_attribute_count[attribute]

        # #     # get the values corresponding with the chosen attribute
            values = get_attribute_values(subject_df_filtered['input'], f"{attribute}")

            Prepped_data_input, Prepped_data_output = prepare_data_for_checklist(subject_df_filtered)

            additional_data = retrieve_and_change_values(Prepped_data_input, Prepped_data_output, values, expected_data_amount, current_attribute_count)
            
            #print(f'before  {len(subject_df_expanded)}')
            subject_df_expanded = subject_df_expanded.append(additional_data, ignore_index=True)
            #print(f'after {len(subject_df_expanded)}')
    return subject_df_expanded

def write_to_csv(data, iteration):
    """
    Data = dataset
    Iteration = iteration to determine the dataset split
    data_type = is the dataset RDF or text
    """
    labels = ['Train', 'Dev', 'Test']

    if isinstance(data, pandas.DataFrame):
        print(f'data is a dataframe')

    try:
        print("Entered try check")
        if (os.path.exists(f'E:/ArriaThesis/MscThesis/Data/Augmented_data/{labels[iteration]}') == False) :
            save_path = 'E:/ArriaThesis/MscThesis/Data/Augmented_data/' 
            name_of_file = f'{labels[iteration]}'
            output_file = os.path.join(save_path,f'{name_of_file}.csv')
            data.to_csv(output_file, index=False, encoding = 'utf-8')
    except:
        print(f'file for {labels[iteration]} already exists')

In [6]:
Weather_added = subject_data_addition([Weather_NL, Weather_Eng], 'Weather', 250) 
Stock_added = subject_data_addition([Stock_NL, Stock_Eng], 'Stocks', 250)
Sport_added = subject_data_addition([Sport_attributes_NL, Sport_attributes_Eng], 'Sports', 250)
Incident_added = subject_data_addition([Incident_NL, Incident_Eng], 'Incidents', 250)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
full_added_data = Sport_added.append(Weather_added, ignore_index=True)
full_added_data = full_added_data.append(Incident_added, ignore_index=True)
full_added_data_train = full_added_data.append(Stock_added, ignore_index=True)

In [10]:
full_added_data_train = full_added_data_train.drop(columns=['check'])
write_to_csv(full_added_data_train, 0)

data is a dataframe
Entered try check


### Augmentation Check and Removal of poor additions

In [8]:
full_count = get_attribute_count_from_dataframe(full_added_data_train)
full_count

Counter({'batterName': 7974,
         'finalScore': 1234,
         'gameTally': 329,
         'hasLostTeam': 820,
         'hasWonTeam': 992,
         'hitNumber': 1421,
         'homeRunNumber': 616,
         'matchDate': 1816,
         'pitchResult': 4953,
         'runNumber': 1429,
         'competitionName': 695,
         'baseReachedNumber': 353,
         'pitchResultNumber': 789,
         'batterHitsTries': 401,
         'strikingType': 1514,
         'teamName': 10671,
         'winLossType': 1689,
         'umpireName': 565,
         'umpireType': 464,
         'locationPlayed': 616,
         'inningNumber': 3282,
         'pitcherName': 5923,
         'scoreNumber': 759,
         'scoreTally': 829,
         'strikeTrajectory': 1073,
         'gameNumber': 1605,
         'RBI': 467,
         'outNumber': 1191,
         'presidentName': 1,
         'fielderName': 1010,
         'fielderPosition': 1371,
         'baseNumber': 861,
         'throwDirection': 360,
         'inning

In [41]:
poor_additions = [ 'single','127th','three','87th','eight','nine-RBI','25','doubles','12','299','RBI_singles','83','four','102nd','triple','seven','90','10','109th','groundout','11','one','25th','six',
'singles','100','first','RBI-single','94','102','an','second','15','19','triples','double','13','Seven','31','48','39','two','RBIs','nine','four-RBI','no','five','68']

In [48]:
for element_id, element in enumerate(full_added_data_train['check']):
    for l in element:
        split_attributes = l.split(' | ')
        attribute = split_attributes[0]
        if attribute in poor_additions: 
            full_added_data_train = full_added_data_train.drop(element_id)

In [241]:
test = pandas.read_excel("E:/ArriaThesis/MscThesis/Data/Better_input_data/Test_elaborated.xlsx", sheet_name='Test_elongated_subset')

In [182]:
test['input'][0]


for elements_id, elements in enumerate(test['input']):
    if "'" in elements:
        test['input'][elements_id] = elements.replace("'", "")

In [216]:
test['input'][0]

"['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']"

In [225]:
check[0]



['victimGender | men',
 'victimNumber | three',
 'victimNumber | two',
 'victimStatus | critical_injuries',
 'shootingType | shot']

In [226]:
for element_id, element in enumerate(check[0]):
    print(element)
    check[0][element_id] = tuple(element)

victimGender | men
victimNumber | three
victimNumber | two
victimStatus | critical_injuries
shootingType | shot


In [None]:
for elements_id, elements in enumerate(test['input']):
    print(elements)
    #test['input'][elements_id] = elements.replace("'", "")

In [242]:
test['input'] = prepare_attribute_values(test['input'])

In [243]:
write_to_csv(test, 1)

data is a dataframe
Entered try check


In [173]:
test = pandas.read_csv("E:/ArriaThesis/MscThesis/Data/Augmented_data/Test.csv")

In [174]:
full_count_test = get_attribute_count_from_dataframe(test)

['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimGender | men', 'victimNumber | three', 'victimNumber | two', 'victimStatus | critical_injuries', 'shootingType | shot']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['victimAge_Group | 19_to_21', 'victimGender | men', 'victimStatus | wonded']
['accidentAddress | Greyhound_bus', 'accidentDate | around_1:30_a.m.', 'shootingType | shooting', 'location | Lebec', 'datetim

In [175]:
full_count_test

Counter({'victimGender': 41,
         'victimNumber': 53,
         'victimStatus': 92,
         'shootingType': 26,
         'victimAge_Group': 8,
         'accidentAddress': 12,
         'accidentDate': 5,
         'location': 49,
         'datetime': 19,
         'suspectStatus': 36,
         'suspectWeapon': 3,
         'ORG': 45,
         'victimName': 22,
         'suspectVehicle': 13,
         'victimAge': 33,
         'takenToHospital': 14,
         'hospitalName': 7,
         'incidentType': 240,
         'cause ': 1,
         'suspectDescription': 7,
         'suspectGender': 12,
         'suspectRace': 1,
         'cause': 46,
         'competitionName': 11,
         'teamName': 74,
         'suspectName': 7,
         'supsectStatus': 1,
         'suspectNumber': 4,
         'personnelArrivedTime': 4,
         'victimBased': 6,
         'suspectOccupation': 1,
         'suspectAge_Group': 1,
         'pitcherName': 15,
         'incedentType': 2,
         'gameTally': 1,
    