Author: Vikhy
<br>
Date: 22 March, 2022

## Data Augmentation
1. Data Expansion
2. Synonym Expansion
3. Phrase Expansion
4. Query Reformulation

In [38]:
import pandas as pd

data = pd.read_csv("Datasets/corrected_tweets.csv")

target = data["Target"]
tweets = data["Tweet"]
stance = data["Stance"]

In [39]:
print(data)

                        Target  \
0                      Atheism   
1                      Atheism   
2                      Atheism   
3                      Atheism   
4                      Atheism   
...                        ...   
2809  Legalization of Abortion   
2810  Legalization of Abortion   
2811  Legalization of Abortion   
2812  Legalization of Abortion   
2813  Legalization of Abortion   

                                                  Tweet   Stance  
0     dear lord thank u for all of ur blessing forgi...  AGAINST  
1     Blessed are the peacemaker for they shall be c...  AGAINST  
2     I am not conformed to this world I am transfor...  AGAINST  
3     salad should be prayed with focus and understa...  AGAINST  
4     And stay in your house and do not display ours...  AGAINST  
...                                                 ...      ...  
2809  Theres a law protecting unborn eagle but not h...  AGAINST  
2810  I am 1 in 3 I have had an abortion AbortionOnD...

In [40]:
# Synonym Expansion

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet


def synonym_expansion(target):
    tokens = word_tokenize(target)
    extended_tokens = []
    for i, token in enumerate(tokens):
        synonyms = []
        for synonym in wordnet.synsets(token):
            for lemma in synonym.lemmas():
                synonyms.append(lemma.name())
                if len(synonyms) >= 3:
                    break
            if len(synonyms) >= 3:
                    break
        if synonyms:
            idx = tokens.index(token)
            # extended_tokens = tokens + [synonyms[1]]
            tokens[idx] = synonyms[2]
            extended_tokens = tokens
    return " ".join(extended_tokens)

extended_target = target.apply(synonym_expansion)
print(extended_target.unique())

['atheism' 'climate modification be A real care'
 "women's_liberationist move" 'Sir_Edmund_Hillary Hilary_Rodham_Clinton'
 'legitimation of abortion']


In [41]:
# Phrase Expansion

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

nltk.download("wordnet")

def phrase_expansion(target):
    tokens = word_tokenize(target)
    definitions = []
    for i, token in enumerate(tokens):
        synsets = wordnet.synsets(token)
        if synsets:
            synset = synsets[0] # choose the first synset
            definition = synset.definition()
            definitions.append(definition)
        else:
            definition = ""
            definitions.append(definition)
    return " ". join(definitions)


definitions = target.apply(phrase_expansion)
print(definitions.unique())

['the doctrine or belief that there is no God'
 'the weather in some location averaged over some long period of time an event that occurs when something passes from one state or phase to another have the quality of being; (copula, used with an adjective or a predicate noun) a metric unit of length equal to one ten billionth of a meter (or 0.0001 micron); used to specify wavelengths of electromagnetic radiation any rational or irrational number something that interests you because it is important or affects you'
 'a supporter of feminism a change of position that does not entail a change of location'
 'New Zealand mountaineer who in 1953 first attained the summit of Mount Everest with his Sherpa guide Tenzing Norgay (born in 1919) wife of President Clinton and later a woman member of the United States Senate (1947-)'
 'the act of making lawful  termination of pregnancy']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vikhy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Query Reformulation

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

nltk.download("wordnet")

def query_reformulation(tweet):
    words = word_tokenize(tweet)
    reformulated_words = []
    for word in words:
        synsets = wordnet.synsets(word)
        if synsets:
            reformulated_word = synsets[0].lemmas()[0].name()
            reformulated_words.append(reformulated_word)
        else:
            reformulated_words.append(word)
    reformulated_query = " ".join(reformulated_words)
    return reformulated_query

reformulated_tweets = tweets.apply(query_reformulation)
print(tweets)
print(reformulated_tweets)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vikhy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0       dear lord thank u for all of ur blessing forgi...
1       Blessed are the peacemaker for they shall be c...
2       I am not conformed to this world I am transfor...
3       salad should be prayed with focus and understa...
4       And stay in your house and do not display ours...
                              ...                        
2809    Theres a law protecting unborn eagle but not h...
2810    I am 1 in 3 I have had an abortion AbortionOnD...
2811    How dare you say my sexual preference is a cho...
2812    Equal right for those born that way no right f...
2813    POTUS seal his legacy i 12 do win The got agen...
Name: Tweet, Length: 2814, dtype: object
0       beloved Godhead thank uracil for all of Ur ble...
1       bless are the conciliator for they shall beryl...
2       iodine americium not conform to this universe ...
3       salad should beryllium pray with focus and und...
4       And stay inch your house and bash not display ...
                              .

In [43]:
data = pd.concat([target, extended_target, definitions, tweets, reformulated_tweets, stance], 
                 keys=["target", "extended_target", "target_definition", "tweets", "reformulated_tweets", "stance"], axis=1)
data.to_csv("Datasets/augmented_tweets.csv", index=False)

# Restructure the data

In [47]:
import pandas as pd

data = pd.read_csv("Datasets/augmented_tweets.csv")
target = data["target"]
extended_target = data["extended_target"]
target_definition = data["target_definition"]
tweets = data["tweets"]
reformulated_tweets = data["reformulated_tweets"]

stance = data["stance"]


tweets_combined = tweets.str.cat(reformulated_tweets, sep=" can be paraphrased as ")
targets_combined = target.str.cat(extended_target, sep=" which is same as ")
targets_definitions_combined = targets_combined.str.cat(target_definition, sep=" which is defined as ")
combined_data = tweets_combined.str.cat(targets_definitions_combined, sep=" is related to ")



final_data = pd.concat([combined_data, stance], keys=["Data", "Stance"], axis=1)
final_data.to_csv("Datasets/restructured_data.csv", index=False)

KeyError: 'extended_target'