In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
import json
from tqdm import tqdm
import pandas as pd

# Preparing Datasets for BERT-single

For BERT-single, each `location-pair` will have a different 3-class classification model, thus we need to construct a different training and validation set for each `location-pair`. 

## Preparing Training Sets

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    for aspect in aspects:
        df = pd.DataFrame({'id': [], 'text': [], 'sentiment': []})
        
        ii = 0
        for each_example in training_set:
            id = str(int(each_example['id']))
            text = each_example['text'].strip()
            
            # If `location` is present in the text, only then iterate over the  
            # list of opinions to find suitable `location-aspect` datapoints.

            if location in text:
                aspect_found = False
                
                for opinion in each_example['opinions']:
                    # Checking if the current example contains a sentiment
                    # related to `location-aspect`
                    
                    if opinion['target_entity'] == location and opinion['aspect'] == aspect:
                        df.loc[ii] = [id, text, opinion['sentiment']]
                        aspect_found = True
                        ii += 1
                        break
                
                # If no sentiment is found for `location-asppect` in current 
                # example, then add a datapoint with None.
                
                if not aspect_found:
                    df.loc[ii] = [id, text, 'None']
                    ii += 1

        df.to_csv('/content/drive/My Drive/SentiHood/Bert-single/TrainingData/' + str(location) + str(aspect) + '.csv', index=False)
        print(f"{location}{aspect} DONE!\tLength = {ii}")

LOCATION1dining DONE!	Length = 2977
LOCATION1general DONE!	Length = 2977
LOCATION1green-nature DONE!	Length = 2977
LOCATION1live DONE!	Length = 2977
LOCATION1multicultural DONE!	Length = 2977
LOCATION1nightlife DONE!	Length = 2977
LOCATION1price DONE!	Length = 2977
LOCATION1quiet DONE!	Length = 2977
LOCATION1safety DONE!	Length = 2977
LOCATION1shopping DONE!	Length = 2977
LOCATION1touristy DONE!	Length = 2977
LOCATION1transit-location DONE!	Length = 2977
LOCATION2dining DONE!	Length = 775
LOCATION2general DONE!	Length = 775
LOCATION2green-nature DONE!	Length = 775
LOCATION2live DONE!	Length = 775
LOCATION2multicultural DONE!	Length = 775
LOCATION2nightlife DONE!	Length = 775
LOCATION2price DONE!	Length = 775
LOCATION2quiet DONE!	Length = 775
LOCATION2safety DONE!	Length = 775
LOCATION2shopping DONE!	Length = 775
LOCATION2touristy DONE!	Length = 775
LOCATION2transit-location DONE!	Length = 775


## Preparing Validation Sets

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    for aspect in aspects:
        df = pd.DataFrame({'id': [], 'text': [], 'sentiment': []})
        ii = 0
        for each_example in validation_set:
            id = str(int(each_example['id']))
            text = each_example['text'].strip()
            
            # If `location` is present in the text, only then iterate over the  
            # list of opinions to find suitable `location-aspect` datapoints.

            if location in text:
                aspect_found = False
                for opinion in each_example['opinions']:
                    # Checking if the current example contains a sentiment
                    # related to `location-aspect`

                    if opinion['target_entity'] == location and opinion['aspect'] == aspect:
                        df.loc[ii] = [id, text, opinion['sentiment']]
                        aspect_found = True
                        ii += 1
                        break
                
                # If no sentiment is found for `location-asppect` in current 
                # example, then add a datapoint with None.

                if not aspect_found:
                    df.loc[ii] = [id, text, 'None']
                    ii += 1

        df.to_csv('/content/drive/My Drive/SentiHood/Bert-single/ValidationData/' + str(location) + str(aspect) + '.csv', index=False)
        print(f"{location}{aspect} DONE!\tLength = {ii}")

LOCATION1dining DONE!	Length = 747
LOCATION1general DONE!	Length = 747
LOCATION1green-nature DONE!	Length = 747
LOCATION1live DONE!	Length = 747
LOCATION1multicultural DONE!	Length = 747
LOCATION1nightlife DONE!	Length = 747
LOCATION1price DONE!	Length = 747
LOCATION1quiet DONE!	Length = 747
LOCATION1safety DONE!	Length = 747
LOCATION1shopping DONE!	Length = 747
LOCATION1touristy DONE!	Length = 747
LOCATION1transit-location DONE!	Length = 747
LOCATION2dining DONE!	Length = 190
LOCATION2general DONE!	Length = 190
LOCATION2green-nature DONE!	Length = 190
LOCATION2live DONE!	Length = 190
LOCATION2multicultural DONE!	Length = 190
LOCATION2nightlife DONE!	Length = 190
LOCATION2price DONE!	Length = 190
LOCATION2quiet DONE!	Length = 190
LOCATION2safety DONE!	Length = 190
LOCATION2shopping DONE!	Length = 190
LOCATION2touristy DONE!	Length = 190
LOCATION2transit-location DONE!	Length = 190


## Preparing Testing Sets

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    for aspect in aspects:
        df = pd.DataFrame({'id': [], 'text': [], 'sentiment': []})
        ii = 0
        for each_example in testing_set:
            id = str(int(each_example['id']))
            text = each_example['text'].strip()
            
            # If `location` is present in the text, only then iterate over the  
            # list of opinions to find suitable `location-aspect` datapoints.

            if location in text:
                aspect_found = False
                for opinion in each_example['opinions']:
                    # Checking if the current example contains a sentiment
                    # related to `location-aspect`

                    if opinion['target_entity'] == location and opinion['aspect'] == aspect:
                        df.loc[ii] = [id, text, opinion['sentiment']]
                        aspect_found = True
                        ii += 1
                        break
                
                # If no sentiment is found for `location-asppect` in current 
                # example, then add a datapoint with None.

                if not aspect_found:
                    df.loc[ii] = [id, text, 'None']
                    ii += 1

        df.to_csv('/content/drive/My Drive/SentiHood/Bert-single/TestingData/' + str(location) + str(aspect) + '.csv', index=False)
        print(f"{location}{aspect} DONE!\tLength = {ii}")

# Preparing Datasets for BERT-pair

For all the BERT-pair models, only a single training, validation and testing set need to constructed.

## Datasets for QA-M

For QA-M, an auxiliary sentence in the following form will be constructed for each `location-aspect`.

*what do you think about `aspect` of `location`?*

### Training set for BERT-pair QA-M

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

for location in locations:
  for each_example in tqdm(training_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.
      current_opinion_aspects = {}
      
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f'what do you think of the {aspect} of {location}?' 
        
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
          

100%|██████████| 2977/2977 [02:23<00:00, 20.78it/s]
100%|██████████| 2977/2977 [00:48<00:00, 61.58it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-M/training_set.csv', index=False)

### Validation set for BERT-pair QA-M

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

for location in locations:
  for each_example in tqdm(validation_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.
      current_opinion_aspects = {}
      
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f'what do you think of the {aspect} of {location}?' 
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
          

100%|█████████████████████████████████████████| 747/747 [00:29<00:00, 24.95it/s]
100%|█████████████████████████████████████████| 747/747 [00:08<00:00, 92.77it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-M/validation_set.csv', index=False)

### Testing set for BERT-pair QA-M

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

for location in locations:
  for each_example in tqdm(testing_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.
    
    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f'what do you think of the {aspect} of {location}?' 
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
          

100%|███████████████████████████████████████| 1491/1491 [01:02<00:00, 23.71it/s]
100%|███████████████████████████████████████| 1491/1491 [00:17<00:00, 83.23it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-M/testing_set.csv', index=False)

## Datasets for NLI-M

For NLI-M, an auxiliary pseudo-sentence is the following form will be constructed for each `location-aspect`.

*`location` - `aspect`* where `location` will be reformed as `location - 1` and `location - 2`.

### Training set for BERT-pair NLI-M

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(training_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.
      current_opinion_aspects = {}

      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f"location - {str(count_location)} - {aspect}"
        text = text.replace(location, 'location - ' + str(count_location))
        
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 2977/2977 [02:53<00:00, 17.20it/s]
100%|███████████████████████████████████████| 2977/2977 [01:03<00:00, 46.70it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Datasets/training_set.csv', index=False)

### Validation set for BERT-pair NLI-M

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(validation_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.
      current_opinion_aspects = {}

      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f"location - {str(count_location)} - {aspect}"
        text = text.replace(location, 'location - ' + str(count_location))
        
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
    
  count_location += 1

100%|█████████████████████████████████████████| 747/747 [00:33<00:00, 22.58it/s]
100%|█████████████████████████████████████████| 747/747 [00:09<00:00, 81.97it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Datasets/validation_set.csv', index=False)

### Testing set for BERT-pair NLI-M 

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(testing_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.
      current_opinion_aspects = {}

      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      for aspect in aspects:
        auxiliary_sentence = f"location - {str(count_location)} - {aspect}"
        text = text.replace(location, 'location - ' + str(count_location))
        
        if aspect in current_opinion_aspects.keys():
          df.loc[ii] = [id, text, auxiliary_sentence, current_opinion_aspects[aspect]]
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, 'None']
        ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 1491/1491 [01:00<00:00, 24.82it/s]
100%|███████████████████████████████████████| 1491/1491 [00:17<00:00, 85.07it/s]


In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-M/Datasets/testing_set.csv', index=False)

## Datasets for QA-B

For QA-B, 3 auxiliary sentences in the following form will be constructed for each `location-aspect`.

*the polarity of the aspect `aspect` of `location` is `positive|negative|none`*.

### Training set for BERT-pair QA-B

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(training_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.
        
        auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 2977/2977 [13:51<00:00,  3.58it/s]
100%|███████████████████████████████████████| 2977/2977 [06:59<00:00,  7.09it/s]


In [None]:
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-B/Datasets/training_set.csv', index=False)

### Validation set for BERT-pair QA-B

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(validation_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.
    
    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.

        auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|█████████████████████████████████████████| 747/747 [01:53<00:00,  6.60it/s]
100%|█████████████████████████████████████████| 747/747 [00:36<00:00, 20.61it/s]


In [None]:
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-B/Datasets/validation_set.csv', index=False)

### Testing set for BERT-pair QA-B 

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(testing_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.
        
        auxiliary_sentence = f"the polarity of the aspect {aspect} of location - {str(count_location)} is None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 1491/1491 [04:55<00:00,  5.04it/s]
100%|███████████████████████████████████████| 1491/1491 [02:10<00:00, 11.46it/s]


In [None]:
df['sentiment'] = df['sentiment'].astype(int)

In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/QA-B/Datasets/testing_set.csv', index=False)

## Datasets for NLI-B

For NLI-B, 3 auxiliary pseudo-sentences in the following form will be constructed for each `location-aspect`.

*`location` - `aspect` - `polarity`*

### Training set for BERT-pair NLI-B

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(training_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"location - {str(count_location)} - {aspect} - {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.
        
        auxiliary_sentence = f"location - {str(count_location)} - {aspect} - None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 2977/2977 [14:10<00:00,  3.50it/s]
100%|███████████████████████████████████████| 2977/2977 [06:40<00:00,  7.44it/s]


In [None]:
df['sentiment'] =  df['sentiment'].astype(int)

In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-B/Datasets/training_set.csv', index=False)

### Validation set for BERT-pair NLI-B

In [None]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(validation_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"location - {str(count_location)} - {aspect} - {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.
        
        auxiliary_sentence = f"location - {str(count_location)} - {aspect} - None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|█████████████████████████████████████████| 747/747 [01:58<00:00,  6.30it/s]
100%|█████████████████████████████████████████| 747/747 [00:40<00:00, 18.61it/s]


In [None]:
df['sentiment'] =  df['sentiment'].astype(int)

In [None]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-B/Datasets/validation_set.csv', index=False)

### Testing set for BERT-pair NLI-B

In [4]:
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r') as fp:
    testing_set = json.load(fp)

In [5]:
locations = ['LOCATION1', 'LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

df = pd.DataFrame({'id': [], 'text': [], 'auxiliary_sentence': [], 'sentiment': []})
ii = 0

count_location = 1
for location in locations:
  for each_example in tqdm(testing_set, ncols=80):
    id = str(int(each_example['id']))
    text = each_example['text'].strip()

    # If `location` is present in the text, only then iterate over the  
    # list of opinions to find suitable `location-aspect` datapoints.

    if location in text:
      # current_opinion_aspects: Is a dictionary containing all aspects and  
      # their corresponding sentiment present in the current opinion.

      current_opinion_aspects = {}
      for opinion in each_example['opinions']:
        if opinion['target_entity'] == location:
          current_opinion_aspects[opinion['aspect']] = opinion['sentiment']
      
      text = text.replace(location, 'location - ' + str(count_location))
      
      for aspect in aspects:
        aspect_found = False
        
        for polarity in ['Positive', 'Negative']:
          auxiliary_sentence = f"location - {str(count_location)} - {aspect} - {polarity}."
          
          if aspect in current_opinion_aspects.keys() and current_opinion_aspects[aspect] == polarity:
            df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
            ii += 1
            aspect_found = True
          else:
            df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
            ii += 1
        
        # If no sentiment is found for the `location-aspect`, then add the 
        # auxiliary sentence with None to the dataset.
        
        auxiliary_sentence = f"location - {str(count_location)} - {aspect} - None."
        if not aspect_found:
          df.loc[ii] = [id, text, auxiliary_sentence, int(1)]
          ii += 1
        else:
          df.loc[ii] = [id, text, auxiliary_sentence, int(0)]
          ii += 1
    
  count_location += 1

100%|███████████████████████████████████████| 1491/1491 [04:16<00:00,  5.80it/s]
100%|███████████████████████████████████████| 1491/1491 [01:40<00:00, 14.83it/s]


In [6]:
df['sentiment'] =  df['sentiment'].astype(int)

In [8]:
# df.to_csv('/content/drive/My Drive/SentiHood/Bert-pair/NLI-B/Datasets/testing_set.csv', index=False)