## Setup: mount Google Drive and set the working directory

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# The working directory is changed with `cd` in the next cells rather than with
# `!cd` here: a `!` command runs in its own subshell, so a directory change made
# there would not carry over to the notebook kernel.

Mounted at /content/drive


In [None]:
cd drive

/content/drive


In [None]:
cd "MyDrive/Data Science Project/Data"

/content/drive/MyDrive/Data Science Project/Data


## Installations and imports

In [None]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from io import open
import string
import re
import random
import json
import torch
import copy
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import pickle
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import pandas as pd
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


In [None]:
import os


## Dataset, model and inference

In [None]:
# Identifier of the pretrained checkpoint; it is passed to the tokenizer's and
# the model's `from_pretrained` calls further down.
model_path = "siebert/sentiment-roberta-large-english"

In [None]:
class CityDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Every element of `data` is expected to be an indexable row whose first
  entry is the review text; any further entries of the row are not used here.
  """

  def __init__(self, tokenizer, data):
    """Keep references to the tokenizer callable and to the list of rows."""
    super().__init__()
    self.tokenizer = tokenizer
    self.data = data

  def __len__(self):
    """Return the number of rows held by the dataset."""
    return len(self.data)

  def __getitem__(self, index):
    """Tokenize row `index` and return its token ids, attention mask and position.

    The position is handed back so that the caller can verify the order in
    which items were consumed.
    """
    review = self.data[index][0]
    encoded = self.tokenizer(
      review,
      truncation=True,
      padding='max_length',
      return_tensors='pt',
    )
    # Element 0 is taken from each tensor — presumably to drop the size-1 batch
    # axis added by return_tensors='pt', leaving batching to the DataLoader.
    return {
      'ids': encoded["input_ids"][0],
      'mask': encoded["attention_mask"][0],
      'index': index,
    }

# Number of reviews per forward pass; used as the DataLoader batch size below.
BATCH_SIZE = 128
# Tokenizer that belongs to the checkpoint named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Load the sequence-classification checkpoint, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model = model.to(device)
# Softmax along dim 1; applied to the model's logits in the inference loop to
# turn them into per-class probabilities.
sm = torch.nn.Softmax(dim=1)

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
# Labels every review of each listed city with the model's predicted sentiment
# and a score, then writes one CSV per city with the columns
# Review / Sentiment / Score.
# (`INPUT_DIR` replaces the former `dir` variable, which shadowed the builtin.)
INPUT_DIR = "Cities_Processed/"
OUTPUT_DIR = "Cities_Labelled/"
# Added only two cities for this script as we were running multiple instances with different cities to save time
ct = ["london.csv", "san-francisco.csv"]

# Create the output folder up front: otherwise a missing folder is only
# discovered by to_csv after a city has been fully processed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for city in ct:
  print(f"City: {city}")
  fname = os.path.join(INPUT_DIR, city)
  if not os.path.isfile(fname):
    # Report the skip explicitly instead of silently producing no output file.
    print(f"Skipping {city}: {fname} not found")
    continue

  df = pd.read_csv(fname)
  rows = df.values.tolist()
  unlb_data = CityDataset(tokenizer, data=rows)
  # shuffle=False keeps predictions in the same order as `rows`.
  dataloader = DataLoader(dataset=unlb_data, batch_size=BATCH_SIZE, shuffle=False)

  scores = []
  sentiment = []
  index = []
  # Inference only: no gradients are needed for any batch.
  with torch.no_grad():
    for inputs in tqdm(dataloader):
      out = model(input_ids=inputs['ids'].to(device), attention_mask=inputs['mask'].to(device))
      preds = out.logits.argmax(-1).tolist()
      sentiment.extend(model.config.id2label[p] for p in preds)
      # Score = softmax probability of class index 1 (presumably the positive
      # class for this checkpoint — confirm against model.config.id2label).
      scores.extend(sm(out.logits)[:, 1].tolist())
      index.extend(inputs['index'].tolist())

  # Only the first column of each input row (the review text) is carried over.
  labelled = [[row[0], label, score] for row, label, score in zip(rows, sentiment, scores)]

  # Sanity check: items must have been consumed in their original order,
  # otherwise the predictions above are paired with the wrong reviews.
  if index != list(range(len(index))):
    print("Data not sequential")
  print(f"Length = {len(index)}")

  labelled_df = pd.DataFrame(labelled, columns=["Review", "Sentiment", "Score"])
  labelled_df.to_csv(os.path.join(OUTPUT_DIR, city), index=False)

City: london.csv


  0%|          | 0/617 [00:00<?, ?it/s]

Length = 78946
City: san-francisco.csv


  0%|          | 0/237 [00:00<?, ?it/s]

Length = 30282
