# Sentiment Analysis using roBERTa model
## Code was taken from the Hugginface public library
## Source: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 12.0 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 62.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 92.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: P

In [None]:
!git lfs install
!git clone https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
# if you want to clone without large files – just their pointers
# prepend your git clone with the following env var:
GIT_LFS_SKIP_SMUDGE=1

git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log
Cloning into 'twitter-roberta-base-sentiment'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 31 (delta 11), reused 0 (delta 0)[K
Unpacking objects: 100% (31/31), done.


In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import os
from google.colab import drive

In [None]:
# mount my Google Drive directory and access the training data located there
gdrive_dir = '/content/drive/'
data_dir = os.path.join(gdrive_dir, "'My Drive'","'Colab Notebooks'")

drive.mount(gdrive_dir, force_remount=True)

Mounted at /content/drive/


In [None]:
# cd '/content/drive/MyDrive/Colab Notebooks/survey_data'

In [None]:
!rm -r ./cardiffnlp

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/survey_data/Edited_Survey.csv')
df.head

<bound method NDFrame.head of      Python_machine  ...                                            Comment
0     highly likely  ...                                                NaN
1          unlikely  ...                                                NaN
2     highly likely  ...                                                NaN
3            likely  ...                                                NaN
4   highly unlikely  ...                                                NaN
5     highly likely  ...                                                NaN
6            likely  ...                                                NaN
7     highly likely  ...                                                NaN
8            likely  ...  To my mind it is quite impressive that machine...
9   highly unlikely  ...                                                NaN
10    highly likely  ...                                                 хз
11         unlikely  ...  Lots of unnecessary punctuation,

In [None]:
col_list = ['Comment']
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/survey_data/Edited_Survey.csv', usecols=col_list)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Comment
8,To my mind it is quite impressive that machine...
10,хз
11,"Lots of unnecessary punctuation, strange phras..."
18,I think you did a great job. Congratulations!\...
20,There appear to be learning artifacts in many ...


In [None]:
print(df.loc[:, "Comment"])

8     To my mind it is quite impressive that machine...
10                                                   хз
11    Lots of unnecessary punctuation, strange phras...
18    I think you did a great job. Congratulations!\...
20    There appear to be learning artifacts in many ...
23    Very interesting project! \r\n\r\nThere were s...
Name: Comment, dtype: object


In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
 
 
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)




Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# text = df["Comments"]
# text = preprocess(text)
text = (df.loc[:, "Comment"]).to_string()


encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) positive 0.963
2) neutral 0.0304
3) negative 0.0067


In [None]:
print(text)

8     To my mind it is quite impressive that machine...
10                                                   хз
11    Lots of unnecessary punctuation, strange phras...
18    I think you did a great job. Congratulations!\...
20    There appear to be learning artifacts in many ...
23    Very interesting project! \r\n\r\nThere were s...
