<a href="https://colab.research.google.com/github/SerenaYKim/Carmen/blob/master/2022_11_18_SolarSentimentRoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)
  

Fri Nov 18 20:23:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    45W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
import tensorflow as tf
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: A100-SXM4-40GB


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from functools import reduce
from datetime import datetime
from sklearn.model_selection import train_test_split

## Data

In [76]:
df = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataV2Train/2022-11-18-tweets-training-9k-v09.csv")
    # cleaned from 2022-09-09-UpdatingTrainingData.ipynb

df.set_index('id', inplace=True)
df = df.sample(frac = 1)
df

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2.454415e-01,Solar Energy Data for 2020-03-09 13:15:00 Ener...,Neutral
8.118685e+17,@WajSKhan power at rate where no one can buy i...,Negative
9.909449e-01,Affordable alternative to grid: New report by ...,Positive
3.683650e+05,Electricity generation is done in many ways. H...,Negative
3.967850e+05,Expandable Solar Generators - 100ah to 300ah d...,Positive
...,...,...
4.102570e+05,Energy in space isn't scarce. Solar panels are...,Positive
2.854660e+05,. Russia just to meet the power demands if Ame...,Negative
5.766557e-01,It will not happen with our current governor o...,Positive
9.130000e+17,RT: Our #ClimateAction is to encourage citizen...,Positive


In [77]:
# If you want to assign numerical values automatically 

df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']
possible_labels = df.category.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
    
df['label'] = df.category.replace(label_dict)

df.dtypes

#if you want to create labels manually https://stackoverflow.com/questions/19226488/change-one-value-based-on-another-value-in-pandas

# df.loc[df.category == "Negative", 'label'] = 0
# df.loc[df.category == "Neutral", 'label'] = 1
# df.loc[df.category == "Positive", 'label'] = 2

text        object
category    object
label        int64
dtype: object

In [78]:
df.category.value_counts()

Positive    5672
Neutral     2279
Negative    2157
Name: category, dtype: int64

In [79]:
df.label.value_counts()

2    5672
0    2279
1    2157
Name: label, dtype: int64

## Training/Validation Split

In [81]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.1, 
                                                  random_state= 768, #864 #768
                                                  stratify=df.label.values)

In [82]:
df['data_type'] = ['not_set'] * df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
Negative,1,train,1928
Negative,1,val,229
Neutral,0,train,2020
Neutral,0,val,259
Positive,2,train,5030
Positive,2,val,642


## Loading Tokenizer and Encoding our Data

In [83]:
!pip install transformers==3.0.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [84]:
from torch.utils.data import TensorDataset
from transformers import RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification, AdamW

In [85]:
# load tokenizer: 'roberta-base', 'bert-base-uncased', 

print('Loading RoBERTa tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

Loading RoBERTa tokenizer...


In [86]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)



In [87]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

## Setting up BERT Pretrained Model

In [88]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

In [89]:
# BERT model for sequence classification 

model = RobertaForSequenceClassification.from_pretrained("roberta-base",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Creating Data Loaders

In [90]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [91]:
batch_size = 64 # parameter1 

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

## Setting Up Optimiser and Scheduler


In [92]:
from transformers import AdamW, get_linear_schedule_with_warmup #SGD 

In [93]:
lr = 7e-6
eps = 1e-6
optimizer = AdamW(model.parameters(),
                  lr = lr,  
                  eps = eps)

In [94]:
epochs = 4 # parameter 4 

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

## Defining our Performance Metrics

Accuracy metric approach originally used in accuracy function in [this tutorial](https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification).

[Argmax function](https://deeplizard.com/learn/video/K3lX3Cltt4c)

In [95]:
from sklearn.metrics import f1_score

In [96]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [97]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [98]:
def total_accuracy(preds, labels):
  label_dict_inverse = {v: k for k, v in label_dict.items()}
    
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  numerator = 0
  denominator = 0

  for label in np.unique(labels_flat):
      y_preds = preds_flat[labels_flat==label]
      y_true = labels_flat[labels_flat==label]
      numerator += len(y_preds[y_preds==label])
      denominator += len(y_true)
  accuracy = numerator / denominator
  return accuracy

## Creating our Training Loop

Approach adapted from an older version of HuggingFace's `run_glue.py` script. Accessible [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128).

In [99]:
import random

seed_val = 4
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [100]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [101]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
f1_list = []
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_acc = accuracy_per_class(predictions, true_vals)
    total_acc = total_accuracy(predictions, true_vals)
    
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {total_acc}')
    f1_list.append(val_f1)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/141 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.837253082517191
Class: Neutral
Accuracy: 102/259

Class: Negative
Accuracy: 169/229

Class: Positive
Accuracy: 582/642

Validation loss: 0.6327653411361907
F1 Score (Weighted): 0.7380112063624197
Accuracy: 0.7548672566371681


Epoch 2:   0%|          | 0/141 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.5873114428621657
Class: Neutral
Accuracy: 90/259

Class: Negative
Accuracy: 168/229

Class: Positive
Accuracy: 589/642

Validation loss: 0.6182804356018702
F1 Score (Weighted): 0.7276690580193877
Accuracy: 0.7495575221238938


Epoch 3:   0%|          | 0/141 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5362006928903836
Class: Neutral
Accuracy: 107/259

Class: Negative
Accuracy: 179/229

Class: Positive
Accuracy: 575/642

Validation loss: 0.6034265955289205
F1 Score (Weighted): 0.7473924308538933
Accuracy: 0.7619469026548673


Epoch 4:   0%|          | 0/141 [00:00<?, ?it/s]

In [55]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [None]:
#If you want to use BERT instead
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                      num_labels=len(label_dict),
#                                                      output_attentions=False,
#                                                      output_hidden_states=False)
#model = RobertaForSequenceClassification.from_pretrained("roberta-base",
#                                                      num_labels=len(label_dict),
#                                                      output_attentions=False,
#                                                      output_hidden_states=False)

# model.to(device)

In [36]:
# Saving the best model; 
model.load_state_dict(torch.load('/content/finetuned_BERT_epoch_2.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [37]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [38]:
accuracy_per_class(predictions, true_vals)

Class: Positive
Accuracy: 563/648

Class: Negative
Accuracy: 185/226

Class: Neutral
Accuracy: 149/254



In [40]:
185/226

0.8185840707964602

In [39]:
149/254

0.5866141732283464

In [41]:
563/648

0.8688271604938271

In [None]:
def save_stats(total_accuracy, f1_list, batch_size, lr, eps, epochs, optimizer):
  filename = '/content/drive/MyDrive/G06-SolarSentiment/dataV2Train/10_Results/SolarSentimentAccuracy-2022-08-10-02.csv'
  with open(filename, 'a') as outfile:
    # outfile.write('Timestamp, Accuracy, F1, Batch Size, Lr, Eps, Epochs, Optimizer' + '\n') 
    outfile.write(str(datetime.now()) + ',' + str(total_accuracy) + ',' 
                  + '"' + str(f1_list) + '"' + ',' + str(batch_size) + ',' + str(lr) + ',' 
                  + str(eps) + ',' + str(epochs) + ',' + str(optimizer) + '\n')

In [None]:
accuracy = total_accuracy(predictions, true_vals)
print(accuracy)
save_stats(accuracy, f1_list, batch_size, lr, eps, epochs, 'AdamW')

0.7760532150776053


## Prediction Test

In [None]:
df_pred = pd.read_csv('/content/drive/MyDrive/G06-SolarSentiment/dataV2Raw/Daily/2020/2020-01/solarsent_2020-01-03.csv')
df_prediction.set_index('id', inplace=True)

In [None]:
df_prediction = pd.read_csv('/content/drive/MyDrive/G06-SolarSentiment/dataTraining/test-prediction.csv', names=['id', 'text', 'category'])
df_prediction.set_index('id', inplace=True)

FileNotFoundError: ignored

In [None]:
possible_labels = df_prediction.category.unique()

NameError: ignored

In [None]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [None]:
df_prediction['label'] = df_prediction.category.replace(label_dict)

In [None]:
df_prediction['data_type'] = "pred"

In [None]:
df_prediction

In [None]:
encoded_data_pred = tokenizer.batch_encode_plus(
    df_prediction[df_prediction.data_type=='pred'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_pred = encoded_data_pred['input_ids']
attention_masks_pred = encoded_data_pred['attention_mask']
labels_pred = torch.tensor(df_prediction[df_prediction.data_type=='pred'].label.values)

In [None]:
df_prediction['id'] = df_prediction.index.astype(int)
phraseids = df_prediction.id.values
phraseids = torch.tensor(phraseids)

In [None]:
dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred, labels_pred)

In [None]:
dataloader_pred = DataLoader(dataset_pred, 
                                   sampler=SequentialSampler(dataset_pred), 
                                   batch_size=batch_size)

In [None]:
def evaluate(dataloader_pred):

    model.eval()
    
    loss_pred_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_pred:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_pred_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_pred_total/len(dataloader_pred) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)   

    return loss_val_avg, predictions, true_vals

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

In [None]:
_, predictions, true_vals = evaluate(dataloader_pred)

In [None]:
def predicted_list_generate(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    df_label_created = pd.DataFrame() 

    predicted = preds_flat.tolist()

    return predicted

In [None]:
prediction = predicted_list_generate(predictions, true_vals)
df_prediction['prediction'] = prediction

In [None]:
df_prediction['predicted_label'] = np.where(df_prediction['label'] == df_prediction['prediction'], 'True', 'False')

## Prediction

In [None]:
import pandas as pd 
df_prediction = pd.DataFrame() 
df_tweet_master = pd.read_csv('/content/drive/MyDrive/G06-SolarSentiment/dataV2Raw/daily/2017/2017-04/solarsent_2017-04-01.csv')
#import pandas as pd 
#df_prediction = pd.DataFrame() 
#df_tweet_master = pd.read_csv('/content/drive/MyDrive/G06-SolarSentiment/dataTwitterCsv2020/state-master-2020.csv')

In [None]:
df_subset = df_tweet_master[["created_at", "user_id", "text"]]
df_subset['id'] = df_subset.index
df_prediction = df_subset
df_prediction["text"] = df_prediction["text"].astype(str) 
df_prediction['data_type'] = "pred"
df_prediction['category'] = "None" # Not the actual sentiment. It is a placeholder. 
#df_tweet_master = pd.read_csv('/content/drive/MyDrive/G06-SolarSentiment/dataTwitterCsv2020/state-master-2020.csv') #state information identified tweets full
#df_subset = df_tweet_master[["text", "quoted_text", "std_location", "location", "created_at"]]
#df_subset['id'] = df_subset.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [None]:
#df_prediction['text'] = df_subset['text'] + " " + df_subset['quoted_text']
#df_prediction['id'] = df_subset['id']
#df_prediction['location'] = df_subset['std_location']
#df_prediction['location_raw'] = df_subset["location"]
#df_prediction['day'] = df_subset["created_at"]
#df_prediction["text"] = df_prediction["text"].astype(str) 
#df_prediction['data_type'] = "pred"
#df_prediction['category'] = "Positive" # Not the actual sentiment. It is a placeholder. 

In [None]:
#df_prediction["text"] = df_prediction["text"].astype(str) 
#df_prediction = df_prediction.dropna(subset=['text'])
#len(df_prediction.index)

In [None]:
df_prediction["text"] = df_prediction["text"].str.replace("RT", " ", case = True) 
df_prediction["text"] = df_prediction["text"].str.replace("https", " ", case = False) 
df_prediction["text"] = df_prediction["text"].str.replace("@[^\s]+", " ", case = False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
possible_labels = df_prediction.category.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df_prediction['label'] = df_prediction.category.replace(label_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [None]:
df_prediction

Unnamed: 0,created_at,user_id,text,id,data_type,category,label
0,2017-04-01T00:00:00.000Z,587257597,Despite the cloudiest March since @BOM_au reco...,0,pred,,0
1,2017-04-01T00:00:09.000Z,3022234980,"B2 30,000 mAh Solar+10000mAh Power Bank\n\nBuy...",1,pred,,0
2,2017-04-01T00:00:32.000Z,619572621,U.S. coal saw a 14% decline in mining jobs fro...,2,pred,,0
3,2017-04-01T00:00:33.000Z,595756374,RT: Scientists have found a way to collect sol...,3,pred,,0
4,2017-04-01T00:01:50.000Z,836329222729777152,America doesn't have enough füçking fossil fue...,4,pred,,0
...,...,...,...,...,...,...,...
4609,2017-04-01T23:54:38.000Z,59415917,RT: Viewed the Hopsol-Cenored solar Power Plan...,4609,pred,,0
4610,2017-04-01T23:55:24.000Z,148828676,@FoxNews Far too little. Solar- and wind-prod...,4610,pred,,0
4611,2017-04-01T23:56:35.000Z,185614627,RT: The solar industry was responsible for cre...,4611,pred,,0
4612,2017-04-01T23:57:29.000Z,387862060,RT: @iDroneHD @GeorgeTakei 100 sq mi of solar ...,4612,pred,,0


In [None]:
encoded_data_pred = tokenizer.batch_encode_plus(
    df_prediction[df_prediction.data_type=='pred'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_pred = encoded_data_pred['input_ids']
attention_masks_pred = encoded_data_pred['attention_mask']
labels_pred = torch.tensor(df_prediction[df_prediction.data_type=='pred'].label.values)



In [None]:
dataset_pred = TensorDataset(input_ids_pred, attention_masks_pred, labels_pred)

In [None]:
dataloader_pred = DataLoader(dataset_pred, 
                                   sampler=SequentialSampler(dataset_pred), 
                                   batch_size=batch_size)

In [None]:
def evaluate(dataloader_pred):

    model.eval()
    
    loss_pred_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_pred:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_pred_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_pred_total/len(dataloader_pred) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)   

    return loss_val_avg, predictions, true_vals

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [None]:
_, predictions, true_vals = evaluate(dataloader_pred)

In [None]:
def predicted_list_generate(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    df_label_created = pd.DataFrame() 

    predicted = preds_flat.tolist()

    return predicted

In [None]:
prediction = predicted_list_generate(predictions, true_vals)
df_prediction['prediction'] = prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
df_prediction.prediction.describe()

count    4614.000000
mean        1.651929
std         0.720244
min         0.000000
25%         2.000000
50%         2.000000
75%         2.000000
max         2.000000
Name: prediction, dtype: float64

In [None]:
df_prediction.prediction.value_counts()

2    3681
0     673
1     260
Name: prediction, dtype: int64

In [None]:
df_prediction.to_csv("test.csv")

####### currently the numbers are matched as follows: 

Positive    2967
Negative    2438
Neutral     2046
Name: category, dtype: int64



0    2967
1    2438
2    2046
Name: label, dtype: int64



In [None]:
#currently, neutral = 2; negative = 0; positive = 1
df_prediction.loc[df_prediction.prediction == 0, 'sentiment'] = -10
df_prediction.loc[df_prediction.prediction == 2, 'sentiment'] = 0
df_prediction.loc[df_prediction.prediction == 1, 'sentiment'] = 10


In [None]:
#currently, neutral = 2; negative = 1; positive = 0

df_prediction.loc[df_prediction.prediction == 0, 'sentiment'] = 10
df_prediction.loc[df_prediction.prediction == 2, 'sentiment'] = 0
df_prediction.loc[df_prediction.prediction == 1, 'sentiment'] = -10

In [None]:
df_prediction.loc[df_prediction.sentiment == 0, 'sent'] = "positive"
df_prediction.loc[df_prediction.sentiment == 10, 'sent'] = "negative"
df_prediction.loc[df_prediction.sentiment == -10, 'sent'] = "neutral"

In [None]:
df_prediction.loc[df_prediction.sent == "positive", 'sentiment'] = 10
df_prediction.loc[df_prediction.sent == "negative", 'sentiment'] = -10
df_prediction.loc[df_prediction.sent == "neutral", 'sentiment'] = 0

In [None]:
df_prediction

In [None]:
df_prediction.prediction.describe() # previous! 

In [None]:
df_prediction = df_prediction.drop(columns=['data_type', 'id', 'category', 'label', 'prediction'])

In [None]:
# tweeted dates standardized

df_prediction['date_tweeted'] = df_prediction['day'].str[4:10] # Extracting Month and Day Only 
df_prediction['date_tweeted'] = df_prediction['date_tweeted'] + ", 2020" # Adding the year again
df_prediction = df_prediction.drop(columns=["day"])

In [None]:
df_prediction.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/state_tweet_predicted_new2.csv")

### df_prediction

In [None]:
df_prediction.sentiment.value_counts()

### subset of prediction

In [None]:
#import pandas as pd 
#df_predicteion = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/state_tweet_predicted.csv")

In [None]:
# cleaning
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('scotland', 'Scotland', 'SCOTLAND'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('canada', 'Canada', 'CANADA'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('New Zealand', 'new zealand', 'NEW ZEALAND'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('New South Wales', 'new south wales', 'NEW SOUTH WALES'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('Nenagh', 'nenagh', 'NENAGH'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('Nederland', 'nederland', 'NEDERLAND'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('Nederlands', 'Nederlands', 'NEDERLANDS'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('nepa', 'Nepa', 'NEPA'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('New England', 'new england', 'NEW ENGLAND'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('New Orleans', 'new orleans', 'NEW ORLEANS'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('New Brunswick', 'NEW BRUNSWICK', 'new brunswick'))]
#df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('Nepal', 'nepal', 'NEPAL'))]

In [None]:

df_prediction.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/state_tweet_predicted.csv")
df_prediction

In [None]:
df_prediction = df_prediction[~df_prediction['location_raw'].str.endswith(('Nepal', 'nepal', 'NEPAL'))]

In [None]:
df = pd.DataFrame()
df = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/state_tweet_predicted_new2.csv")

In [None]:
df_prediction.sentiment.describe()
df_merged_tweet = pd.DataFrame()

In [None]:
# sentiment score by state
summarized = df_prediction.groupby("location")["sentiment"].mean()
summarized = summarized.to_frame()
summarized = summarized.reset_index()

In [None]:
# sentiment score standard error  by state
standarder = df_prediction.groupby("location")["sentiment"].sem()
standarder = standarder.to_frame()
standarder = standarder.reset_index()
standarder = standarder.rename(columns={"sentiment": "se"})

In [None]:
# sentiment count by state 
count = df_prediction.groupby("location")["sentiment"].count()
count = count.to_frame()
count = count.reset_index()
count = count.rename(columns={"sentiment": "tweetcount"})

In [None]:
# sentiment score by dates
avg_sent_date = df_prediction.groupby("date_tweeted")["sentiment"].mean()
avg_sent_date  = avg_sent_date.to_frame()
avg_sent_date  = avg_sent_date.reset_index()
avg_sent_date  = avg_sent_date.rename(columns={"sentiment": "average_sentiment_by_dates"})

In [None]:
from functools import reduce
frames = [summarized, standarder, count]
df_merged_tweet = reduce(lambda  left,right: pd.merge(left,right,on=['location'],
                                            how='inner'), frames).fillna('0')

In [None]:
# creating 95% confidence interval 

df_merged_tweet["upper"] = df_merged_tweet["sentiment"] + 1.95 * df_merged_tweet["se"]
df_merged_tweet["lower"] = df_merged_tweet["sentiment"] - 1.95 * df_merged_tweet["se"]
df_merged_tweet

In [None]:
df_merged_tweet = df_merged_tweet.rename(columns={"location": "state_abbr"})
df_merged_tweet.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/state_sentiment_score9.csv")

In [None]:
avg_sent_date.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/avg_sent_date.csv")

In [None]:
df_merged_tweet.tweetcount.describe()

### sanity check based on scatter plot (pop vs tweetcount)


In [None]:
south_carolina_test = df_prediction[df_prediction.location == "SC"]
south_carolina_test.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/sc_test.csv")

In [None]:
de_test = df_prediction[df_prediction.location == "DE"]
de_test.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/de_test.csv")

In [None]:
california_test = df_prediction[df_prediction.location == "CA"]
california_test.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/ca_test.csv")

In [None]:
ne_test = df_prediction[df_prediction.location == "NE"]
ne_test.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/ne_test.csv")

In [None]:
ak_test = df_prediction[df_prediction.location == "AK"]
ak_test.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataOutState/ak_test.csv")

### Attaching Census No.

In [None]:
city_df = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataCity/city-list-abbr-tomatch.csv")

In [None]:
city_df['std_location'] = city_df['value'] +", " + city_df['state_abbr'] 

In [None]:
city_df = city_df.drop_duplicates(subset=['std_location'])

In [None]:
frames = [df_merged_tweet, city_df]
df_merged_tweet = reduce(lambda  left,right: pd.merge(left,right,on=['std_location'],
                                            how='inner'), frames).fillna('0')

In [None]:
df_tweet_std = df_merged_tweet[['std_location', 'sentiment', 'tweetcount']]

In [None]:
df_tweet_std 

In [None]:
city_df['location'] = city_df['value'] + " " + city_df['padding'] +", " + city_df['state_name'] 
city_df['city'] = city_df['value']

city_df

In [None]:
df_census_raw = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/census_data/dataCensus/income_2019_place.csv")
df_census_raw = df_census_raw.rename(columns={"Geographic Area Name": "location"})

In [None]:
frames = [city_df, df_census_raw]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['location'],
                                            how='inner'), frames).fillna('0')

In [None]:
df_census_std = df_merged[["id", "location", "city", "state_name", "state_fips", "state_abbr", "rank", "usregion", "sub_region", "std_location"]]

In [None]:
frames = [df_census_std, df_tweet_std]
df_solar_sentiment_city_master = reduce(lambda  left,right: pd.merge(left,right,on=['std_location'],
                                            how='inner'), frames).fillna('0')

In [None]:
df_solar_sentiment_city_master.to_csv("/content/drive/MyDrive/G06-SolarSentiment/dataCity/solar_sentiment_city_master.csv")

In [None]:
df_solar_sentiment_city_master = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataCity/solar_sentiment_city_master.csv")

In [None]:
df_age = pd.read_excel("/content/drive/MyDrive/G06-SolarSentiment/dataCensus/census_age.xlsx")

In [None]:
df_race = pd.read_excel("/content/drive/MyDrive/G06-SolarSentiment/dataCensus/census_poprace.xlsx")

In [None]:
df_edu = pd.read_excel("/content/drive/MyDrive/G06-SolarSentiment/dataCensus/census_edu.xlsx")

In [None]:
df_income = pd.read_excel("/content/drive/MyDrive/G06-SolarSentiment/dataCensus/census_income.xlsx")

In [None]:
frames = [df_solar_sentiment_city_master, df_age, df_race, df_edu, df_income]
df_solarcitymaster = reduce(lambda  left,right: pd.merge(left,right,on=['id'],
                                            how='inner'), frames).fillna('0')

In [None]:
df_solarcitymaster["nonwhite"] = 1 - (df_solarcitymaster["White"]/df_solarcitymaster["Population"])

In [None]:
import numpy as np
df_solarcitymaster["income"] = np.log10(df_solarcitymaster["MedianIncome_S1903_C03_015E"])

In [None]:
df_solarcitymaster["std_sentiment"] = (df_solarcitymaster["sentiment"] - df_solarcitymaster["sentiment"].mean()) / df_solarcitymaster["sentiment"].std()

In [None]:
df_solarcitymaster["ln_pop"] = np.log10(df_solarcitymaster["Population"])

In [None]:
df_solarcitymaster["education"] = df_solarcitymaster["Bachelor_S1501_C02_012E"] + df_solarcitymaster["Graduate_S1501_C02_013E"]

In [None]:
df_solarcitymaster

### stats

In [None]:
import statsmodels.api as sm
from patsy import dmatrices

In [None]:
y, X = dmatrices('std_sentiment ~ nonwhite', data=df_solarcitymaster, return_type='dataframe')

In [None]:
y, X = dmatrices('std_sentiment ~ ln_pop', data=df_solarcitymaster, return_type='dataframe')

mod = sm.OLS(y, X)

res = mod.fit()

print(res.summary())

In [None]:
y, X = dmatrices('std_sentiment ~ education', data=df_solarcitymaster, return_type='dataframe')

mod = sm.OLS(y, X)

res = mod.fit()

print(res.summary())

In [None]:
y, X = dmatrices('std_sentiment ~ income', data=df_solarcitymaster, return_type='dataframe')

mod = sm.OLS(y, X)

res = mod.fit()

print(res.summary())

### reference

In [None]:
crst = pd.read_csv("/content/drive/MyDrive/G06-SolarSentiment/dataTwitterCsv2020/icma_count_merged.csv")

In [None]:
crst = crst.rename(columns={"std_location": "location"})

In [None]:
from functools import reduce

In [None]:
# Merging 
frames = [crst, summarized]
icma_sent_merged = reduce(lambda  left,right: pd.merge(left,right,on=['location'],
                                            how='inner'), frames)

In [None]:
icma_sent_merged.to_csv("test.csv")

In [None]:
def evaluate(dataloader_pred):

    model.eval()
    
    loss_pred_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_pred:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  #'labels':         batch[2]
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_pred_total += loss.item()

        logits = logits.detach().cpu().numpy()
        #label_ids = inputs['labels'].cpu().numpy()
        
        predictions.append(logits)
        #true_vals.append(label_ids)
    
    loss_val_avg = loss_pred_total/len(dataloader_pred) 
    
    predictions = np.concatenate(predictions, axis=0)
    #true_vals = np.concatenate(true_vals, axis=0)   

    return predictions

In [None]:
def predicted_list_generate(preds):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    #labels_flat = labels.flatten()
    df_label_created = pd.DataFrame() 

    predicted = preds_flat.tolist()

    return predicted

In [None]:
predictions = evaluate(dataloader_pred)