# Setting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece
!pip install tweet-preprocessor

Collecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uni

In [None]:
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import XLNetTokenizer
from nltk import tokenize
import numpy as np
import preprocessor as p
import nltk
nltk.download('punkt')
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLNetForSequenceClassification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import torch.nn.functional as F

In [None]:
# Inference settings shared by the cells below.
XLNET_MODEL = 'xlnet-base-cased'            # checkpoint name passed to from_pretrained()
FILE_PATH = '/content/drive/MyDrive/1027/'  # Drive folder prefix for the fine-tuned weights
MAX_LENGTH = 120                            # default number of token ids per model input
BATCH_SIZE = 32                             # default DataLoader batch size used by predict()

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Model

In [None]:
# Build the tokenizer and a 2-label classifier from the pretrained XLNet checkpoint,
# then overwrite the classifier's weights with the fine-tuned state dict on Drive.
# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint; presumably
# this mirrors the fine-tuning setup - confirm before changing it.
tokenizer = XLNetTokenizer.from_pretrained(XLNET_MODEL, do_lower_case=True)
model = XLNetForSequenceClassification.from_pretrained(XLNET_MODEL, num_labels=2)
# map_location lets a checkpoint that was saved from a GPU load on a CPU-only runtime.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load(FILE_PATH + 'model_v2_955.pt', map_location=device))

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

<All keys matched successfully>

In [None]:
# Move the classifier to the selected device and make evaluation mode explicit
# (the model contains Dropout layers). The trailing semicolon keeps the very
# long module repr out of the cell output.
model.to(device)
model.eval();

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

# Predict

In [None]:
def process(tweets,add_special_token=True):
  """Clean tweets and optionally append sentence/classification marker strings.

  Each tweet is passed through tweet-preprocessor's `p.clean`, configured here
  with the URL, EMOJI and MENTION options, and split into sentences with NLTK.

  Args:
    tweets: iterable of tweet strings.
    add_special_token: when true, the sentences are joined with ' [SEP]' and
      ' [SEP] [CLS]' is appended; when false, the cleaned tweet is returned as is.

  Returns:
    A list with one processed string per input tweet, in input order.

  NOTE(review): '[SEP]' / '[CLS]' are plain text to the XLNet tokenizer (its own
  special tokens are '<sep>' / '<cls>'). Presumably the fine-tuned model was
  trained on this exact format - confirm before changing the markers.
  """
  p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION) # tweet-preprocessor

  def _prepare(raw_tweet):
    cleaned = p.clean(raw_tweet)
    # Sentence splitting always runs, even when the markers are not requested.
    pieces = tokenize.sent_tokenize(cleaned)
    if not add_special_token:
      return cleaned
    return ' [SEP]'.join(pieces) + ' [SEP] [CLS]'

  return [_prepare(tweet) for tweet in tweets]


def get_model_data(tokenizer, tweets, labels , maxlen, batch_size, add_special_token = True):
    """Turn tweets and labels into a sequentially-sampled DataLoader of tensors.

    Args:
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
        tweets: iterable of tweet strings.
        labels: one label per tweet; converted with `torch.tensor`.
        maxlen: every id sequence is right-padded / right-truncated to this length.
        batch_size: batch size of the returned DataLoader.
        add_special_token: when true, tweets first go through `process`.

    Returns:
        (data, sampler, dataloader): the TensorDataset of (ids, mask, label),
        its SequentialSampler, and the DataLoader built from both.
    """
    texts = process(tweets) if add_special_token else tweets

    # text -> word pieces -> vocabulary ids, one id list per tweet
    id_lists = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in texts]

    # Fixed-width id matrix: zeros are appended, overflow is cut from the end.
    padded_ids = pad_sequences(id_lists, maxlen= maxlen, dtype="long", truncating="post", padding="post")

    # Mask is 1.0 wherever the id is greater than 0 and 0.0 elsewhere.
    # NOTE(review): a real token whose vocabulary id is 0 is masked out too.
    mask_rows = [[float(token_id > 0) for token_id in row] for row in padded_ids]

    data = TensorDataset(torch.tensor(padded_ids), torch.tensor(mask_rows), torch.tensor(labels))
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return data, sampler, dataloader

def predict(model, tokenizer, tweets,labels,maxlen = MAX_LENGTH, batch_size = BATCH_SIZE):
  """Classify every tweet and return per-tweet predictions and class probabilities.

  Args:
    model: classifier called as `model(input_ids, attention_mask)`; element 0 of
      its output is used as the logits.
    tokenizer: tokenizer forwarded to `get_model_data`.
    tweets: iterable of tweet strings.
    labels: one label per tweet; only needed to build the dataset, it does not
      influence the returned values.
    maxlen: sequence length forwarded to `get_model_data`.
    batch_size: batch size forwarded to `get_model_data`.

  Returns:
    (predictions, prob_list): `predictions` holds one 0-dim tensor (index of the
    largest logit) per tweet; `prob_list` holds one list of softmax
    probabilities per tweet. Both are in input order and empty for empty input.
  """
  _, _, prediction_dataloader = get_model_data(tokenizer, tweets, labels , maxlen, batch_size, add_special_token = True)

  predictions, prob_list = [], []

  for batch in prediction_dataloader:
    # Move the whole batch to the model's device; the labels are not needed here.
    b_input_ids, b_input_mask, _ = tuple(t.to(device) for t in batch)

    # No gradients are needed for inference; this saves memory and time.
    with torch.no_grad():
      outputs = model(b_input_ids, b_input_mask)
    logits = outputs[0].detach().cpu()

    # Score every row of the batch, not just the first one.
    probs = F.softmax(logits, dim=-1)
    _, batch_predictions = torch.max(logits, dim=-1)

    predictions.extend(batch_predictions)
    prob_list.extend(probs.numpy().tolist())

  # Return only after all batches have been processed.
  return predictions , prob_list

# Load Dataset

In [None]:
def get_tsa(path='data/Sentiment-Analysis-Dataset/Sentiment Analysis Dataset.csv'):
    """Load the comma-separated sentiment dataset into a DataFrame of strings.

    The file is parsed by hand because the last column may itself contain
    commas: any fields beyond the header width are glued back onto the last
    column and the surrounding double quotes are stripped from it.

    Args:
        path: location of the CSV file; defaults to the original relative path.

    Returns:
        pandas.DataFrame with one row per non-blank data line and the header
        fields as column names. All values are strings.

    Raises:
        ValueError: if the file contains no lines at all.
    """
    with open(path, 'rb') as file:
        lines = file.readlines()
    if not lines:
        raise ValueError(f'{path!r} is empty; expected a header row')

    # utf-8-sig drops a leading byte-order mark so the first column name is clean.
    header = lines[0].rstrip().decode('utf-8-sig').split(',')
    n_cols = len(header)

    data_list = []
    for line in lines[1:]:
        line = line.rstrip()
        if not line:
            continue  # blank lines would otherwise become junk rows
        props = line.decode('utf-8').split(',')
        if len(props) > n_cols:
            # Extra commas belong to the text column: re-join and unquote it.
            sentiment_text = ','.join(props[n_cols - 1:]).strip('"')
            data = props[:n_cols - 1]
            data.append(sentiment_text)
        else:
            data = props
        data_list.append(data)

    return pd.DataFrame(data_list, columns = header)

In [None]:
import torch.nn.functional as F
txts = ['I am so happy']
labels = [1]
predictions , prob_list = predict(model, tokenizer, txts,labels,maxlen = MAX_LENGTH, batch_size = BATCH_SIZE)

In [None]:
predictions

[tensor(1)]

In [None]:
prob_list

[[0.0042115263640880585, 0.9957885146141052]]

    print("Positive score:", probs[1])
    print("Negative score:", probs[0])

In [None]:
# NOTE(review): scratch cell. `outputs` is only assigned inside predict() and
# predict_sentiment() in the visible cells, so a model output must be bound to
# `outputs` at notebook scope before this can run.
outputs = outputs[0][0].cpu().detach()

# Dedented: the stray indentation here raised an IndentationError.
probs = F.softmax(outputs, dim=-1).cpu().detach().numpy().tolist()
_, prediction = torch.max(outputs, dim =-1)

IndentationError: ignored

In [None]:
# NOTE(review): scratch cell; `outputs` is not assigned at notebook scope in any
# visible cell - confirm where it is expected to come from.
# Takes element [0][0] of `outputs`, detaches it on the CPU, then keeps the index
# of its largest value along the last dimension.
outputs = outputs[0][0].cpu().detach()
_, prediction = torch.max(outputs, dim =-1)

In [None]:
def predict_sentiment(text, max_length=None):
    """Score a single text with the fine-tuned classifier.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        text: the string to classify.
        max_length: number of token ids fed to the model; defaults to MAX_LENGTH.

    Returns:
        (positive, negative): softmax probabilities of class index 1 and class
        index 0, as Python floats.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    # Truncation is requested explicitly so over-long inputs are cut by the
    # tokenizer (which keeps its special tokens) instead of relying on the
    # implicit fallback that max_length alone triggers.
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt',
    )

    def _right_pad(row):
        # Zero-filled (rows, max_length) tensor with `row` copied into its left edge.
        # NOTE(review): zeros are appended on the right, as the original
        # pad_sequences(..., padding="post") call did - presumably this matches
        # how the model was fine-tuned; confirm before switching padding side.
        padded = row.new_zeros((row.shape[0], max_length))
        width = min(row.shape[1], max_length)
        padded[:, :width] = row[:, :width]
        return padded

    input_ids = _right_pad(encoded_review['input_ids']).to(device)
    attention_mask = _right_pad(encoded_review['attention_mask']).to(device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] holds the logits; [0] selects the single row of this batch.
    logits = outputs[0][0].detach().cpu()
    probs = F.softmax(logits, dim=-1).numpy().tolist()

    return probs[1], probs[0]

In [None]:
import torch.nn.functional as F
text = 'I am so happy'
#labels = [1]
predict_sentiment(text)

Positive score: 0.9979650974273682
Negative score: 0.002034929348155856
Review text: I am so happy


In [None]:
predict_sentiment('I feel sad')

Positive score: 0.004747556056827307
Negative score: 0.9952524900436401
Review text: I feel sad


In [None]:
predict_sentiment('@blackhitop all i can say is wow!!! beautiful as always. have an awesome week!!!,tsinspired')

Positive score: 0.9974767565727234
Negative score: 0.002523185219615698
Review text: @blackhitop all i can say is wow!!! beautiful as always. have an awesome week!!!,tsinspired


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
rephraser_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
rephraser_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [None]:
def get_rephrased(tokenizer, model, text, num_return_sequences=15, max_length=256):
  """Sample paraphrases of `text` from a seq2seq paraphrasing model.

  Args:
    tokenizer: tokenizer providing `encode_plus` and `decode`.
    model: generation model providing `generate` and a `device` attribute.
    text: sentence to paraphrase. The "paraphrase: " task prefix is added when
      the text does not already start with "paraphrase:", so raw tweets and
      pre-formatted prompts are both accepted.
    num_return_sequences: how many sampled paraphrases to return.
    max_length: maximum length of each generated sequence.

  Returns:
    A list of `num_return_sequences` decoded strings (duplicates are possible
    because the outputs are sampled).
  """
  if not text.startswith("paraphrase:"):
    text = "paraphrase: " + text

  # A single sequence needs no padding; padding to the tokenizer maximum only
  # makes the encoder process filler tokens.
  encoding = tokenizer.encode_plus(text, return_tensors="pt")
  input_ids = encoding["input_ids"].to(model.device)
  attention_masks = encoding["attention_mask"].to(model.device)

  outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      max_length=max_length,
      do_sample=True,
      top_k=120,
      top_p=0.95,
      early_stopping=True,
      num_return_sequences=num_return_sequences
  )

  return [
      tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for output in outputs
  ]

In [None]:
s = 'fucking embarrassing. this team needs to get its fucking shit together. goddammit.'
text =  "paraphrase: " + s + " </s>"

result = get_rephrased(rephraser_tokenizer, rephraser_model, text)
print(result)

['This team needs to get his fucking shit together.', 'The team needs to get its fucking shit together.  Thanks everyone!', '... This team needs to get the Fucking Sick together.', 'This team needs to strut it together.', 'The fucking is embarrassing... this team needs to get their fucking shit together, goddammit!', 'This team must get its fucked shit together.', 'Fucking embarrassing: this team has to get together his fucking shit.', 'This team needs to get its fucking shit together.', 'The team needs to get together a cracking smack to get to it.', 'This team needs to bring their fucking shit together!', 'Those are embarrassing. This team needs to put its fucking shit together.', 'Fucking is embarrassing. This team needs to get the Fucking Shit together. Gotta do this.', 'This team needs to get its fucking shit together. goddammit.', 'fucking embarrassing. this team needs to get together her fucking shits', 'This team needs to get its fucking shit together. goddammit.']


In [None]:
for r in result:
  print(predict_sentiment(r))

Positive score: 0.0022140678483992815
Negative score: 0.9977859258651733
Review text: This team needs to get his fucking shit together.
None
Positive score: 0.010424972511827946
Negative score: 0.9895750880241394
Review text: The team needs to get its fucking shit together.  Thanks everyone!
None
Positive score: 0.0011630168883129954
Negative score: 0.9988370537757874
Review text: ... This team needs to get the Fucking Sick together.
None
Positive score: 0.9426029324531555
Negative score: 0.05739705264568329
Review text: This team needs to strut it together.
None
Positive score: 0.0015606119995936751
Negative score: 0.998439371585846
Review text: The fucking is embarrassing... this team needs to get their fucking shit together, goddammit!
None
Positive score: 0.0011853836476802826
Negative score: 0.998814582824707
Review text: This team must get its fucked shit together.
None
Positive score: 0.0010946408146992326
Negative score: 0.9989053010940552
Review text: Fucking embarrassing: thi

# Result

## Load Data

In [None]:
def get_opt(path, pess_threshold = -1, opt_threshold = 1):
    """Load annotated tweets and label them optimistic / pessimistic.

    Rows are labelled from the 'AverageAnnotation' column:
      * pessimistic (Target 0): value <= pess_threshold
      * optimistic  (Target 1): value >= opt_threshold, or strictly greater
        when both thresholds are equal (so the two classes never overlap)
      * everything else, including missing annotations, is treated as neutral
        and dropped.

    Args:
        path: CSV file with at least an 'AverageAnnotation' column.
        pess_threshold: upper bound (inclusive) of the pessimistic class.
        opt_threshold: lower bound of the optimistic class.

    Returns:
        DataFrame of the non-neutral rows with added 'Label' (str) and
        'Target' (int32) columns and a fresh 0..n-1 index.

    Raises:
        ValueError: if pess_threshold is greater than opt_threshold.
    """
    if pess_threshold > opt_threshold:
        raise ValueError('The pessimistic threshold is greater than the optimistic threshold')

    #path = 'data/tweets_annotation.csv'
    df = pd.read_csv(path)
    score = df['AverageAnnotation']

    is_pessimistic = score <= pess_threshold
    if pess_threshold == opt_threshold:
        is_optimistic = score > opt_threshold
    else:
        is_optimistic = score >= opt_threshold
    conditions = [is_optimistic, is_pessimistic]

    # Explicit defaults: rows matching neither condition (in-between scores and
    # NaN) must not silently fall into a class, and a default of the same type
    # as the choices avoids mixing strings with the implicit integer 0.
    df['Label'] = np.select(conditions, ['optimistic', 'pessimistic'], default='neutral')
    df['Target'] = np.select(conditions, [1, 0], default=-1)

    filtered_df = (
        df.loc[df['Label'] != 'neutral']
        .reset_index(drop=True)
        .astype({'Target': 'int32'})  # the cast result is kept, not discarded
    )

    return filtered_df

In [None]:
def get_tsa(path):
    """Load the comma-separated sentiment dataset at `path` into a DataFrame.

    The file is parsed by hand because the last column may itself contain
    commas: any fields beyond the header width are glued back onto the last
    column and the surrounding double quotes are stripped from it.

    Args:
        path: location of the CSV file.

    Returns:
        pandas.DataFrame with one row per non-blank data line and the header
        fields as column names. All values are strings.

    Raises:
        ValueError: if the file contains no lines at all.
    """
    with open(path, 'rb') as file:
        lines = file.readlines()
    if not lines:
        raise ValueError(f'{path!r} is empty; expected a header row')

    # utf-8-sig drops a leading byte-order mark so the first column name is clean.
    header = lines[0].rstrip().decode('utf-8-sig').split(',')
    n_cols = len(header)

    data_list = []
    for line in lines[1:]:
        line = line.rstrip()
        if not line:
            continue  # blank lines would otherwise become junk rows
        props = line.decode('utf-8').split(',')
        if len(props) > n_cols:
            # Extra commas belong to the text column: re-join and unquote it.
            sentiment_text = ','.join(props[n_cols - 1:]).strip('"')
            data = props[:n_cols - 1]
            data.append(sentiment_text)
        else:
            data = props
        data_list.append(data)

    return pd.DataFrame(data_list, columns = header)

In [None]:
import pandas as pd
df = get_tsa('/content/drive/MyDrive/1027/Sentiment Analysis Dataset.csv')

In [None]:
import pandas as pd
df_opt = get_opt('/content/drive/MyDrive/1027/tweets_annotation.csv')

In [None]:
filtered_df = df_opt.loc[(df_opt['Label'] == 'pessimistic'),df_opt.columns]

## Apply prediction

In [None]:
def paraphrase_K(pess_tweets):
  """Print each paraphrase whose positive score beats its source tweet's by more than 0.5.

  For every tweet, the tweet itself and each paraphrase produced by
  `get_rephrased` are scored with `predict_sentiment`; qualifying pairs are
  printed with their positive scores as percentages. Nothing is returned.
  """
  for original in pess_tweets:
    base_positive, _ = predict_sentiment(original)
    candidates = get_rephrased(rephraser_tokenizer, rephraser_model, original)
    for candidate in candidates:
      candidate_positive, _ = predict_sentiment(candidate)
      gain = candidate_positive - base_positive
      if not gain > 0.5:
        continue
      print("Original tweet: ", original)
      print("Original positive %:",base_positive*100)
      print("Paraphrased tweet: ", candidate)
      print("Paraphrased positive %:",candidate_positive*100)
      print()

In [None]:
paraphrase_K(filtered_df['Tweet'])

Original tweet:  seattle your a fucking mess. get it together. please. your embarrassing us
Original positive %: 0.5319199059158564
Paraphrased tweet:  Please join me in sharing your embarrassing mess seattle by sending it up to you.
Paraphrased positive %: 77.92918682098389

Original tweet:  like.. why you has no drive or ambition?
Original positive %: 0.20106101874262094
Paraphrased tweet:  You always want to please us by leaving behind your drive or ambition?
Paraphrased positive %: 82.56936073303223

Original tweet:  all eht extra shit for the birds buddy .
Original positive %: 0.9263070300221443
Paraphrased tweet:  All eht adds one extra bit for the bird - buddy.
Paraphrased positive %: 99.38238859176636

Original tweet:  all eht extra shit for the birds buddy .
Original positive %: 0.9263070300221443
Paraphrased tweet:  All eht extra haut for the birds buddy.
Paraphrased positive %: 97.96828627586365

Original tweet:  this episode of ahs holy shit
Original positive %: 0.988777913

In [None]:
def paraphrase_K(pess_tweets):
  """Collect, per tweet, the paraphrases that score as more positive.

  Args:
    pess_tweets: iterable of tweet strings.

  Returns:
    A list with one dict per tweet:
      'initial'  - the original tweet,
      'positive' / 'negative' - the original tweet's scores,
      'result'   - True when at least one paraphrase has a higher positive score,
      'detail'   - list of dicts ('paraphrased', 'difference', 'positive',
                   'negative') for every such paraphrase.
  """
  results = []
  for pess_tweet in pess_tweets:
    main_positive, main_negative = predict_sentiment(pess_tweet)
    int_paraphrased_tweets = get_rephrased(rephraser_tokenizer, rephraser_model, pess_tweet)

    # Created once per tweet so improvements from all candidates accumulate.
    paraphrased_tweet = []
    for int_paraphrased_tweet in int_paraphrased_tweets:
      positive, negative = predict_sentiment(int_paraphrased_tweet)
      if positive > main_positive:
        paraphrased_tweet.append({
            'paraphrased': int_paraphrased_tweet,
            'difference': positive-main_positive,
            'positive': positive,
            'negative': negative
        })

    results.append({
        'initial': pess_tweet,
        'positive': main_positive,
        'negative': main_negative,
        'result' : len(paraphrased_tweet) > 0,
        'detail': paraphrased_tweet,
    })
  return results

In [None]:
def opt_paraphrase(df):
  """Score every tweet in `df` and look for more positive paraphrases of negative ones.

  Args:
    df: DataFrame with 'SentimentText' and 'Sentiment' columns.

  Returns:
    A list with one entry per row, each a list of
      [row position, stripped tweet, df 'Sentiment' value, positive score,
       negative score, 'pess' or 'opt', found-improvement flag, details]
    where details holds [paraphrase, score gain, positive, negative] for every
    paraphrase whose positive score exceeds the original's. Tweets labelled
    'opt' are not paraphrased and always get (False, []).
  """
  results = []
  for i in range(len(df)):
    row = df.iloc[i]
    pess_tweet = row['SentimentText'].strip()
    main_positive, main_negative = predict_sentiment(pess_tweet)

    # Created once per tweet so improvements from all candidates accumulate.
    paraphrased_tweet = []
    if main_negative > main_positive:
      label = 'pess'
      int_paraphrased_tweets = get_rephrased(rephraser_tokenizer, rephraser_model, pess_tweet)
      for int_paraphrased_tweet in int_paraphrased_tweets:
        positive, negative = predict_sentiment(int_paraphrased_tweet)
        if positive > main_positive:
          paraphrased_tweet.append([
              int_paraphrased_tweet,
              positive-main_positive,
              positive,
              negative
          ])
    else:
      label = 'opt'

    results.append([
        i,
        pess_tweet,
        row['Sentiment'],
        main_positive,
        main_negative,
        label,
        len(paraphrased_tweet) > 0,
        paraphrased_tweet
    ])
  return results

In [None]:
prec = opt_paraphrase(df[:2])

In [None]:
df[:2]

Unnamed: 0,﻿ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...


In [None]:
prec

[[0,
  'is so sad for my APL friend.............',
  '0',
  0.0005536296521313488,
  0.9994463324546814,
  'pess',
  True,
  [['It is so sad for my APL friend.....................',
    2.154591493308544e-05,
    0.0005751755670644343,
    0.9994248151779175]]],
 [1,
  'I missed the New Moon trailer...',
  '0',
  0.9990065693855286,
  0.0009934039553627372,
  'opt',
  False,
  []]]

In [None]:
prediction = opt_paraphrase(df)