### Download and import packages

In [32]:
'''
Installing dependencies...
'''
# uninstall
!pip uninstall -y wandb
# download
!pip install transformers
!pip install google-play-scraper
#----------------------------------------------------------------------------------------------------------------------------------------------------------------
'''
Importing modules
'''
import re
import json
import torch
import random
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from google_play_scraper import app,Sort,reviews_all
import pandas as pd
import numpy as np
from google_play_scraper import Sort, reviews



In [33]:
result, continuation_token = reviews(
    'cris.org.in.prs.ima', 
    lang='en', 
    country='us',
    count=10000,
    sort=Sort.NEWEST, 
)
#Assembling reviews into a pandas dataframe for better pipelining
df_review=pd.DataFrame(np.array(result),columns=['review'])
df_review=df_review.join(pd.DataFrame(df_review.pop('review').tolist()))
features=df_review['content']
targets=df_review['score']
features=list(features)
targets=list(targets)
new_targets=[]
num_pos=0
num_neg=0
for rows in range(len(targets)):
    temp_score=targets[rows]
    if temp_score>=3:
        new_targets.append(1)
        num_pos+=1
    else:
        new_targets.append(0)
        num_neg+=1
features=list(features)
targets=list(new_targets)

### Dataset load and prep functions

In [34]:
# Dataset class
class SentimentDataset(Dataset):
    def __init__(self, txt_list, label_list, tokenizer, max_length):
        # define variables    
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        map_label = {0:'negative', 1: 'positive'}
        # iterate through the dataset
        for txt, label in zip(txt_list, label_list):
            # prepare the text
            prep_txt = f'<|startoftext|>Review: {txt}\nSentiment: {map_label[label]}<|endoftext|>'
            # tokenize
            encodings_dict = tokenizer(prep_txt, truncation=True,
                                       max_length=max_length, padding="max_length")
            # append to list
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            self.labels.append(map_label[label])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx], self.labels[idx]

# Data load function
def load_sentiment_dataset(tokenizer, random_seed = 42, file_path="../input/sentiment140/training.1600000.processed.noemoticon.csv"):
    
    df=pd.DataFrame({
        "text":features,
        "label":targets
        
    })
    def pick_first_n_words(string, max_words=250): # tried a few max_words, kept 250 as max tokens was < 512
        split_str = string.split()
        return " ".join(split_str[:min(len(split_str), max_words)])

    df['text'] = df['text'].apply(lambda x: pick_first_n_words(x))
    
    # divide into test and train
    X_train, X_test, y_train, y_test = \
              train_test_split(df['text'].tolist(), df['label'].tolist(),
              shuffle=True, test_size=0.05, random_state=random_seed, stratify=df['label'])

    # get max length
    max_length_train = max([len(tokenizer.encode(text)) for text in X_train])
    max_length_test = max([len(tokenizer.encode(text)) for text in X_test])
    max_length = max([max_length_train, max_length_test]) + 10  #for special tokens (sos and eos) and fillers
    max_length = max(max_length, 300)
    print(f"Setting max length as {max_length}")

    # format into SentimentDataset class
    train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_length=max_length)

    # return
    return train_dataset, (X_test, y_test)

### Load model and tokenizer; Call data Prep

In [35]:
# import 
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

# model
model_name = "gpt2"
seed = 42

# seed
torch.manual_seed(seed)

<torch._C.Generator at 0x7b2154199570>

In [37]:
features=features[:5000]
targets=targets[:5000]

In [39]:

model = GPT2LMHeadModel.from_pretrained(model_name).cuda()
model.resize_token_embeddings(len(tokenizer))

epochs=1
print("Loading model...")

# load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')

for trial_no in range(epochs):
 
    model=model.train()
    print("Loading dataset...")
    train_dataset, test_dataset = load_sentiment_dataset(tokenizer)

    print("Epoch: {}".format(trial_no))
    training_args = TrainingArguments(output_dir='results', num_train_epochs=2, 
                                    logging_steps=10, load_best_model_at_end=True,
                                      save_strategy="epoch", per_device_train_batch_size=2, per_device_eval_batch_size=2,
                                    warmup_steps=100, weight_decay=0.01, logging_dir='logs')

    Trainer(model=model, args=training_args, train_dataset=train_dataset,
            eval_dataset=test_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                                  'attention_mask': torch.stack([f[1] for f in data]),
                                                                  'labels': torch.stack([f[0] for f in data])}).train()
    
    # test
    print("Start testing...")
    # eval mode on model
    model = model.eval()

    # compute prediction on test data
    original, predicted, all_text, predicted_text = [], [], [], []
    map_label = {0:'negative', 1: 'positive'}
    with torch.no_grad():
        for text, label in tqdm(zip(test_dataset[0], test_dataset[1])):
            # predict sentiment on test data
            prompt = f'<|startoftext|>Review: {text}\nSentiment:'
            generated = tokenizer(f"<|startoftext|> {prompt}", return_tensors="pt").input_ids.cuda()
            sample_outputs = model.generate(generated, do_sample=False, top_k=50, max_length=512, top_p=0.90, 
                    temperature=0, num_return_sequences=0)
            pred_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
            # extract the predicted sentiment
            try:
                pred_sentiment = re.findall("\nSentiment: (.*)", pred_text)[-1]
            except:
                pred_sentiment = "None"
            original.append(map_label[label])
            predicted.append(pred_sentiment)
            all_text.append(text)
            predicted_text.append(pred_text)
    #transform into dataframe
    df = pd.DataFrame({'text': all_text, 'predicted': predicted, 'original': original, 'predicted_text': predicted_text})
    df.to_csv(f"result_run_{trial_no}.csv", index=False)
    # compute f1 score
    print(f1_score(original, predicted, average='macro'))

Loading model...


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Loading dataset...
Setting max length as 961
Epoch: 0


Step,Training Loss
10,8.5739
20,7.4065
30,4.1673
40,1.316
50,0.4844
60,0.1866
70,0.132
80,0.0689
90,0.1587
100,0.0954


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
1it [00:00,  6.15it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Start testing...


2it [00:00,  6.52it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
3it [00:00,  6.13it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
4it [00:00,  6.71it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
5it [00:00,  6.40it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
6it [00:00,  6.30it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
7it [00:01,  6.37it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
8it [00:01,  6.35it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
9it [00:01,  6.18it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
10it [00:01,  6.20it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
11it [00:01,  6.10it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
12it [00:01,  6.17it/s]Setting `pad_token_id` to `eos_toke

0.9226059438635114





In [3]:
import torch
torch.save(model,'GPT-2-IRCTC.pt')

NameError: name 'model' is not defined

In [4]:
model

NameError: name 'model' is not defined