In [None]:
pip install tokenizers

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import RobertaTokenizer, TFRobertaModel
import re
from collections import Counter
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Checkpoint name handed to both from_pretrained() calls below.
MODEL_NAME = 'roberta-base'
# Length of the Keras input layers built inside the analyzer functions.
MAX_LEN = 50
# Loaded once at notebook level; the analyzer functions read these as globals.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)

In [None]:
# Takes a sentence (typed at the prompt, or passed as `text`) and returns a
# negative / neutral / positive prediction for it.
def sentiment_analyzer(text=None):
    """Classify one sentence as negative, neutral or positive.

    Parameters
    ----------
    text : str, optional
        Sentence to classify. When omitted, the sentence is read
        interactively with ``input('Please Enter Text: ')``.

    Returns
    -------
    str
        ``'Given Sentence is Negative'``, ``'Given Sentence is Neutral'`` or
        ``'Given Sentence is Positive'`` for predicted class index 0, 1 or 2.

    Notes
    -----
    Uses the notebook-level ``tokenizer``, ``roberta_model`` and ``MAX_LEN``.
    The Keras classifier is built and its weights are loaded from
    ``/content/drive/MyDrive/roberta.h5`` only on the first call; later calls
    reuse the model cached on ``sentiment_analyzer._model``.
    """
    if text is None:
        text = input('Please Enter Text: ')

    def decontracted(phrase):
        """Expand common English contractions (e.g. "won't" -> "will not")."""
        substitutions = (
            # specific
            (r"won\'t", "will not"),
            (r"can\'t", "can not"),
            # general
            (r"n\'t", " not"),
            (r"\'re", " are"),
            (r"\'s", " is"),
            (r"\'d", " would"),
            (r"\'ll", " will"),
            (r"\'t", " not"),
            (r"\'ve", " have"),
            (r"\'m", " am"),
        )
        for pattern, replacement in substitutions:
            phrase = re.sub(pattern, replacement, phrase)
        return phrase

    def preprocess(txt):
        """Strip URLs, @mentions, #hashtags, '.com' words and non-letters; lowercase."""
        txt = txt.replace('&amp;', 'and')
        txt = re.sub(r'http\S+', '', txt)
        txt = decontracted(txt)
        kept_words = [
            word for word in txt.split()
            if not word.startswith(('@', '#')) and not word.endswith('.com')
        ]
        txt = re.sub('[^A-Za-z ]+', '', " ".join(kept_words))
        return txt.lower().strip()

    def tokenization_(dataset):
        """Turn cleaned sentences into (token ids, attention mask, segment ids) arrays.

        Every row has exactly MAX_LEN entries: one leading marker, MAX_LEN - 2
        word-piece slots (truncated or padded) and one trailing marker.
        """
        allowed = MAX_LEN - 2  # room left after the two boundary markers
        dataset_tokens, dataset_mask, dataset_segment = [], [], []
        for sentence in dataset:
            pieces = tokenizer.tokenize(sentence)[:allowed]
            pieces += ['[PAD]'] * (allowed - len(pieces))
            # NOTE(review): these are BERT-style markers; RoBERTa's own special
            # tokens are <s>, </s> and <pad>. They are kept unchanged because the
            # saved weights were presumably trained with this exact encoding --
            # confirm against the training notebook before changing them.
            tokens = ['[CLS]', *pieces, '[SEP]']
            dataset_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
            dataset_mask.append([0 if tok == '[PAD]' else 1 for tok in tokens])
            dataset_segment.append(np.array([0] * MAX_LEN))
        return np.array(dataset_tokens), np.array(dataset_mask), np.array(dataset_segment)

    # preprocess() already expands contractions, so no separate decontracted() pass.
    cleaned = preprocess(text)
    word_ids, attention_mask, type_ids = tokenization_([cleaned])
    features = {
        'input_word_ids': word_ids,
        'input_mask': attention_mask,
        'input_type_ids': type_ids
        }

    # Build the classifier and read the weights once; rebuilding on every call
    # re-wraps the shared roberta_model and re-reads the weight file each time.
    model = getattr(sentiment_analyzer, '_model', None)
    if model is None:
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # First element of the RoBERTa output feeds the classification head.
        encoded = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)[0]

        hidden = tf.keras.layers.Dropout(0.1)(encoded)
        hidden = tf.keras.layers.Flatten()(hidden)
        hidden = tf.keras.layers.Dense(256, activation='relu')(hidden)
        probabilities = tf.keras.layers.Dense(3, activation='softmax')(hidden)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=probabilities)
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),loss='categorical_crossentropy',metrics=['accuracy'])
        model.load_weights('/content/drive/MyDrive/roberta.h5')
        sentiment_analyzer._model = model

    pred = model.predict(features)
    labels = ('Negative', 'Neutral', 'Positive')  # class index -> label
    return 'Given Sentence is {}'.format(labels[int(np.argmax(pred))])


In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: i am sad


'Given Sentence is Negative'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: i am happy to hear that


'Given Sentence is Positive'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: he is in pain


'Given Sentence is Negative'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()


Please Enter Text: i indirectly liked it


'Given Sentence is Positive'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: American biographical drama film directed by Gabriele Muccino and starring Will Smith as Chris Gardner, a homeless salesman. Smith's son Jaden Smith co-stars, making his film debut as Gardner's son, Christopher Jr. The screenplay by Steven Conrad is based on the best-selling 2006 memoir of the same name written by Gardner with Quincy Troupe.


'Given Sentence is Positive'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: While Gardner is trying to sell one of the scanners, he meets Jay Twistle


'Given Sentence is Positive'

In [None]:
# Interactive demo: prompts for a sentence and displays the returned sentiment label.
sentiment_analyzer()

Please Enter Text: Gardner's unpaid internship does not please Linda, who eventually leaves for New York,


'Given Sentence is Negative'

In [None]:
# Takes a sentence and its actual class label from the user and prints whether
# the model's prediction for that sentence matches the label (1) or not (0).
def Performance_sentiment_analyzer(text=None, target=None):
    """Score the model's prediction for one sentence against a user-given label.

    Parameters
    ----------
    text : str, optional
        Sentence to classify. When omitted, it is read interactively with
        ``input('Please Enter Text: ')``.
    target : str or int, optional
        Actual class: 0 (Negative), 1 (Neutral) or 2 (Positive). When omitted,
        or when the value is not one of those three, the user is prompted
        until a valid class is entered.

    Returns
    -------
    None
        The result is printed as ``'Accuracy score is 1'`` when the predicted
        class equals ``target`` and ``'Accuracy score is 0'`` otherwise.

    Notes
    -----
    Uses the notebook-level ``tokenizer``, ``roberta_model`` and ``MAX_LEN``.
    The Keras classifier is built and its weights are loaded from
    ``/content/drive/MyDrive/roberta.h5`` only on the first call; later calls
    reuse the model cached on ``Performance_sentiment_analyzer._model``.
    """
    if text is None:
        text = input('Please Enter Text: ')

    target_prompt = 'Please mention sentiment\n 0:Negative\n 1:Neutral\n 2:Positive: '
    if target is None:
        target = input(target_prompt)
    target = str(target)
    # Accept only the three real class ids. str.isdigit() alone would also let
    # through values such as '7', or '²' (which int() cannot parse).
    while target.strip() not in ('0', '1', '2'):
        target = input(target_prompt)
    expected_class = int(target)

    def decontracted(phrase):
        """Expand common English contractions (e.g. "won't" -> "will not")."""
        substitutions = (
            # specific
            (r"won\'t", "will not"),
            (r"can\'t", "can not"),
            # general
            (r"n\'t", " not"),
            (r"\'re", " are"),
            (r"\'s", " is"),
            (r"\'d", " would"),
            (r"\'ll", " will"),
            (r"\'t", " not"),
            (r"\'ve", " have"),
            (r"\'m", " am"),
        )
        for pattern, replacement in substitutions:
            phrase = re.sub(pattern, replacement, phrase)
        return phrase

    def preprocess(txt):
        """Strip URLs, @mentions, #hashtags, '.com' words and non-letters; lowercase."""
        txt = txt.replace('&amp;', 'and')
        txt = re.sub(r'http\S+', '', txt)
        txt = decontracted(txt)
        kept_words = [
            word for word in txt.split()
            if not word.startswith(('@', '#')) and not word.endswith('.com')
        ]
        txt = re.sub('[^A-Za-z ]+', '', " ".join(kept_words))
        return txt.lower().strip()

    def tokenization_(dataset):
        """Turn cleaned sentences into (token ids, attention mask, segment ids) arrays.

        Every row has exactly MAX_LEN entries: one leading marker, MAX_LEN - 2
        word-piece slots (truncated or padded) and one trailing marker.
        """
        allowed = MAX_LEN - 2  # room left after the two boundary markers
        dataset_tokens, dataset_mask, dataset_segment = [], [], []
        for sentence in dataset:
            pieces = tokenizer.tokenize(sentence)[:allowed]
            pieces += ['[PAD]'] * (allowed - len(pieces))
            # NOTE(review): these are BERT-style markers; RoBERTa's own special
            # tokens are <s>, </s> and <pad>. They are kept unchanged because the
            # saved weights were presumably trained with this exact encoding --
            # confirm against the training notebook before changing them.
            tokens = ['[CLS]', *pieces, '[SEP]']
            dataset_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
            dataset_mask.append([0 if tok == '[PAD]' else 1 for tok in tokens])
            dataset_segment.append(np.array([0] * MAX_LEN))
        return np.array(dataset_tokens), np.array(dataset_mask), np.array(dataset_segment)

    # preprocess() already expands contractions, so no separate decontracted() pass.
    cleaned = preprocess(text)
    word_ids, attention_mask, type_ids = tokenization_([cleaned])
    features = {
        'input_word_ids': word_ids,
        'input_mask': attention_mask,
        'input_type_ids': type_ids
        }

    # Build the classifier and read the weights once; rebuilding on every call
    # re-wraps the shared roberta_model and re-reads the weight file each time.
    model = getattr(Performance_sentiment_analyzer, '_model', None)
    if model is None:
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # First element of the RoBERTa output feeds the classification head.
        encoded = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)[0]

        hidden = tf.keras.layers.Dropout(0.1)(encoded)
        hidden = tf.keras.layers.Flatten()(hidden)
        hidden = tf.keras.layers.Dense(256, activation='relu')(hidden)
        probabilities = tf.keras.layers.Dense(3, activation='softmax')(hidden)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=probabilities)
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),loss='categorical_crossentropy',metrics=['accuracy'])
        model.load_weights('/content/drive/MyDrive/roberta.h5')
        Performance_sentiment_analyzer._model = model

    predicted_class = int(np.argmax(model.predict(features)))
    # Single-sample "accuracy": 1 on a match, 0 otherwise.
    acc = int(expected_class == predicted_class)
    print('Accuracy score is {}'.format(acc))

In [None]:
# Interactive demo: prompts for a sentence and its true class, then prints the 0/1 score.
Performance_sentiment_analyzer()

Please Enter Text: Gardner's unpaid internship does not please Linda, who eventually leaves for New York,
Please mention sentiment
 0:Negative
 1:Neutral
 2:Positive: 0
Accuracy score is 1


In [None]:
# Interactive demo: prompts for a sentence and its true class, then prints the 0/1 score.
Performance_sentiment_analyzer()

Please Enter Text: Gardner's unpaid internship does not please Linda, who eventually leaves for New York,
Please mention sentiment
 0:Negative
 1:Neutral
 2:Positive: 
Please mention sentiment
 0:Negative
 1:Neutral
 2:Positive: 2
Accuracy score is 0
