In [None]:
import numpy # linear algebra
import pandas as pd 
import tensorflow as tf# Get the GPU device name.
import torch# If there's a GPU available...
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer, BertModel, glue_convert_examples_to_features
from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler
import tensorflow_datasets as tfds
from sklearn import metrics
import itertools
# SELECTING THE TOKENIZER, THE MODEL AND THE NUMBER OF EPOCHS   #######################################################################
# Number of full passes the training loop further down makes over the data.
Epochs=50


# Tokenizer and classifier are loaded from the same pretrained checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels = 3,  # size of the classification head; class_names below lists three classes
    output_attentions = False,  # attention weights are not requested from the model
    output_hidden_states = False,  # per-layer hidden states are not requested either
)



# DEFINING FUNCTIONS #######################################################################
def to_sentiment(rating):
    """Translate a raw rating into a zero-based class index.

    The rating is coerced with ``int()`` first. Ratings -1, 0, 1 and 2 become
    0, 1, 2 and 3 respectively; any other integer yields ``None``.

    NOTE(review): the classifier in this notebook is created with three
    labels, so index 3 (rating 2) would be out of range for it — confirm that
    rating 2 never appears in the data.
    """
    index_by_rating = {-1: 0, 0: 1, 1: 2, 2: 3}
    return index_by_rating.get(int(rating))
def flat_accuracy(preds, labels):
    """Return the share of rows in ``preds`` whose arg-max matches the label.

    preds:  torch tensor; it is iterated along its first dimension and the
            arg-max of each row is taken as the predicted class.
    labels: torch tensor of gold classes; it is flattened before comparison.
    """
    # Each row is moved to the CPU and detached before numpy takes its arg-max.
    predicted = [row.cpu().detach().numpy().argmax() for row in preds]
    gold = labels.flatten().cpu().numpy()
    # Comparing the list with the array is element-wise, so the sum is the hit count.
    hits = numpy.sum(predicted == gold)
    return hits / len(gold)
def flat_accuracy_v2(preds, labels):
    """Like ``flat_accuracy`` but also hands back the compared values.

    Returns a 3-tuple ``(accuracy, gold, predicted)`` where ``gold`` is the
    flattened label tensor as a numpy array and ``predicted`` is a Python list
    holding the arg-max of every row of ``preds``.
    """
    predicted = [row.cpu().detach().numpy().argmax() for row in preds]
    gold = labels.flatten().cpu().numpy()
    accuracy = numpy.sum(predicted == gold) / len(gold)
    return accuracy, gold, predicted

# SELECTING DATASETS FOR TRAINING AND TESTING #######################################################################

# Review data: the English and Hindi CSV files are stacked into one frame.
df_EN = pd.read_csv("../input/mbert-5-lang-absa/ABSA_MBERT_21 - English.csv")
df_HI = pd.read_csv("../input/mbert-5-lang-absa/Semeval16 Eng to HindiBangla and Tamil - Hindi.csv")
df = pd.concat([df_EN,df_HI],ignore_index=True)
# Shuffle all rows. No random_state is passed, so the order differs between runs,
# and the index is left in shuffled order (it is not reset).
df = df.sample(frac = 1) 


# Tweet data: the train and test CSV files are merged into a single frame
# named dftest; it is split again into train/validation parts further down.
dftrain= pd.read_csv("../input/iiittweets/iIIT_tweets_train.csv")
dftest = pd.read_csv("../input/iiittweets/iIIT_tweets_test.csv")
dftest = pd.concat([dftest,dftrain],ignore_index=True)
# dftest = pd.read_csv("../input/iiittweets/I-IIT NLP annotations - Concatenated Cleaned.csv")
# Replace the raw sentiment ratings of the tweets with class indices.
dftest['sentiment'] = dftest.sentiment.apply(to_sentiment)
# ====================================================================================


# Choose the device that the model and every batch will be moved to.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: report how many GPUs exist and the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
# Move the classifier's parameters onto the selected device.
model.to(device)

# Length budget for the review encodings: the largest number of space
# characters found in any single review, plus 5 of headroom. Spaces are
# counted, not tokens.
# NOTE(review): word-piece tokenisation usually yields more tokens than
# whitespace-separated words, so this budget may cut long reviews short when it
# is used as max_length — confirm this is intended.
l = [review.count(" ") for review in df.reviews]
mlen = max(l) + 5

# Replace the raw sentiment ratings of the reviews with class indices.
df['sentiment'] = df.sentiment.apply(to_sentiment)
class_names = ['negative', 'neutral', 'positive']

# ENCODING THE TRAIN DATA #######################################################################
# One (1, max_length) tensor per review is collected in each list and the
# lists are concatenated into single tensors after the loop.
input_ids = []
attention_masks = []


# Every review is encoded together with its aspect as a text pair.
for sent,asp in zip(df['reviews'],df['aspect']):
    # NOTE(review): pad_to_max_length is deprecated in newer transformers
    # releases in favour of padding='max_length' together with truncation=True —
    # confirm against the installed version before changing it.
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        asp,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = min(mlen,270),           # Length budget from the space count, capped at 270.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
       
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-review tensors along the first dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# NOTE(review): df was shuffled earlier without resetting its index; this
# presumably reads the Series values in positional order (matching the zip
# above) — confirm, or pass df['sentiment'].to_numpy() to make it explicit.
labels = torch.tensor(df['sentiment'])

# CREATING TRAIN DATALOADERS #######################################################################

# Each dataset item is the triple (input ids, attention mask, label).
dataset = TensorDataset(input_ids, attention_masks, labels)

# 65% of the review samples go to training, the remainder to validation.
train_size = int(0.65 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
# No generator/seed is passed, so the split differs between runs.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))



batch_size = 16
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# This loader is only consumed by the pre-training ("untuned") accuracy check.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# ENCODING TEST DATA AND CREATING TEST DATALOADERS #######################################################################

# One (1, max_length) tensor per tweet is collected in each list and the lists
# are concatenated into single tensors after the loop.
TD_input_ids = []
TD_attention_masks = []

# Length budget for the tweet encodings: the largest number of space
# characters found in any single tweet, plus 5 of headroom. This rebinds the
# mlen value that was computed for the reviews.
l = [tweet.count(" ") for tweet in dftest.tweet]
mlen = max(l) + 5


# Every tweet is encoded together with its brand as a text pair.
for sent,asp in zip(dftest['tweet'],dftest['brand']):
    # NOTE(review): pad_to_max_length is deprecated in newer transformers
    # releases in favour of padding='max_length' together with truncation=True —
    # confirm against the installed version before changing it.
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        asp,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = min(mlen,270),           # Length budget from the space count, capped at 270.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    TD_input_ids.append(encoded_dict['input_ids'])
    TD_attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-tweet tensors along the first dimension and bundle them with
# the labels; each dataset item is (input ids, attention mask, label).
TD_input_ids = torch.cat(TD_input_ids, dim=0)
TD_attention_masks = torch.cat(TD_attention_masks, dim=0)
TD_labels = torch.tensor(dftest['sentiment'])
TD_dataset = TensorDataset(TD_input_ids,TD_attention_masks, TD_labels)

# Disabled alternative: a single loader over the whole tweet dataset.
# TD_dataloader = DataLoader(
#             TD_dataset,  
#             sampler = RandomSampler(TD_dataset), # Select batches randomly
#             batch_size = batch_size # Trains with this batch size.
#         )

# TRAINING PHASE #######################################################################


# dltest=TD_dataloader

# 70% of the tweet samples go to training, the remainder to validation.
TD_train_size = int(0.70 * len(TD_dataset))
TD_val_size = len(TD_dataset) - TD_train_size

# Divide the dataset by randomly selecting samples.
# No generator/seed is passed, so the split differs between runs.
TD_train_dataset, TD_val_dataset = random_split(TD_dataset, [TD_train_size, TD_val_size])

print('{:>5,} training samples'.format(TD_train_size))
print('{:>5,} validation samples'.format(TD_val_size))



# Rebinds batch_size to the same value that the review loaders use.
batch_size = 16
TD_train_dataloader = DataLoader(
            TD_train_dataset,  # The training samples.
            sampler = RandomSampler(TD_train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# Sequential order, so the validation batches come in the same order each time.
TD_validation_dataloader = DataLoader(
            TD_val_dataset, # The validation samples.
            sampler = SequentialSampler(TD_val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# --------------------------------------------------------------------------------------




acc=[]
# Optimiser over all model parameters; it is used by the training loop below.
optim = AdamW(model.parameters(), lr=5e-6)

# Baseline: mean per-batch accuracy of the not-yet-fine-tuned model on the
# review validation split.
model.eval()
test_res=[]
# No gradients are needed for evaluation; disabling autograd avoids building
# a graph for every batch.
with torch.no_grad():
    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        # With labels supplied the model output is (loss, logits, ...).
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        test_res.append(flat_accuracy(outputs[1],labels))
print("UNTUNED ACCURACY==>",sum(test_res)/len(test_res))


# Per-epoch bookkeeping: position e of every list belongs to epoch e.
score_f1=[]      # macro F1 on the tweet validation split
acc_s=[]         # mean per-batch accuracy on the tweet validation split
in_sent=[]       # decoded validation inputs, one inner list per batch
y_gold=[]        # gold labels of the validation split (numpy array)
y_predicted=[]   # predicted labels of the validation split (numpy array)
score_all=[]     # classification_report text

for epoch in range(Epochs):
    print("Epoch:",epoch+1," of ",Epochs)
    c=0
    l=len(TD_train_dataloader)+len(train_dataloader)
    train_res=[]

    # ---- training: all review batches first, then all tweet batches ----
    model.train()
    for batch in itertools.chain(train_dataloader, TD_train_dataloader):
        c+=1
        print("Progress {:2.1%}".format(c/ l), end="\r")
        optim.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        # With labels supplied the model output is (loss, logits, ...).
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        train_res.append(flat_accuracy(outputs[1],labels))
        loss.backward()
        optim.step()
    print("TRAIN ACCURACY==>",sum(train_res)/len(train_res))

    # ---- evaluation on the tweet validation split ----
    model.eval()
    test_res=[]
    in_sen=[]
    gold_batches=[]
    pred_batches=[]
    # Autograd is switched off: nothing is back-propagated during evaluation.
    with torch.no_grad():
        for batch in TD_validation_dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # One call yields the accuracy, the gold labels and the predictions.
            batch_acc, batch_gold, batch_pred = flat_accuracy_v2(outputs[1],labels)
            test_res.append(batch_acc)
            gold_batches.append(batch_gold)
            pred_batches.append(batch_pred)

            # Turn the ids back into text (special tokens dropped) so the
            # predictions can be inspected next to their inputs.
            inp=[]
            for ids in batch[0].cpu().numpy():
                tokens=tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
                inp.append(tokenizer.convert_tokens_to_string(tokens))
            in_sen.append(inp)

    in_sent.append(in_sen)
    acc_s.append(sum(test_res)/len(test_res))

    # Flatten the per-batch results into one array each.
    ll=numpy.concatenate(gold_batches)
    pp=numpy.array(list(itertools.chain.from_iterable(pred_batches)))

    y_gold.append(ll)
    y_predicted.append(pp)

    x=metrics.classification_report(ll,pp, digits=4, zero_division=0)
    print(" TEST ACCURACY:\n ",x)
    score_all.append(x)
    macrof1=metrics.f1_score(ll,pp, average='macro', sample_weight=None, zero_division=0)
    score_f1.append(macrof1)
     
# print(acc_s.index(max(acc_s))+1)    
# print(len(in_sent),len(y_gold),len(y_/predicted),len(score_f1))

#  #######################################################################

# Pick the epoch with the highest macro F1 (the earliest one on ties) and
# collect its decoded inputs, gold labels and predictions in one frame.
best_f1 = max(score_f1)
idx = score_f1.index(best_f1)
df = pd.DataFrame().assign(
    Input=list(itertools.chain.from_iterable(in_sent[idx])),
    y_gold=y_gold[idx],
    y_predicted=y_predicted[idx],
)

# Export of this frame is currently disabled:
# name='Train'+train_lang+'Test'+test_lang+'.csv'
# df.to_csv(name)

# Classification report of that best epoch.
print(score_all[idx])

In [None]:
# Single hand-written example: a tweet and the brand it should be judged for.
sent="Still having issues (Screen Flickering) Very #Bad Exp #Pune #Xiaomi @XiaomiIndia"
rep="XiaomiIndia"


# Encode the tweet and the brand as a text pair, padded to 40 positions.
encoded_dict = tokenizer.encode_plus(
                        sent,rep,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 40,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

input_id=(encoded_dict['input_ids'])

attention_mask=(encoded_dict['attention_mask'])

input_id1 = input_id.to(device)
attention_mask1 = attention_mask.to(device)

# Set inference mode explicitly instead of relying on whatever mode an earlier
# cell left the model in, and skip autograd because nothing is trained here.
model.eval()
with torch.no_grad():
    outputs = model(input_id1, attention_mask=attention_mask1)
# No labels were passed, so outputs[0] holds the logits; print the index of
# the largest one.
print(outputs[0].argmax())

In [None]:
# Parallel result lists: raw text, brand key, predicted class (as str), doc id.
a,b,cx,t=[],[],[],[]
import pandas as pd
df = pd.read_json (r'../input/iiit-json/evaluation_data (1).json')

c=0
# Inference only: set eval mode explicitly rather than relying on the mode an
# earlier cell left the model in.
model.eval()
for i,j,k in zip(df.doc_id,df.raw_text,df.company_extractions):
    # Only documents whose id starts with 'a' and that carry at least one extraction.
    if(i[0]=='a'and len(k)!=0):
        print(len(k))
        for item in k:
            print(i,item,k[item],"\nLOOK->",k[item][0][0],k[item][0][1][0],"\n",type(k[item]))

            # Only the first entry of each extraction is looked at. `sent` is
            # kept for inspection; the text actually fed to the model is `rep`.
            sent=k[item][0][0]
            rep=k[item][0][1][0]
            t.append(i)
            a.append(j)
            b.append(item)
            # Single-text encoding (no brand/aspect as second segment).
            encoded_dict = tokenizer.encode_plus(
                                    rep,
                                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                    max_length = 370,           # Pad & truncate all sentences.
                                    pad_to_max_length = True,
                                    return_attention_mask = True,   # Construct attn. masks.
                                    return_tensors = 'pt',     # Return pytorch tensors.
                               )

            input_id=(encoded_dict['input_ids'])

            attention_mask=(encoded_dict['attention_mask'])

            input_id1 = input_id.to(device)
            attention_mask1 = attention_mask.to(device)
            # No gradients are needed for prediction.
            with torch.no_grad():
                outputs = model(input_id1, attention_mask=attention_mask1)

            # Index of the largest logit, stored as a string.
            cx.append(str(int(outputs[0].argmax())))

In [None]:
# Sanity check: the four parallel result lists should have the same length.
print(len(a),len(b),len(cx),len(t))
# Assemble the per-extraction predictions and write them to disk.
output = pd.DataFrame().assign(Text_ID=t, Text=a, Brand=b, Sentiment=cx)
output.to_csv('output2_no_contex.csv')

In [None]:
# Per-sentence result lists: raw text, brand key, predicted class (as str), doc id.
a,b,cx,t=[],[],[],[]
import pandas as pd
df = pd.read_json (r'../input/iiit-json/evaluation_data (1).json')
# Per-(document, brand) result lists: doc id, brand key, list of predicted classes.
al=[]
bl=[]
xl=[]
c=0
# Inference only: set eval mode explicitly rather than relying on the mode an
# earlier cell left the model in.
model.eval()
for i,j,k in zip(df.doc_id,df.raw_text,df.company_extractions):
    # Only documents whose id starts with 'a' and that carry at least one extraction.
    if(i[0]=='a'and len(k)!=0):
        for item in k:
            # Predicted classes of every text listed under this brand key.
            ll=[]
            for x in k[item]:
                # x[1] is iterated as the collection of texts to classify.
                for v in x[1]:
                    sent=v
                    t.append(i)
                    a.append(j)
                    b.append(item)
                    # Single-text encoding (no brand/aspect as second segment).
                    encoded_dict = tokenizer.encode_plus(
                                            sent,
                                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                            max_length = 370,           # Pad & truncate all sentences.
                                            pad_to_max_length = True,
                                            return_attention_mask = True,   # Construct attn. masks.
                                            return_tensors = 'pt',     # Return pytorch tensors.
                                       )

                    input_id=(encoded_dict['input_ids'])

                    attention_mask=(encoded_dict['attention_mask'])

                    input_id1 = input_id.to(device)
                    attention_mask1 = attention_mask.to(device)
                    # No gradients are needed for prediction.
                    with torch.no_grad():
                        outputs = model(input_id1, attention_mask=attention_mask1)

                    # Index of the largest logit, computed once and stored twice.
                    predicted = int(outputs[0].argmax())
                    cx.append(str(predicted))
                    ll.append(predicted)
            print(i,item,ll)
            al.append(i)
            bl.append(item)
            xl.append(ll)
            print("next item")

In [None]:
# One row per (document, brand) pair; Sentiment holds the list of predicted
# classes for that pair. The frame is written to disk.
output = pd.DataFrame().assign(Text_ID=al, Brand=bl, Sentiment=xl)
output.to_csv('article_output2.csv')

In [None]:
# NOTE(review): this cell repeats the earlier per-sentence inference cell,
# including the 'a' document-id filter — confirm whether a different filter
# was intended here.
# Per-sentence result lists: raw text, brand key, predicted class (as str), doc id.
a,b,cx,t=[],[],[],[]
import pandas as pd
df = pd.read_json (r'../input/iiit-json/evaluation_data (1).json')
# Per-(document, brand) result lists: doc id, brand key, list of predicted classes.
al=[]
bl=[]
xl=[]
c=0
# Inference only: set eval mode explicitly rather than relying on the mode an
# earlier cell left the model in.
model.eval()
for i,j,k in zip(df.doc_id,df.raw_text,df.company_extractions):
    # Only documents whose id starts with 'a' and that carry at least one extraction.
    if(i[0]=='a'and len(k)!=0):
        for item in k:
            # Predicted classes of every text listed under this brand key.
            ll=[]
            for x in k[item]:
                # x[1] is iterated as the collection of texts to classify.
                for v in x[1]:
                    sent=v
                    t.append(i)
                    a.append(j)
                    b.append(item)
                    # Single-text encoding (no brand/aspect as second segment).
                    encoded_dict = tokenizer.encode_plus(
                                            sent,
                                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                            max_length = 370,           # Pad & truncate all sentences.
                                            pad_to_max_length = True,
                                            return_attention_mask = True,   # Construct attn. masks.
                                            return_tensors = 'pt',     # Return pytorch tensors.
                                       )

                    input_id=(encoded_dict['input_ids'])

                    attention_mask=(encoded_dict['attention_mask'])

                    input_id1 = input_id.to(device)
                    attention_mask1 = attention_mask.to(device)
                    # No gradients are needed for prediction.
                    with torch.no_grad():
                        outputs = model(input_id1, attention_mask=attention_mask1)

                    # Index of the largest logit, computed once and stored twice.
                    predicted = int(outputs[0].argmax())
                    cx.append(str(predicted))
                    ll.append(predicted)
            print(i,item,ll)
            al.append(i)
            bl.append(item)
            xl.append(ll)
            print("next item")

In [None]:
# Load the per-text flag predictions.
data_mobt = pd.read_csv("../input/iiit-predictions/eval_preds_mob_tech.csv")
data_mobt.head()

dse = []
# Keep the ids of all texts whose Flag equals 1.
xcd = [
    text_id
    for text_id, flag in zip(data_mobt.Text_ID, data_mobt.Flag)
    if flag == 1
]
print(len(xcd))

In [None]:
# One row per (document, brand) pair; Sentiment holds the list of predicted
# classes for that pair. The frame is written to disk.
output = pd.DataFrame().assign(Text_ID=al, Brand=bl, Sentiment=xl)
output.to_csv('tweet_raw_output2.csv')

In [None]:
def _overall_sentiment(labels, negative=0, positive=2):
    """Collapse a list of per-sentence class indices into one verdict.

    labels:   list of predicted class indices.
    negative: index counted as a negative vote (default 0).
    positive: index counted as a positive vote (default 2).

    Returns "Negative", "Positive" or "Neutral":
    - with at least one negative vote the result is "Negative" unless the
      positive votes outnumber the negative ones by two or more;
    - without negative votes, any positive vote gives "Positive";
    - otherwise "Neutral".
    """
    negatives = labels.count(negative)
    positives = labels.count(positive)
    if negatives:
        return "Negative" if negatives + 1 >= positives else "Positive"
    if positives:
        return "Positive"
    return "Neutral"


# Set copy of the flagged ids: membership is tested once per row below.
flagged_ids = set(xcd)

vv,vvv,vvvv=[],[],[]   # text id, brand, aggregated sentiment
prf=[]                 # flag column; every kept row gets the value 1
for ii,jj,kk in zip(output.Text_ID,output.Brand,output.Sentiment):
    # Rows whose text id is not flagged are skipped.
    if ii in flagged_ids:
        px=_overall_sentiment(kk)
        print(ii,jj,px)
        vv.append(ii)
        vvv.append(jj)
        vvvv.append(px)
        prf.append(1)
dede=pd.DataFrame()
dede["Text_ID"]=vv
dede["Mobile_Tech_Flag_Predicted"]=prf
dede["Brands_Entity_Identified"]=vvv
dede["Sentiment_Identified"]=vvvv
dede.to_csv('output2_articles.csv')

