# **STAGE 1**....LOADING THE MULTI-DOMAIN BERT ASPECT EXTRACTION MODEL

**LOADING OUR ORIGINAL(INITIAL) MD-BERT_ASPECT_EXTRACTION(model_ATE) FOR FINE TUNING WITH FEATURES**

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
Ins

In [2]:
from transformers import BertModel
import torch

from torch.utils.data import DataLoader, ConcatDataset, Dataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import time
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
class dataset_ATM(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df  #dataframe to hold the data to be used as the dataset
        self.tokenizer = tokenizer #tokenizer object for input text tokenization

    def __getitem__(self, idx):
        tokens, tags, pols = self.df.iloc[idx, :3].values

        tokens = tokens.replace("'", "").strip("][").split(', ')
        tags = tags.strip('][').split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for i in range(len(tokens)):
            t = self.tokenizer.tokenize(tokens[i])
            bert_tokens += t
            bert_tags += [int(tags[i])]*len(t)
            bert_pols += [int(pols[i])]*len(t)

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        tags_tensor = torch.tensor(bert_tags)
        pols_tensor = torch.tensor(bert_pols)

        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        return len(self.df)

In [4]:
class bert_ATE(torch.nn.Module):
    def __init__(self, pretrain_model):
        super(bert_ATE, self).__init__()
        self.bert = BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        outputs = self.bert(input_ids=ids_tensors, attention_mask=masks_tensors)

        # Access the last hidden state
        bert_outputs = outputs.last_hidden_state
        linear_outputs = self.linear(bert_outputs)
        linear_outputs = self.linear(bert_outputs)
        print("Type of bert output:", type(bert_outputs))

        if tags_tensors is not None:
            tags_tensors = tags_tensors.view(-1)
            linear_outputs = linear_outputs.view(-1,3)
            loss = self.loss_fn(linear_outputs, tags_tensors)
            return loss
        else:
            return linear_outputs

In [5]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)
lr = 2e-5
model_ATE = bert_ATE(pretrain_model_name).to(DEVICE)
optimizer_ATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
def evl_time(t):
    min, sec= divmod(t, 60)
    hr, min = divmod(min, 60)
    return int(hr), int(min), int(sec)

def load_model(model, path):
    model.load_state_dict(torch.load(path), strict=False)
    return model

def save_model(model, name):
    torch.save(model.state_dict(), name)

In [7]:
laptops_train_ds = dataset_ATM(pd.read_csv("data/laptops_train.csv"), tokenizer)
laptops_test_ds = dataset_ATM(pd.read_csv("data/laptops_test.csv"), tokenizer)
restaurants_train_ds = dataset_ATM(pd.read_csv("data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = dataset_ATM(pd.read_csv("data/restaurants_test.csv"), tokenizer)
twitter_train_ds = dataset_ATM(pd.read_csv("data/twitter_train.csv"), tokenizer)
twitter_test_ds = dataset_ATM(pd.read_csv("data/twitter_test.csv"), tokenizer)

In [8]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

In [9]:
def create_mini_batch(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    tags_tensors = [s[2] for s in samples]
    tags_tensors = pad_sequence(tags_tensors, batch_first=True)

    pols_tensors = [s[3] for s in samples]
    pols_tensors = pad_sequence(pols_tensors, batch_first=True)

    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)

    datas = [ids_tensors, tags_tensors, masks_tensors]
    print("getting data type....",[ type(data) for data in datas])

    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [10]:
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle = True)

In [11]:
def train_model_ATE(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0

        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)
            print("getting loadetr type", type(ids_tensors), type(tags_tensors), type(masks_tensors))

            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)

            losses.append(loss.item())
            loss.backward()

In [12]:
def train_model_ATE(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0

        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)
            print("getting loadetr type", type(ids_tensors), type(tags_tensors), type(masks_tensors))

            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)

            losses.append(loss.item())
            loss.backward()
            optimizer_ATE.step()
            optimizer_ATE.zero_grad()
            print("type of loss and losses", loss, losses, type(loss), type(losses))

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            # print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)

import joblib
joblib.dump(model_ATE, "model_joblib")
["model_joblib"]

['model_joblib']

In [13]:
def test_model_ATE(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)

            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred

In [14]:
model_ATE=joblib.load("model_joblib")

**CONVERTING OUR PYTORCH MODEL TO TENSORFLOW FRAMEWORK**

In [15]:
import torch
import tensorflow as tf
from transformers import BertModel, TFBertModel, BertTokenizer
import joblib

# Load a pre-trained PyTorch BERT model
model_ATE = BertModel.from_pretrained("bert-base-uncased")

# Convert the PyTorch model to a TensorFlow model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tensorflow_model = TFBertModel(model_ATE.config)

# Load the PyTorch model's weights into the TensorFlow model
state_dict = model_ATE.state_dict()
for tf_layer, (name, pytorch_tensor) in zip(tensorflow_model.trainable_variables, state_dict.items()):
    tf_layer.assign(pytorch_tensor.numpy())

# Save the TensorFlow model's weights and architecture separately
model_weights = tensorflow_model.get_weights()
model_config = tensorflow_model.get_config()

# Save the converted TensorFlow model's weights using joblib
joblib.dump(model_weights, "Tensorflow_converted_model_weights.pkl")

# Save the converted TensorFlow model's architecture using joblib
joblib.dump(model_config, "Tensorflow_converted_model_config.pkl")

# Now you can load the converted model as a TensorFlow model using joblib
loaded_model_weights = joblib.load("Tensorflow_converted_model_weights.pkl")
loaded_model_config = joblib.load("Tensorflow_converted_model_config.pkl")

# Reconstruct the TensorFlow model
loaded_model = TFBertModel.from_config(loaded_model_config)
loaded_model.set_weights(loaded_model_weights)


**LOADING THE CONVERTED TENSORFLOW MODEL(model_ATE)**

In [16]:
import joblib
from transformers import TFBertModel
import tensorflow as tf

# Load the saved model weights and architecture using joblib
loaded_model_weights = joblib.load("Tensorflow_converted_model_weights.pkl")
loaded_model_config = joblib.load("Tensorflow_converted_model_config.pkl")

# Create a new instance of the TensorFlow model using the loaded architecture
loaded_model = TFBertModel.from_config(loaded_model_config)

# Set the loaded model weights on the new TensorFlow model
loaded_model.set_weights(loaded_model_weights)

# Build the model or apply it to a batch of data before printing the summary
# You can use a dummy input to build the model
dummy_input = tf.constant([1], shape=(1, 1))  # Create a dummy input batch
_ = loaded_model(dummy_input)  # Apply the model to the dummy input

# Print the model summary
loaded_model.summary()


Model: "tf_bert_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109482240 (417.64 MB)
Trainable params: 109482240 (417.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# **STAGE 2**....FINE TUNING OUR MODEL USING SINGLE DOMAIN

***IMPORTING NECCESARY MODULES ***

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

**DATA PREPROCESSING...(DATA LOADING, DATA SPLITTING, AND REMOVAL OF UNWANTED COLUMNS)**

**RESTAURANT_REVIEWS**

In [18]:
df_R =pd.read_csv("data/Restaurant reviews.csv")
df_R

Unnamed: 0,Restaurant,Reviewer,Review_restaurant,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,
...,...,...,...,...,...,...,...,...
9995,Chinese Pavilion,Abhishek Mahajan,Madhumathi Mahajan Well to start with nice cou...,3,"53 Reviews , 54 Followers",6/5/2016 0:08,0,
9996,Chinese Pavilion,Sharad Agrawal,This place has never disappointed us.. The foo...,4.5,"2 Reviews , 53 Followers",6/4/2016 22:01,0,
9997,Chinese Pavilion,Ramandeep,"Bad rating is mainly because of ""Chicken Bone ...",1.5,"65 Reviews , 423 Followers",6/3/2016 10:37,3,
9998,Chinese Pavilion,Nayana Shanbhag,I personally love and prefer Chinese Food. Had...,4,"13 Reviews , 144 Followers",5/31/2016 17:22,0,


In [19]:
df_rest=df_R.drop(["Reviewer","Restaurant","Rating","Metadata","Time","Pictures","7514"],axis=1)
print(df_rest)

                                      Review_restaurant
0     The ambience was good, food was quite good . h...
1     Ambience is too good for a pleasant evening. S...
2     A must try.. great food great ambience. Thnx f...
3     Soumen das and Arun was a great guy. Only beca...
4     Food is good.we ordered Kodi drumsticks and ba...
...                                                 ...
9995  Madhumathi Mahajan Well to start with nice cou...
9996  This place has never disappointed us.. The foo...
9997  Bad rating is mainly because of "Chicken Bone ...
9998  I personally love and prefer Chinese Food. Had...
9999  Checked in here to try some delicious chinese ...

[10000 rows x 1 columns]


In [20]:
rest_train,rest_test,=train_test_split(df_rest,test_size=1-(500/10000),random_state=42)
rest_train

Unnamed: 0,Review_restaurant
7186,We went here to celebrate a Friend’s birthday ...
9822,"Ordered Gobi paratha and aloo paratha,\nGettin..."
4492,Just now had a buffet lunch and right now in r...
1365,Good ambience\nNice outdoor seating\nGood danc...
9526,Pathetic quality of food!\nWaste of money and ...
...,...
5734,very less quantity.
5191,good
5390,Chicken laila biryani and Spicy chicken table ...
860,They have this amazing IPL offer\n600 for 4 pi...


In [21]:
rest_train.isna().any().any()
rest_train.dropna(inplace=True)
rest_train.isna().any().any()

False

In [22]:
import re
def clean_text(text):
    # Removing special characters and punctuation
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Remove extra whitespaces
    text = ' '.join(text.split())
    return text

rest_train['cleaned_concated_500rows'] = rest_train['Review_restaurant'].apply(clean_text)
rest_train

Unnamed: 0,Review_restaurant,cleaned_concated_500rows
7186,We went here to celebrate a Friend’s birthday ...,We went here to celebrate a Friends birthday d...
9822,"Ordered Gobi paratha and aloo paratha,\nGettin...",Ordered Gobi paratha and aloo paratha Getting ...
4492,Just now had a buffet lunch and right now in r...,Just now had a buffet lunch and right now in r...
1365,Good ambience\nNice outdoor seating\nGood danc...,Good ambience Nice outdoor seating Good dancef...
9526,Pathetic quality of food!\nWaste of money and ...,Pathetic quality of food Waste of money and th...
...,...,...
5734,very less quantity.,very less quantity
5191,good,good
5390,Chicken laila biryani and Spicy chicken table ...,Chicken laila biryani and Spicy chicken table ...
860,They have this amazing IPL offer\n600 for 4 pi...,They have this amazing IPL offer 600 for 4 pin...


\**USING "EN_CORE_WEB_SM" FROM SPACY LIBRARY FOR PART OF SPEECH TAGGING OF MY MD-AE_DATA**

In [23]:
import spacy
import pandas as pd
# Download and install the 'en_core_web_sm' model
!python -m spacy download en_core_web_sm
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')


def custom_tokenizer(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

# Assuming your DataFrame is named 'twitter_train'
rest_train['Tokens'] = rest_train['cleaned_concated_500rows'].apply(custom_tokenizer)
rest_train['POS_Tags'] =rest_train["cleaned_concated_500rows"].apply(lambda x: [(token.text, token.pos_) for token in nlp(x)])


2023-11-16 08:23:04.732655: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 08:23:04.732719: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 08:23:04.732761: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

**EXTRACTING THE FEATURES(NOUNS AND NOUN PHRASE)FOR EACH TWEET IN OUR MULTI-DOMAIN DATASET**

In [24]:
from collections import Counter
vocab = set()
vocab_size= len(vocab)
co_occurence_matrix=np.zeros((vocab_size, vocab_size))
#function to identify nouns and noun phrases based on POS tags
def extract_nouns(tokens, pos_tags):
    nouns = [tokens[i] for i in range(len(tokens)) if pos_tags[i] == 'NOUN']  # Assuming 'NOUN' represents nouns
    return nouns

# Initialized vocabulary for nouns and noun phrases
vocab = set()
# A list to store nouns and noun phrases for each tweet
nouns_per_tweet = []

for tweet in rest_train['cleaned_concated_500rows']:
    tokens = tweet.split()
    #part-of-speech tagging on the tokens using spaCy
    doc = nlp(' '.join(tokens))
    #nouns and noun phrases extraction from the tweet
    pos_tags = [token.pos_ for token in doc]
    nouns = extract_nouns(tokens, pos_tags)
    # populating the nouns_per_tweet variable with unique nouns
    nouns_per_tweet.append(nouns)
    # Updating the vocabulary
    vocab.update(nouns)

In [25]:
nouns_per_tweet

[['birthday',
  'dinner',
  'staffs',
  'service',
  'interiors',
  'ambience',
  'food',
  'food',
  'preparation',
  'time',
  'bit',
  'place',
  'office',
  'colleagues',
  'friends',
  'family'],
 ['paratha',
  'aloo',
  'paratha',
  'parathas',
  'items',
  'people',
  'surprise',
  'parathasbiggermore',
  'quality',
  'ones',
  'packaging',
  'add',
  'ons',
  'love',
  'parathas'],
 ['buffet',
  'lunch',
  'restaurant',
  'review',
  'option',
  'vegetarians',
  'starters',
  'mins',
  'course',
  'quality'],
 ['ambience',
  'outdoor',
  'dancefloor',
  'menu',
  'customer',
  'service',
  'chilling'],
 ['quality', 'food', 'money', 'disaster', 'dough', 'lumps'],
 ['branches',
  'branch',
  'place',
  'weekends',
  'friends',
  'family',
  'food',
  'times',
  'time',
  'food',
  'food',
  'budget'],
 ['1045pm',
  'lot',
  'place',
  'rotis',
  'tandoori',
  'papad',
  'Even',
  'when',
  'operating',
  'for',
  'Not',
  'ever'],
 ['price',
  'quantity',
  'taste',
  'Request',


In [26]:
vocab

{'cover',
 'saviour',
 'going',
 'After',
 'hint',
 'RESTAURANT',
 'on',
 'waitingzomato',
 'milk',
 'guidance',
 'spouses',
 'So',
 'played',
 'nights',
 'Need',
 'from',
 'pao',
 'paper',
 'manno1',
 'how',
 'pub',
 'addons',
 'reviews',
 'chilling',
 'discount',
 'goodfood',
 'system',
 'experience',
 'hangout',
 'weekendso',
 'out',
 'jelabi',
 'rare',
 'trend',
 'olio',
 'waffles',
 'daughter',
 'point',
 'denied',
 'curries',
 'CEO',
 'license',
 'the',
 'sensation',
 'dressing',
 'rajma',
 'sauces',
 'song',
 'barbeque',
 'goods',
 'roof',
 'la',
 'views',
 'juice',
 'soups',
 'cuisine',
 'Repeat',
 'that',
 'mark',
 'day',
 'tikka',
 'wowness',
 'things',
 'screen',
 'palak',
 'oyster',
 'curd',
 'converted',
 'occupied',
 'care',
 'making',
 'bone',
 'gachiwaliwe',
 'lettuce',
 'speed',
 'FROM',
 'flavour',
 'chat',
 'onlyitem',
 'amaazing',
 'Fish',
 'drink',
 'package',
 'mutton',
 'valet',
 'sitting',
 'coffee',
 'shakes',
 'triumph',
 'pickle',
 'spillproof',
 'biryanis',


**CO-OCCURENCE MATRIX FOR THE FEATURES(NOUNS AND NOUNPHRASE)**

In [27]:
co_occurrence_matrix = [[0 for _ in vocab] for _ in vocab]
#iterating through to update the co_occurence matrix
for nouns_in_tweet in nouns_per_tweet:
    for i, noun1 in enumerate(nouns_in_tweet):
        for j, noun2 in enumerate(nouns_in_tweet):
            if i != j:
                # Increasing the co-occurrence count for the pair (noun1, noun2)
                co_occurrence_matrix[list(vocab).index(noun1)][list(vocab).index(noun2)] += 1

# Converting  the co-occurrence matrix to a DataFrame for better visualization
import pandas as pd
co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=list(vocab), columns=list(vocab))

print(co_occurrence_df)



          cover  saviour  going  After  hint  RESTAURANT  on  waitingzomato  \
cover         0        0      0      0     0           0   0              0   
saviour       0        2      0      0     0           0   0              0   
going         0        0      0      0     0           0   0              0   
After         0        0      0      0     0           0   0              0   
hint          0        0      0      0     0           0   0              0   
...         ...      ...    ...    ...   ...         ...  ..            ...   
neck          0        2      0      0     0           0   0              0   
Ambiance      0        0      0      0     0           0   0              0   
well          0        0      0      0     0           0   0              0   
cultnery      0        0      0      0     0           0   0              0   
cider         0        0      0      0     0           0   0              0   

          milk  guidance  ...  pop  Meat  watering 

**FEATURE WEIGHTING**

In [28]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
co_occurrence_matrix = co_occurrence_df.to_numpy()
co_occurrence_matrix_transposed = co_occurrence_df.T
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(co_occurrence_matrix_transposed)
tfidf_matrix = tfidf_matrix.toarray()

In [29]:
tfidf_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.17077676, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

**FEATURE EXTRACTION (TOP FEATURE)**

In [30]:
# Calculating the sum of TF-IDF values for each feature (term)
feature_scores = np.sum(tfidf_matrix, axis=1)

# Getting the indices that would sort the features by their scores in descending order
sorted_indices = np.argsort(feature_scores, axis=0)[::-1]

# Defining the number of top features you want to select
top_n = 20

# Selecting the top-n features
top_features_indices = sorted_indices[:top_n]

# Getting the corresponding feature names or terms from your vocabulary
top_features = [list(vocab)[i] for i in top_features_indices]


In [31]:
top_features

['place',
 'food',
 'experience',
 'service',
 'ambience',
 'taste',
 'restaurant',
 'chicken',
 'dishes',
 'starters',
 'rice',
 'time',
 'friends',
 'lunch',
 'menu',
 'staff',
 'people',
 'course',
 'items',
 'and']

**BINARY FEATURE REPRESENTATION**

In [85]:
dataset = rest_train['cleaned_concated_500rows'].tolist()

# Initializing binary feature matrix
rest_binary_features = np.zeros((len(dataset), top_n), dtype=int)

for i, example in enumerate(dataset):
    tokens = example.split()

    for j, feature in enumerate(top_features):
        if feature in tokens:
            rest_binary_features[i, j] = 1


In [107]:
rest_binary_features

[[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
 [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
 [1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1],
 [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [108]:
rest_text_data = rest_train["cleaned_concated_500rows"].tolist()
rest_binary_features = [list(map(int, item)) for item in rest_binary_features]
print(len(rest_text_data))


497


In [109]:
import pandas as pd

# Define a list of aspects you want to identify
aspects=["Ambience", "Service", "Food", "Cleanliness", "Speed of service", "Chicken Afgani", "Chicken 65 Biryani", "North Indian Thali", "South Indian Thali", "Chinese food", "Pizza and pasta", "Salami pineapple salad", "Blueberry cheese cake", "Chinese restaurant", "Paneer Makhani", "Mutton sheekh kebab", "Dance floor", "Drinks", "Music", "Delivery", "Milkshakes", "Biryani", "Burger", "Shakes", "Fries", "Thai chicken rice", "Dimsums", "Chicken dumplings", "Mushroom starter", "Pizza", "Ala carte lunch", "Service quality", "Pricing", "Breakfast", "Wings", "Customer support", "Gelato and avalanche", "Pancake breakfast combo", "Waiters", "Parking", "Cakes", "Tofu keema parantha", "Sunday Brunch", "Sporty ambience", "KitKat milkshake", "Dance floor and DJ", "Mocktails", "Games", "Casual night with family", "Waiters' behavior", "Packaging", "Customer experience", "Buffet", "Coffee and tea", "North Indian cuisine", "Music", "Chocoholics", "Veg Manchuria", "Limited spread", "Ice cream and shakes", "Chicken biryani", "Cheese instead of chicken", "Store outlet", "Asian, Indian, and continental food", "The Chocolate Room", "Veg and non-veg sides", "Spiciness", "Spiciness and masala", "Wing quality", "Buffet spread", "Ice cream and shakes", "Dance floor", "Mocktails", "Games", "Casual night with family", "Waiters' behavior", "Packaging", "Customer experience", "Buffet", "Coffee and tea", "North Indian cuisine", "Music", "Chocoholics", "Veg Manchuria", "Limited spread", "Ice cream and shakes", "Chicken biryani", "Cheese instead of chicken", "Store outlet", "Asian, Indian, and continental food", "The Chocolate Room", "Veg and non-veg sides", "Spiciness", "Spiciness and masala", "Wing quality", "Buffet spread", "Ice cream and shakes", "Dance floor", "Mocktails", "Games", "Casual night with family", "Waiters' behavior", "Packaging", "Customer experience", "Buffet", "Coffee and tea", "North Indian cuisine", "Music", "Chocoholics", "Veg Manchuria", "Limited spread", "Ice cream and shakes", "Chicken biryani", "Cheese instead of chicken", "Store outlet", "Asian, Indian, and continental food", "The Chocolate Room", "Veg and non-veg sides", "Spiciness", "Spiciness", "Ambience, food, and service", "Food and packing", "Service", "Wings and fries", "Burger and fried rice", "Egg fried rice", "Evening ambience", "Biryani", "Buffet and service", "Club ambience", "Bar Exchange", "Momos", "Biryani and food quality", "Cockroach in the food", "Spiciness", "Restaurant cleanliness", "Brew and hangout","service", "quality", "staff", "experience", "food", "ambiance", "location", "menu", "prices", "cleanliness", "atmosphere", "decor", "presentation", "taste", "variety", "reservation", "waiting time", "portion size", "drinks", "desserts", "specials", "recommendations", "noise level", "seating", "parking", "payment options", "allergies", "COVID-19 precautions", "online ordering", "delivery", "takeout", "reviews", "ratings", "loyalty program", "gift cards", "special events", "catering", "wifi", "kids-friendly", "pet-friendly", "vegetarian options", "vegan options", "gluten-free options", "ethnic cuisine", "local ingredients", "customer service", "waitstaff", "chef", "management", "hygiene", "health and safety measures","Shawarma", "Cost", "Fries", "Burger", "Wings","Curd and Raita", "Lemons", "Ambience", "Service", "Communication with Server", "Food Presentation", "Food Taste", "Seating Arrangement", "Service Staff", "Biryani Taste", "Food Quantity", "Food Quality", "Food Packaging", "Beverage Quality", "Ambience", "Hygiene", "Food Variety", "Service Speed", "Non-Vegetarian Options", "Buffet Options", "Ice Cream Range", "Packing Quality", "Cockroach in Meal", "Music", "Decor", "Soup Taste", "Pizza Taste", "Gachibowli Fried Chicken", "Grilled Chicken Breast", "Water Quality", "Food Blandness", "Food Smell","Food quality", "Food preparation time", "Delivery time", "Service", "Price", "Tandoori Family Platter", "Apollo Fish", "Jumbo Pack Biryani", "Ambience", "Desserts and coffee", "Taste", "Staff service", "Variety of food", "Ambiance", "Music", "Hospitality", "Tandoori Momos", "Gulaabi Chaap", "Buffet", "Food quality during Ramzan", "Ice Cream", "Donuts", "Music", "Outdoor seating", "Entry fee and cover charge", "DJ", "Mosquitos", "Chaat", "Biryani", "Budget-friendly", "Service","Food quality", "Choices", "Toppings", "Venue", "Space", "Corporate parties", "Dance floor", "Music", "Bass", "Staff"
]


# Create a dictionary to map aspects to integers
aspect_to_int = {aspect: idx for idx, aspect in enumerate(aspects)}

# Create a list to store aspect labels for each row
aspect_labels_list = []

# Function to find and tag aspects in a given text
def tag_aspects(text):
    text = text.lower()  # Convert to lowercase for case-insensitive matching
    labels = []
    for aspect in aspects:
        if aspect.lower() in text:
            labels.append(aspect_to_int[aspect])
    return labels

# Iterate through each row in the dataset and tag aspects
for index, row in rest_train.iterrows():
    text = row['cleaned_concated_500rows']  # Adjust the column name to match your dataset
    aspect_labels = tag_aspects(text)
    aspect_labels_list.append(aspect_labels)

# Print the aspect labels for each row
# for i, labels in enumerate(aspect_labels_list):
#     print(f"Row {i + 1} Aspects: {labels}")

# Now you can use the aspect_labels_list in your initial code


In [110]:
len(aspect_labels_list )

497

In [111]:
import numpy as np
from sklearn.model_selection import train_test_split

# Assuming you have your text_data, binary_features, and aspect_labels_list defined

# Create an array of indices to shuffle the data
indices = np.arange(len(rest_text_data))

# Randomly shuffle the indices
np.random.shuffle(indices)

# Determine the number of samples for training and validation
num_train_samples = int(0.7 * len(rest_text_data))
num_val_samples = len(rest_text_data) - num_train_samples

# Split the indices into training and validation sets
train_indices = indices[:num_train_samples]
val_indices = indices[num_train_samples:]

# Use the selected indices to split the data
rest_text_data_train = [rest_text_data[i] for i in train_indices]
rest_text_data_val = [rest_text_data[i] for i in val_indices]
rest_binary_features_train = [rest_binary_features[i] for i in train_indices]
rest_binary_features_val = [rest_binary_features[i] for i in val_indices]
rest_aspect_labels_train = [aspect_labels_list[i] for i in train_indices]
rest_aspect_labels_val = [aspect_labels_list[i] for i in val_indices]

# Now, text_data_train and aspect_labels_train will have the same number of samples as text_data_val and aspect_labels_val.




In [112]:
import numpy as np

# Determine the maximum length of your sequences from text_data_train
max_sequence_length = max(len(seq) for seq in rest_text_data_train)

# Initialize lists to store padded sequences
rest_padded_text_data_train = []
rest_padded_binary_features_train = []
rest_padded_aspect_labels_train = []

# Pad sequences to the maximum length
for text_seq, binary_seq, aspect_seq in zip(rest_text_data_train, rest_binary_features_train, rest_aspect_labels_train):
    # Ensure the sequences are iterable (e.g., lists, strings)
    text_seq = list(text_seq)
    binary_seq = list(binary_seq)
    aspect_seq = list(aspect_seq)

    text_padding = [0] * (max_sequence_length - len(text_seq))
    binary_padding = [0] * (max_sequence_length - len(binary_seq))
    aspect_padding = [0] * (max_sequence_length - len(aspect_seq))

    padded_text_seq = text_seq + text_padding
    padded_binary_seq = binary_seq + binary_padding
    padded_aspect_seq = aspect_seq + aspect_padding

    rest_padded_text_data_train.append(padded_text_seq)
    rest_padded_binary_features_train.append(padded_binary_seq)
    rest_padded_aspect_labels_train.append(padded_aspect_seq)

# Convert the lists to numpy arrays
# padded_text_data_train = np.array(padded_text_data_train)
rest_padded_binary_features_train = np.array(rest_padded_binary_features_train, dtype=np.float32)
rest_padded_aspect_labels_train = np.array(rest_padded_aspect_labels_train, dtype=np.float32)



# Determine the maximum length of your sequences
max_sequence_length = max(len(seq) for seq in rest_text_data_val)

# Initialize lists to store padded sequences
rest_padded_text_data_val = []
rest_padded_binary_features_val = []
rest_padded_aspect_labels_val = []

# Pad sequences to the maximum length
for text_seq, binary_seq, aspect_seq in zip(rest_text_data_val, rest_binary_features_val, rest_aspect_labels_val):
    # Ensure the sequences are iterable (e.g., lists, strings)
    text_seq = list(text_seq)
    binary_seq = list(binary_seq)
    aspect_seq = list(aspect_seq)

    text_padding = [0] * (max_sequence_length - len(text_seq))
    binary_padding = [0] * (max_sequence_length - len(binary_seq))
    aspect_padding = [0] * (max_sequence_length - len(aspect_seq))

    padded_text_seq = text_seq + text_padding
    padded_binary_seq = binary_seq + binary_padding
    padded_aspect_seq = aspect_seq + aspect_padding

    rest_padded_text_data_val.append(padded_text_seq)
    rest_padded_binary_features_val.append(padded_binary_seq)
    rest_padded_aspect_labels_val.append(padded_aspect_seq)

rest_padded_binary_features_val = np.array(rest_padded_binary_features_val, dtype=np.float32)
rest_padded_aspect_labels_val = np.array(rest_padded_aspect_labels_val, dtype=np.float32)

In [115]:
type(rest_padded_text_data_val)

list

In [114]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

max_sequence_length = 32  # Reduce the maximum sequence length
binary_feature_size = 347  # Replace with the actual size of your binary features
batch_size = 347 # Further reduce the batch size

# Assuming you have the text_data as a list
# Tokenize and pad your text_data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
rest_text_data = tokenizer(rest_text_data, padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors='tf')
rest_text_data_train = tokenizer(rest_text_data_train, padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors='tf')

# Enable mixed precision
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

from tensorflow.keras.optimizers import Adam
# Define a custom learning rate
custom_learning_rate = 0.001

# Create an optimizer with the custom learning rate
custom_optimizer = Adam(learning_rate=custom_learning_rate)

# Load a smaller pre-trained BERT model
model_ATE = TFBertModel.from_pretrained("bert-base-uncased")

# Freeze the weights of the pre-trained BERT model to keep its learned representations
for layer in model_ATE.layers:
    layer.trainable = False

# Create a new model that combines text and binary features
text_input = Input(shape=(max_sequence_length,), dtype=tf.int32)
binary_features_input = Input(shape=(binary_feature_size,), dtype=tf.float32)  # Assuming binary features are floats

binary_features_input_normalized = BatchNormalization()(binary_features_input)

text_features = model_ATE(rest_text_data_train)  # Reuse the text feature extraction part of BERT

# Assuming text_features is a dictionary with 'last_hidden_state' key
text_features = text_features['last_hidden_state']

# Flatten the text_features (if needed)
text_features = tf.keras.layers.Flatten()(text_features)

text_features = Dropout(0.6)(text_features)

combined = tf.keras.layers.concatenate([text_features, binary_features_input])

num_classes = 1

aspect_extraction_output = Dense(num_classes, activation='sigmoid')(combined)

finetuned_model = tf.keras.Model(inputs=[text_input, binary_features_input], outputs=aspect_extraction_output)

# Compile the new model
finetuned_model.compile(optimizer=custom_optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

aspect_labels_list
# # Convert data to the appropriate format
rest_text_data_train = rest_text_data_train['input_ids'].numpy()
print(len(rest_text_data_train))
print(len(rest_text_data_train))
rest_padded_binary_features_train = np.array(rest_padded_binary_features_train, dtype=np.float32)
rest_padded_aspect_labels_train = np.array(rest_padded_aspect_labels_train, dtype=np.float32)

rest_padded_binary_features_val = np.array(rest_padded_binary_features_val, dtype=np.float32)
rest_padded_aspect_labels_val = np.array(rest_padded_aspect_labels_val, dtype=np.float32)

# Reshape binary features to match the expected shape
rest_padded_binary_features_train = rest_padded_binary_features_train[:, :binary_feature_size]
rest_padded_binary_features_val = rest_padded_binary_features_val[:, :binary_feature_size]
# Assuming your labels are currently of shape (None, 2705)
# Reshape the labels to have shape (batch_size, 1)
rest_padded_aspect_labels_train = rest_padded_aspect_labels_train.reshape(-1, 1)
rest_padded_aspect_labels_val = rest_padded_aspect_labels_val.reshape(-1, 1)



# Train the new model with text and binary features
finetuned_model.fit([rest_text_data_train[:347], rest_padded_binary_features_train[:347]],rest_padded_aspect_labels_train[:347], epochs=5, batch_size=batch_size,shuffle=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

347
347
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f33139f5120>

In [117]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize and pad your validation text_data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Assuming rest_text_data_val is a list of strings
rest_text_data_val = [str(text) for text in rest_text_data_val]

# Pad sequences before tokenizing
max_sequence_length = 150
padded_sequences = pad_sequences(tokenizer(rest_text_data_val, truncation=True, padding='max_length', max_length=max_sequence_length, return_tensors='tf')['input_ids'], maxlen=max_sequence_length, padding='post', truncating='post', value=0)
print(type(padded_sequences))
# Now you can use it in the prediction
# Now you can use it in the prediction
predictions = finetuned_model.predict([padded_sequences[:, :32], rest_padded_binary_features_val[:150]])


# Threshold predictions to convert to binary values (0 or 1)
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

# Flatten the labels and predictions if needed
flat_labels = np.array(rest_padded_aspect_labels_val).flatten()
flat_predictions = binary_predictions.flatten()

# Calculate evaluation metrics
precision = precision_score(flat_labels, flat_predictions)
recall = recall_score(flat_labels, flat_predictions)
f1 = f1_score(flat_labels, flat_predictions)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



<class 'numpy.ndarray'>


InvalidArgumentError: ignored

In [None]:
len(input_ids_array)

3

In [None]:
len(rest_padded_binary_features_val)

150

**SAMSUNG_ELECTRONICS REVIEWS**

In [None]:
df_S = pd.read_csv("data/samsung_electronics.csv")
df_S

In [None]:
df_elect=df_S.drop(["Publisher","DateTime","Link","Category"],axis=1)
df_elect

In [None]:
elect_train,elect_test,=train_test_split(df_elect,test_size=1 - (150 / 51302), random_state=42)

In [None]:
len(elect_train)

150

**STARBUCK REVIEW**

In [None]:
df_star =pd.read_csv("data/starbuck_review.csv")
df_star

In [None]:
df_starb=df_star.drop(["name","location","Date","Rating","Image_Links"],axis=1)
df_starb

In [None]:
star_train,star_test,=train_test_split(df_starb,test_size=1 - (200 / 850), random_state=42)

In [None]:
len(star_train)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have validation data (text_data_val and binary_features_val)
# and validation labels (aspect_labels_val)

# Make predictions on the validation data
predictions = new_model.predict([rest_text_data_val, rest_binary_features_val])

# Convert predictions to binary labels (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)

# Calculate precision, recall, and F1 score
precision = precision_score(aspect_labels_val, binary_predictions)
recall = recall_score(aspect_labels_val, binary_predictions)
f1 = f1_score(aspect_labels_val, binary_predictions)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



ValueError: ignored

In [None]:
new_model.compile(optimizer=custom_optimizer, loss=tf.keras.BinaryCrossentropy(from_logits=True), metrics=[tf.keras.metrics.BinaryAccuracy()])


In [None]:
text_input.shape

TensorShape([None, 32])

In [None]:
binary_features_input.shape

TensorShape([None, 32])