In [1]:
import pandas as pd
import numpy as np
import string
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
import torch
from transformers import BertTokenizer, BertForSequenceClassification


In [2]:
# Load the dataset
data = pd.read_csv('../Oriserve- Intern Data Scientist Assignment/Evaluation-dataset.csv', header=None)

sentiment_array = ['Col_'+ str(i) for i in range(1,15)]

new_col = ['Sentence'] + sentiment_array

data.columns = new_col

data.columns

Index(['Sentence', 'Col_1', 'Col_2', 'Col_3', 'Col_4', 'Col_5', 'Col_6',
       'Col_7', 'Col_8', 'Col_9', 'Col_10', 'Col_11', 'Col_12', 'Col_13',
       'Col_14'],
      dtype='object')

In [3]:
# Count frequencies of sentiment labels
sent = {}
for col in data.columns:
    if col != 'Sentence':
        z = data[col].dropna()
        for label in z:
            sent[label] = sent.get(label, 0) + 1

In [4]:
# Select sentiment labels with frequency greater than 30
list_1 = [key for key, value in sent.items() if value > 30]

In [5]:
# Modify data to include only selected sentiment labels
def dataModification(data, selected_labels):
    data1 = data.iloc[:, 1:]
    data2 = pd.DataFrame(index=range(0, data1.shape[0]), columns=selected_labels)
    for i, row in data1.iterrows():
        subthemes = row.dropna().tolist()
        for label in subthemes:
            if label in selected_labels:
                data2.loc[i][label] = 1
    data2 = data2.fillna(0)
    final_data = pd.DataFrame()
    final_data['Sentence'] = data['Sentence']
    for column in data2.columns:
        final_data[column] = data2[column]
    return final_data

In [6]:
final_data = dataModification(data, list_1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data2.loc[i][label] = 1
  data2 = data2.fillna(0)


In [7]:
# Text preprocessing
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

In [8]:
final_data['Sent_Processed'] = final_data['Sentence'].apply(preprocess_text)

In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(final_data['Sent_Processed'], final_data[list_1], test_size=0.2)# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(final_data['Sent_Processed'], final_data[list_1], test_size=0.2)

In [10]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(list_1))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Tokenize input text
def tokenize_text(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    return inputs

In [12]:
X_train_tokenized = X_train.apply(tokenize_text)
X_test_tokenized = X_test.apply(tokenize_text)

In [13]:
import numpy as np

# Filter out non-numeric values from y_train
y_train_numeric = np.array([float(label) for label in y_train if isinstance(label, str) and label.replace('.', '').isdigit()])

# Train the BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()

for epoch in range(3):  # Train for 3 epochs, you can adjust this
    model.train()
    for inputs, label in zip(X_train_tokenized, y_train_numeric):  # Use y_train_numeric
        inputs = {key: torch.tensor(val).to(device) for key, val in inputs.items()}
        label = torch.tensor(label, dtype=torch.float).unsqueeze(0).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, label)
        loss.backward()
        optimizer.step()


In [14]:
# Evaluate the BERT model
model.eval()
y_pred_probs = []
for inputs in X_test_tokenized:
    inputs = {key: torch.tensor(val).to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    y_pred_probs.append(outputs.logits.cpu().numpy())

y_pred_probs = np.array(y_pred_probs)
y_pred_class = (y_pred_probs > 0.5).astype(int)


  inputs = {key: torch.tensor(val).to(device) for key, val in inputs.items()}


In [17]:
type(y_pred_class)

numpy.ndarray

In [18]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Convert y_pred_probs to a single NumPy array
y_pred_probs = np.vstack(y_pred_probs)

# If necessary, convert probabilities to binary predictions (assuming a threshold of 0.5)
y_pred_class = (y_pred_probs > 0.5).astype(int)

# Ensure y_pred_class has the same shape as y_test.values
if y_pred_class.shape != y_test.values.shape:
    raise ValueError("Shape mismatch: y_pred_class and y_test.values must have the same shape.")

# Evaluate performance
ac = accuracy_score(y_test.values, y_pred_class)
f1 = f1_score(y_test.values, y_pred_class, average='micro')
print('Accuracy Score:', ac)
print('F1-Score:', f1)
print(classification_report(y_test.values, y_pred_class))


Accuracy Score: 0.006413418845584608
F1-Score: 0.007655502392344498
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       406
           1       0.00      0.00      0.00       930
           2       0.00      0.00      0.00       254
           3       0.00      0.00      0.00       233
           4       0.00      0.00      0.00       130
           5       0.00      0.00      0.00        31
           6       0.00      0.00      0.00        84
           7       0.00      0.00      0.00        88
           8       0.00      0.00      0.00        21
           9       0.00      0.00      0.00        93
          10       0.00      0.00      0.00        53
          11       0.00      0.00      0.00        27
          12       0.00      0.00      0.00        23
          13       0.00      0.00      0.00        22
          14       0.00      0.00      0.00        53
          15       0.00      0.00      0.00        33
          16 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
