In [18]:
import pandas as pd

# Read the raw conversation dataset; the path is relative to this notebook's folder.
df = pd.read_csv('../../data/data.csv')

# Number of rows in the raw frame, before any cleaning.
df.shape[0]

188378

In [2]:
# Column names of the raw frame as a plain Python list.
list(df.columns)

['conversation_id', 'message', 'sentiment']

In [3]:
# Count missing entries per column (isna is the same check as isnull).
null_values = df.isna().sum()
null_values

conversation_id    0
message            5
sentiment          0
dtype: int64

## **Dropping NaN rows below** ##

- Rows whose `message` is NULL cannot be assigned a sentiment, so they are dropped.

In [4]:
# Keep only the rows in which at least one column is missing, for inspection.
rows_with_nulls = df.loc[df.isna().any(axis='columns')]
rows_with_nulls


Unnamed: 0,conversation_id,message,sentiment
20124,923,,Disgusted
28552,1306,,Happy
31504,1442,,Surprised
83361,3815,,Curious to dive deeper
95303,4361,,Happy


In [5]:
# Discard every row that has a missing value in any column, then re-count the
# nulls per column to confirm that none are left.
df = df.dropna(axis=0, how='any')
df.isna().sum()

conversation_id    0
message            0
sentiment          0
dtype: int64

In [6]:
# Class balance: number of messages per sentiment label, most frequent first.
sentiments = df.loc[:, 'sentiment'].value_counts()
sentiments

sentiment
Curious to dive deeper    80887
Neutral                   41367
Surprised                 30637
Happy                     29615
Sad                        2533
Disgusted                  1432
Fearful                    1026
Angry                       876
Name: count, dtype: int64

In [7]:
# Distinct sentiment labels, listed in the order they first appear in the data.
output_classes = df['sentiment'].drop_duplicates().tolist()
output_classes

['Curious to dive deeper',
 'Happy',
 'Neutral',
 'Surprised',
 'Disgusted',
 'Sad',
 'Fearful',
 'Angry']

## **Transformers**

### **RoBERTa**

In [8]:
# Message text of the row labelled 0 (a label-based lookup, not a positional one).
df.loc[0, 'message']

'Are you a fan of Google or Microsoft?'

In [9]:
# Annotated sentiment of the row labelled 0.
df.loc[0, 'sentiment']

'Curious to dive deeper'

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the checkpoint's tokenizer and classification head through the generic Auto* classes.
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")

# Wrap both in a pipeline and try it on the message of the row labelled 0.
classifier = pipeline(task="sentiment-analysis", tokenizer=tokenizer, model=model)
res = classifier(df.loc[0, 'message'])

print(res)

  from .autonotebook import tqdm as notebook_tqdm


[{'label': 'curiosity', 'score': 0.667454719543457}]


In [11]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

labels_mapping = model.config.id2label

# Explicit pairing of model label name -> dataset sentiment.
# Previously the pairing was produced by zipping a hard-coded index list with
# df['sentiment'].unique(), which is only correct while the sentiments happen to
# first appear in the data in exactly that order. Spelling the pairs out (and
# resolving the ids through the model config) removes that dependency.
emotion_to_sentiment = {
    'curiosity': 'Curious to dive deeper',
    'joy': 'Happy',
    'neutral': 'Neutral',
    'surprise': 'Surprised',
    'disgust': 'Disgusted',
    'sadness': 'Sad',
    'fear': 'Fearful',
    'anger': 'Angry',
}

# Model output indices of the selected labels, and the dataset classes in the
# same order, so that position i in one list corresponds to position i in the other.
target_idx = [model.config.label2id[emotion] for emotion in emotion_to_sentiment]
output_classes = list(emotion_to_sentiment.values())

target_mapping = dict(zip(target_idx, output_classes))

# Fail loudly if the dataset contains a sentiment the mapping does not cover.
assert set(df['sentiment']) <= set(output_classes), "dataset has sentiments missing from emotion_to_sentiment"


In [12]:
# Model label ids, listed in the same order as `output_classes`.
target_idx = [7, 17, 27, 26, 11, 25, 14, 2]

# Pair each id with the dataset sentiment at the same position and show the result.
target_mapping = dict(zip(target_idx, output_classes))
for label_id, sentiment_name in target_mapping.items():
    print(f'{label_id}: {sentiment_name}')


7: Curious to dive deeper
17: Happy
27: Neutral
26: Surprised
11: Disgusted
25: Sad
14: Fearful
2: Angry


In [13]:
# import torch

# def predict(text):
#     inputs = tokenizer(text, return_tensors="pt")
#     outputs = model(**inputs)
#     logits = outputs.logits
#     filtered_logits = logits[:, target_idx]
#     probabilities = torch.softmax(filtered_logits, dim=1)
#     predicted_class_idx = torch.argmax(filtered_logits, dim=1).item()
#     return output_classes[predicted_class_idx], probabilities


# text = "I am unhappy"
# predicted_label, probabilities = predict(text)[0], predict(text)[1]
# print(f"Predicted Label: {predicted_label}")

# for sentiment, probability in zip(output_classes, probabilities.tolist()[0]):
#     print(f"{sentiment}: {probability:.4f}")

In [14]:
# Annotated sentiment of the row labelled 0, kept for comparison with a prediction.
ground_truth = df.loc[0, 'sentiment']

In [15]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only: make sure dropout is disabled


def predict(text):
    """Return the dataset sentiment predicted for a single message.

    The model scores all of its own labels; only the logits at `target_idx`
    are compared, and the winner is translated to the dataset class at the
    same position in `output_classes`.

    Args:
        text: One message string (the result is reduced with `.item()`, so a
            batch of several texts is not supported here).

    Returns:
        The entry of `output_classes` whose paired model logit is largest.
    """
    # Truncate so that very long messages cannot exceed the model's input limit
    # (512 tokens for RoBERTa-base) and raise during the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    filtered_logits = logits[:, target_idx]
    predicted_class_idx = torch.argmax(filtered_logits, dim=1).item()
    return output_classes[predicted_class_idx]


text = "I am unhappy"
predicted_label = predict(text)
print(f"Predicted Label: {predicted_label}")

Predicted Label: Sad


In [16]:
# Re-display the class list; predict() indexes into it by position.
output_classes

['Curious to dive deeper',
 'Happy',
 'Neutral',
 'Surprised',
 'Disgusted',
 'Sad',
 'Fearful',
 'Angry']

In [19]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Inference only: use a GPU when one is available and disable dropout.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

df = pd.read_csv('../../data/data.csv')
df = df.dropna()

labels_mapping = model.config.id2label

# Explicit pairing of model label name -> dataset sentiment. Resolving the ids
# through the model config (instead of zipping a hard-coded index list with
# df['sentiment'].unique()) keeps the pairing correct regardless of row order.
emotion_to_sentiment = {
    'curiosity': 'Curious to dive deeper',
    'joy': 'Happy',
    'neutral': 'Neutral',
    'surprise': 'Surprised',
    'disgust': 'Disgusted',
    'sadness': 'Sad',
    'fear': 'Fearful',
    'anger': 'Angry',
}
target_idx = [model.config.label2id[emotion] for emotion in emotion_to_sentiment]
output_classes = list(emotion_to_sentiment.values())
target_mapping = dict(zip(target_idx, output_classes))

# Fail loudly if the dataset contains a sentiment the mapping does not cover.
assert set(df['sentiment']) <= set(output_classes), "dataset has sentiments missing from emotion_to_sentiment"

BATCH_SIZE = 64    # messages per forward pass; lower it if memory runs out
MAX_TOKENS = 512   # input limit used for truncation (RoBERTa-base accepts 512 tokens)


def predict_batch(texts):
    """Return the predicted dataset sentiment for each string in `texts`.

    Only the model logits at `target_idx` are compared; the winning position
    is translated to the class at the same position in `output_classes`.

    Args:
        texts: An iterable of message strings.

    Returns:
        A list of sentiment names, one per input text, in input order.
    """
    # Pad to the longest message of the batch and truncate over-long ones so a
    # whole batch can go through the model as one tensor.
    inputs = tokenizer(
        list(texts),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_TOKENS,
    ).to(device)

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    best_positions = logits[:, target_idx].argmax(dim=1).tolist()
    return [output_classes[position] for position in best_positions]


# Function to perform inference and map to custom classes
def predict(text):
    """Return the predicted dataset sentiment for a single message string."""
    return predict_batch([text])[0]


def predict_all(texts, batch_size=BATCH_SIZE):
    """Classify every string in `texts`, `batch_size` at a time, preserving order."""
    predictions = []
    for start in range(0, len(texts), batch_size):
        predictions.extend(predict_batch(texts[start:start + batch_size]))
    return predictions


# Classify every message in batches; one forward pass per row was too slow to
# finish on the full dataset.
df['predicted_sentiment'] = predict_all(df['message'].astype(str).tolist())

# Calculate the confusion matrix
y_true = df['sentiment']
y_pred = df['predicted_sentiment']

# Generate the confusion matrix (rows/columns follow the order of output_classes)
cm = confusion_matrix(y_true, y_pred, labels=output_classes)

# Plot the confusion matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=output_classes, yticklabels=output_classes)
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

# Print the classification report.
# `labels` must be passed together with `target_names`: without it the report
# uses the alphabetically sorted labels, and the names (which are in
# output_classes order) would be attached to the wrong rows.
print(classification_report(
    y_true,
    y_pred,
    labels=output_classes,
    target_names=output_classes,
    zero_division=0,
))


KeyboardInterrupt: 