<a href="https://colab.research.google.com/github/RobyRoshna/Insensitive-Lang-Detection/blob/Augmentation/Copy_of_Augmented.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **1. Imports** <a name="imports"></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import login
import wandb
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, set_seed
import pandas as pd
import random
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import pickle
!pip install optuna

In [None]:
# Reproducibility: seed every random number generator used in this notebook with 42.
# Note: random_state for the data splits still has to be set separately (42 throughout this study).
SEED = 42
for seed_everything in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed, set_seed):
    seed_everything(SEED)

### Custom dataset class and metrics function




In [None]:
# Custom Dataset Class for Tokenized Data
class SentenceDataset(torch.utils.data.Dataset):
    """Pairs tokenizer output with labels so samples can be fetched by index."""

    def __init__(self, encodings, labels):
        """
        Stores the tokenized inputs and their labels.

        Args:
            encodings: Mapping of field name (e.g. 'input_ids') to an indexable
                collection holding one entry per sample.
            labels: Indexable collection with one label per sample.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Returns the total number of samples in the dataset.

        Returns:
            int: The number of samples in the dataset.
        """
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """
        Returns `value` as an independent PyTorch tensor.

        Values that are already tensors are cloned and detached; calling
        torch.tensor() on a tensor would emit a copy-construct UserWarning on
        every single item access.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """
        Retrieves the tokenized inputs and the corresponding label for the given index.

        Args:
            idx: Index of the data sample.

        Returns:
            A dictionary containing the tokenized inputs (input IDs, attention masks, etc.)
            and the label for the specified index.
        """
        # Convert tokenized data for the index to PyTorch tensors
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # The label is stored under the key 'labels'
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

# Metric callback passed to the Trainer
def compute_metrics(pred):
    """
    Scores a batch of predictions against the true labels.

    Args:
        pred: Object that unpacks into (predictions, labels); the predicted
            class is taken as the argmax of `predictions` along axis 1.

    Returns:
        dict: 'accuracy', 'precision', 'recall' and 'f1', where the last three
        use binary averaging.
    """
    scores_per_class, gold = pred
    predicted = scores_per_class.argmax(axis=1)
    metrics = {'accuracy': accuracy_score(gold, predicted)}
    prf_and_support = precision_recall_fscore_support(gold, predicted, average='binary')
    # zip stops after the three names, so the trailing support value is dropped
    metrics.update(zip(('precision', 'recall', 'f1'), prf_and_support))
    return metrics

## Pure GPT model evaluated on the unaugmented (original) test set

In [None]:
import pandas as pd
import torch
import wandb
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the BERT model saved under ChatGPTbert_base_model on the original test set.
test_dataexp = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/test_dataOriginal.csv")

# Load the saved model
model_path = "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTbert_base_model"
modelAug = BertForSequenceClassification.from_pretrained(model_path)
modelAug.eval()  # switch the model to evaluation mode before inference

print("\nLoaded Model Path:", model_path)

# Close any run left open by an earlier cell before starting this one
wandb.finish()
wandb.init(project="Insensitive Lang Detection", entity="Roshna", name="Bert_baseOripredGPT")

# The tokenizer was saved next to the model, so load it from the same directory
tokenizer = BertTokenizer.from_pretrained(model_path)
inputs = tokenizer(list(test_dataexp['Sentence']), padding=True, truncation=True, return_tensors="pt")

# Single forward pass over the whole test set; no gradients are needed for evaluation
with torch.no_grad():
    outputs = modelAug(**inputs)
    confidences = torch.softmax(outputs.logits, dim=1)

test_predictions = outputs.logits.argmax(dim=1).cpu().numpy()
# NOTE(review): the comparisons below assume 'Manual_Annotation' is already encoded as 0/1 - confirm
test_true_labels = test_dataexp['Manual_Annotation'].values

accuracy = accuracy_score(test_true_labels, test_predictions)
class_report = classification_report(test_true_labels, test_predictions, output_dict=True)
# Pin the label order so the matrix is always 2x2, even when a class is absent from
# both the labels and the predictions; otherwise the labelled DataFrame below fails.
conf_matrix = confusion_matrix(test_true_labels, test_predictions, labels=[0, 1])

# Extract Precision, Recall, and F1-score - macro average gives equal importance to both classes in binary classification
precision = class_report["macro avg"]["precision"]
recall = class_report["macro avg"]["recall"]
f1 = class_report["macro avg"]["f1-score"]

wandb.log({
    "test_accuracy": accuracy,
    "test_precision": precision,
    "test_recall": recall,
    "test_f1": f1
})

print("\nTest Set Evaluation")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix (rows = actual class, columns = predicted class)
conf_matrix_df = pd.DataFrame(conf_matrix,
                              columns=["Predicted NotInsensitive", "Predicted Insensitive"],
                              index=["Actual NotInsensitive", "Actual Insensitive"])

print("\nConfusion Matrix - Test Set:")
print(conf_matrix_df)

wandb.log({"confusion_matrix": wandb.Table(dataframe=conf_matrix_df)})

# Analyze false positives & false negatives (copies, so columns can be added safely later)
false_positives = test_dataexp[(test_true_labels == 0) & (test_predictions == 1)].copy()
false_negatives = test_dataexp[(test_true_labels == 1) & (test_predictions == 0)].copy()

print("\nFalse Positives:")
print(false_positives[['Sentence', 'Manual_Annotation']])

print("\nFalse Negatives:")
print(false_negatives[['Sentence', 'Manual_Annotation']])

# Confidence scores: softmax probability of class 1 (positive) and class 0 (negative)
test_dataexp['Confidence_Positive'] = confidences[:, 1].cpu().numpy()
test_dataexp['Confidence_Negative'] = confidences[:, 0].cpu().numpy()

# Assign confidence values to misclassified examples
false_positives['Confidence_Positive'] = test_dataexp.loc[false_positives.index, 'Confidence_Positive'].values
false_negatives['Confidence_Negative'] = test_dataexp.loc[false_negatives.index, 'Confidence_Negative'].values

print("\nFalse Positives with Confidence:")
print(false_positives[['Sentence', 'Manual_Annotation', 'Confidence_Positive']])

print("\nFalse Negatives with Confidence:")
print(false_negatives[['Sentence', 'Manual_Annotation', 'Confidence_Negative']])

# Heatmap of the confusion matrix, saved next to the notebook
plt.figure(figsize=(4, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=["NotInsensitive", "Insensitive"],
            yticklabels=["NotInsensitive", "Insensitive"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("OriginalTestset.png")
plt.show()

# Finish WandB logging
wandb.finish()


## **2. Pure generated Data** <a name="gendata"></a>

In [None]:

# Load the ChatGPT-generated sentences used for augmentation
file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_generated.xlsx'
data = pd.read_excel(file_path)

# Keep only the columns needed for modelling and drop rows with missing values
data = data[['Sentence', 'Label','Term']]
data = data.dropna()

# Encode the target: 1 for insensitive (case-insensitive match), 0 for everything else
data['Label'] = data['Label'].map(lambda raw_label: 1 if raw_label.lower() == 'insensitive' else 0)

# Stratified 80/10/10 split: hold out 20% first, then halve it into validation and test
train_data, temp_data = train_test_split(
    data,
    test_size=0.2,
    stratify=data['Label'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['Label'],
    random_state=42,
)

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")

# Persist each split so later cells can reload exactly the same rows
split_targets = (
    (train_data, "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_data.csv"),
    (val_data, "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTval_data.csv"),
    (test_data, "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtest_data.csv"),
)
for split_frame, csv_path in split_targets:
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Reload the saved splits from Drive
train_data = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_data.csv")
val_data = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTval_data.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtest_data.csv")

# Report how many rows repeat an earlier (Sentence, Label) pair within each split
for split_name, split_frame in (("train", train_data), ("validation", val_data), ("test", test_data)):
    duplicate_rows = split_frame.duplicated(subset=['Sentence', 'Label']).sum()
    print(f"Duplicates in {split_name} data: {duplicate_rows}")


### Tokenizer

In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer used to encode the sentences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Batch-tokenize the 'Sentence' column of a data split
def tokenize_data(data, tokenizer, max_length=109):
    """
    Tokenizes every entry of data['Sentence'] in a single tokenizer call.

    Args:
        data: Object whose 'Sentence' entry is an iterable of sentences.
        tokenizer: Callable that accepts a list of sentences plus the keyword
            options set below.
        max_length: Maximum token length; longer sentences are truncated.

    Returns:
        Whatever the tokenizer returns for the batch (PyTorch tensors are requested).
    """
    sentences = list(data['Sentence'])
    tokenizer_options = {
        'padding': True,
        'truncation': True,          # cut sentences longer than max_length
        'max_length': max_length,
        'return_tensors': 'pt',      # ask for PyTorch tensors
    }
    return tokenizer(sentences, **tokenizer_options)

# Labels of each split as plain Python lists
train_labelsChatGPT, val_labelsChatGPT, test_labelsChatGPT = (
    list(split['Label']) for split in (train_data, val_data, test_data)
)

# Tokenize each split with the shared tokenizer
train_encodingsChatGPT, val_encodingsChatGPT, test_encodingsChatGPT = (
    tokenize_data(split, tokenizer) for split in (train_data, val_data, test_data)
)

# Save the tokenized training data (input ids, attention mask) and training labels as .npy files
train_arrays = (
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_encodings_input_ids.npy",
     train_encodingsChatGPT['input_ids'].numpy()),
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_encodings_attention_mask.npy",
     train_encodingsChatGPT['attention_mask'].numpy()),
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_labels.npy",
     np.array(train_labelsChatGPT)),
)
for npy_path, array in train_arrays:
    np.save(npy_path, array)


In [None]:

# Wrap the encodings and labels of each split in a SentenceDataset
train_datasetChatGPT = SentenceDataset(train_encodingsChatGPT, train_labelsChatGPT)
val_datasetChatGPT = SentenceDataset(val_encodingsChatGPT, val_labelsChatGPT)
test_datasetChatGPT = SentenceDataset(test_encodingsChatGPT, test_labelsChatGPT)

# Pickle the three datasets to Drive
pickle_targets = (
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtrain_dataset.pkl", train_datasetChatGPT),
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTval_dataset.pkl", val_datasetChatGPT),
    ("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTtest_dataset.pkl", test_datasetChatGPT),
)
for pickle_path, dataset in pickle_targets:
    with open(pickle_path, "wb") as f:
        pickle.dump(dataset, f)


In [None]:
# Print the first five training samples as a sanity check
sample_fields = (
    ("Input IDs:", 'input_ids'),
    ("Attention Mask:", 'attention_mask'),
    ("Label:", 'labels'),  # 0 for Not Insensitive, 1 for Insensitive
)
for sample_idx in range(5):
    sample = train_datasetChatGPT[sample_idx]
    for caption, field in sample_fields:
        print(caption, sample[field])


### Model Training

In [None]:
# Load pre-trained 'bert-base-uncased' with a sequence-classification head.
# num_labels=2 configures it for the binary insensitive / notInsensitive task.
modelBbaseChatGPT = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Show the configuration of the loaded model
print(modelBbaseChatGPT.config)


In [None]:

# Close any W&B run left open by an earlier cell, then start the run for this training job
wandb.finish()

# NOTE(review): the project is spelled "Detecton" here but "Detection" in the cell that
# evaluates on the original test set - confirm which W&B project is intended.
wandb.init(project="Insensitive Lang Detecton", entity="Roshna", name="Bert_baseChatGPT")

# Training configuration: 3 epochs, batch size 16, evaluate and checkpoint once per epoch
training_options = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    report_to=["wandb"],          # WandB is used for logging
    run_name="Bert_baseChatGPT",  # the run name for this Trainer
)
training_args = TrainingArguments(**training_options)

# Trainer: trains on the training split and evaluates on the validation split
trainer_options = dict(
    model=modelBbaseChatGPT,
    args=training_args,
    train_dataset=train_datasetChatGPT,
    eval_dataset=val_datasetChatGPT,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Fine-tune the model
trainer.train()

# Score the test split; metric names are prefixed with "test"
results = trainer.evaluate(test_datasetChatGPT, metric_key_prefix="test")
wandb.log(results)

wandb.finish()  # Close the evaluation session


In [None]:
# Save the trained model and tokenizer into the same Drive directory, so that both
# can later be loaded from this single path.
save_directory = "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTbert_base_model"
trainer.save_model(save_directory)  # Save model
tokenizer.save_pretrained(save_directory)  # Save tokenizer

### Model Results

In [None]:
from sklearn.metrics import classification_report

wandb.init(project="Insensitive Lang Detecton", entity="Roshna", name="Bert_baseGPT")

# Run the trained model over the test split. `test_results` was previously used without
# ever being defined: trainer.evaluate() in the training cell only returns a metrics dict,
# so the raw predictions and label ids have to come from trainer.predict().
test_results = trainer.predict(test_datasetChatGPT)

# Extract predictions and true labels
test_predictions = test_results.predictions.argmax(axis=1)  # Get predicted classes
test_true_labels = test_results.label_ids

# Evaluate the predictions using classification_report
print("Classification Report on Test Data:")
print(classification_report(test_true_labels, test_predictions, target_names=["notInsensitive", "Insensitive"]))

# Save predictions along with the test data (predictions are in the same row order as test_data)
test_data['Predicted_Label'] = test_predictions
test_data.to_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetest_predictions.csv", index=False)
print("Predictions saved to '/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetest_predictions.csv'")

# Get predictions on the training data using the same method
train_results = trainer.predict(train_datasetChatGPT)

# Extract training predictions and labels
train_predictions = train_results.predictions.argmax(axis=1)
train_true_labels = train_results.label_ids

# Evaluate the predictions on the training data
print("Classification Report on Training Data:")
print(classification_report(train_true_labels, train_predictions, target_names=["notInsensitive", "Insensitive"]))

# Save training predictions
train_data['Predicted_Label'] = train_predictions
train_data.to_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetrain_predictions.csv", index=False)
print("Training predictions saved to '/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetrain_predictions.csv'")


In [None]:
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the saved test/train predictions together with their true labels
test_data = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetest_predictions.csv")
true_labels = test_data['Label']
predicted_labels = test_data['Predicted_Label']
ChatGPT_train_data = pd.read_csv("/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_basetrain_predictions.csv")
ChatGPT_train_true_labels = ChatGPT_train_data['Label']
ChatGPT_train_predicted_labels = ChatGPT_train_data['Predicted_Label']

# **1. Compare Train, Validation, and Test Metrics**
train_metrics = {
    "accuracy": accuracy_score(ChatGPT_train_true_labels, ChatGPT_train_predicted_labels),
    "classification_report": classification_report(
        ChatGPT_train_true_labels, ChatGPT_train_predicted_labels, target_names=["NotInsensitive", "Insensitive"]
    )}

# Validation metrics are hard-coded here, not recomputed in this cell
val_metrics = {"accuracy": 0.97674, "f1": 0.97717, "precision": 0.96396, "recall": 0.99074}
test_metrics = {
    "accuracy": accuracy_score(true_labels, predicted_labels),
    "classification_report": classification_report(true_labels, predicted_labels, target_names=["NotInsensitive", "Insensitive"]),
}


# Training Metrics
train_df = pd.DataFrame({
    "Metric": ["Accuracy"],
    "Value": [train_metrics["accuracy"]],
})
print("\nTraining Metrics:")
print(train_df)
print("\nClassification Report:\n", train_metrics["classification_report"])

# Validation Metrics
val_df = pd.DataFrame(val_metrics.items(), columns=["Metric", "Value"])
print("\nValidation Metrics:")
print(val_df)

# Test Metrics
test_df = pd.DataFrame({
    "Metric": ["Accuracy"],
    "Value": [test_metrics["accuracy"]],
})
print("\nTest Metrics:")
print(test_df)
print("\nClassification Report:\n", test_metrics["classification_report"])


# Per-epoch losses are hard-coded here rather than read from the trainer
epochs = [1, 2, 3]
train_losses = [0.212700, 0.050800, 0.007500]  # Training losses for epochs 1, 2, and 3 check training results above
val_losses = [0.064114, 0.074954, 0.106145]

plt.plot(epochs, train_losses, label="Train Loss", marker='o')
plt.plot(epochs, val_losses, label="Validation Loss", marker='o')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss Trends")
plt.legend()
plt.savefig("PureGenloss_trends.png")
plt.show()

# Confusion Matrix - labels are pinned so the matrix is always 2x2 and matches the tick labels
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=["NotInsensitive", "Insensitive"], yticklabels=["NotInsensitive", "Insensitive"])
plt.title("Confusion Matrix - Test Set")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("PureGenconfusion_matrix.png")
plt.show()

# Analyze false positives and false negatives.
# .copy() makes these independent frames, so adding the confidence columns further down
# does not write into a slice of test_data (which triggers SettingWithCopyWarning).
false_positives = test_data[(true_labels == 0) & (predicted_labels == 1)].copy()
false_negatives = test_data[(true_labels == 1) & (predicted_labels == 0)].copy()

print("False Positives:")
print(false_positives[['Sentence', 'Term']])

print("\nFalse Negatives:")
print(false_negatives[['Sentence', 'Term']])


# Reload the saved model to obtain class probabilities for the test sentences
model_path = "/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPTbert_base_model"
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)
inputs = tokenizer(list(test_data['Sentence']), padding=True, truncation=True, return_tensors="pt")

# model outputs
with torch.no_grad():
    outputs = model(**inputs)
    confidences = torch.softmax(outputs.logits, dim=1)  # Get probabilities


# Softmax probability of class 1 (positive) and class 0 (negative) for every test sentence
test_data['Confidence_Positive'] = confidences[:, 1].cpu().numpy()
test_data['Confidence_Negative'] = confidences[:, 0].cpu().numpy()

# Attach the matching confidence to each misclassified row (aligned on the row index)
false_positives['Confidence_Positive'] = test_data.loc[false_positives.index, 'Confidence_Positive']
false_negatives['Confidence_Negative'] = test_data.loc[false_negatives.index, 'Confidence_Negative']

print("False Positives with Confidence:")
print(false_positives[['Sentence', 'Term', 'Confidence_Positive']])

print("\nFalse Negatives with Confidence:")
print(false_negatives[['Sentence', 'Term', 'Confidence_Negative']])


## **3. Augmented Data Original + GPT** <a name="augdata"></a>

### Augmentation Process - removing test set before augmentation

In [None]:
import pandas as pd

# Input workbooks and the output location
file1 = "/content/drive/MyDrive/Honours MiscData(Roshna)/deduplicated_annotations.xlsx"
file2 = "/content/drive/MyDrive/Honours MiscData(Roshna)/test_dataOriginal.xlsx"
output_file1 = "/content/drive/MyDrive/Honours MiscData(Roshna)/AbstractswithoutTestset.xlsx"

df1, df2 = pd.read_excel(file1), pd.read_excel(file2)

# Both workbooks must provide a 'Sentence' column
if not ('Sentence' in df1.columns and 'Sentence' in df2.columns):
    raise ValueError("One of the files does not contain a 'Sentence' column.")

# Compare sentences as strings in both frames
for frame in (df1, df2):
    frame['Sentence'] = frame['Sentence'].astype(str)

# True for every File1 row whose sentence also occurs in File2
in_test_set = df1['Sentence'].isin(df2['Sentence'])
common_sentences = df1['Sentence'][in_test_set]

# Keep only the File1 rows that do not occur in File2
df1_no_common = df1[~in_test_set]
removed_count = len(common_sentences)

# Write the filtered annotations to a new workbook
df1_no_common.to_excel(output_file1, index=False)

# Confirmation and number of removed sentences
print(f"New file with sentences removed from File1 saved to: {output_file1}")
print(f"Number of sentences removed from File1: {removed_count}")

# Preview of the filtered data
print("\nFirst 5 rows of File1 without common sentences:")
print(df1_no_common.head())


### V3 augmented

In [None]:
import pandas as pd


# Build the augmented dataset: for every term and label, take up to `required_count`
# original sentences and top the rest up with ChatGPT-generated sentences.
original_file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/AbstractswithoutTestset.xlsx'
generated_file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/ChatGPT_generated.xlsx'

original_data = pd.read_excel(original_file_path)
generated_data = pd.read_excel(generated_file_path)

# Give the original annotations the same column names as the generated data
original_data = original_data.rename(columns={"Matched_Terms": "Term", "Manual_Annotation": "Label"})

# 1 for insensitive (case-insensitive match), 0 for everything else
original_data['Label'] = original_data['Label'].apply(lambda x: 1 if x.lower() == "insensitive" else 0)
generated_data['Label'] = generated_data['Label'].apply(lambda x: 1 if x.lower() == "insensitive" else 0)

# Record where each sentence came from
original_data['Source'] = 'Original'
generated_data['Source'] = 'ChatGPT'

# Ensure generated data has all required columns
if 'Prompt' not in generated_data.columns:
    generated_data['Prompt'] = None

# Define the required number of sentences per term and label
required_count = 25


def _sample_up_to(pool, n):
    """Return at most `n` rows of `pool`, sampled reproducibly (random_state=42).

    The request is clamped to the pool size: DataFrame.sample raises ValueError when
    asked for more rows than are available without replacement.
    """
    return pool.sample(n=max(0, min(n, len(pool))), random_state=42)


def _select_for_label(original_pool, generated_pool, label, count):
    """Pick up to `count` rows with `label`: originals first, generated rows fill the gap."""
    originals = _sample_up_to(original_pool[original_pool['Label'] == label], count)
    synthetic = _sample_up_to(generated_pool[generated_pool['Label'] == label], count - len(originals))
    return originals, synthetic


# Unique terms from both sources. Sorted because set iteration order can differ between
# Python sessions, which would make the row order of the saved file non-reproducible.
terms = sorted(
    set(original_data['Term'].dropna().unique()) | set(generated_data['Term'].dropna().unique()),
    key=str,
)

augmented_data = []
shortfalls = []  # (term, label, rows found) wherever fewer than required_count rows exist

# Augment data for each term
for term in terms:
    # Filter original and generated datasets for this term
    original_term_data = original_data[original_data['Term'] == term]
    generated_term_data = generated_data[generated_data['Term'] == term]

    # Insensitive (1) rows first, then notInsensitive (0); originals precede generated rows
    for label in (1, 0):
        selected_original, selected_generated = _select_for_label(
            original_term_data, generated_term_data, label, required_count
        )
        augmented_data.extend(selected_original.to_dict(orient='records'))
        augmented_data.extend(selected_generated.to_dict(orient='records'))

        found = len(selected_original) + len(selected_generated)
        if found < required_count:
            shortfalls.append((term, label, found))

augmented_df = pd.DataFrame(augmented_data)

#only the required columns
required_columns = ['Title', 'Sentence', 'Term', 'Label', 'Source_File', 'Source', 'Prompt']
augmented_df = augmented_df[required_columns]

# Save the augmented dataset
output_file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/25Augmented_annotationsV2.csv'
augmented_df.to_csv(output_file_path, index=False)

print(f"Augmented dataset saved to {output_file_path}")

# Report term/label pairs that could not reach the requested count
for term, label, found in shortfalls:
    print(f"Only {found} of {required_count} sentences available for term '{term}' with label {label}")


### Augmented Data Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the augmented dataset and count sentences per (Term, Source), one column per label
path = '/content/drive/MyDrive/Honours MiscData(Roshna)/25Augmented_annotationsV2.csv'
data = pd.read_csv(path)
term_summary = (
    data.groupby(['Term', 'Source', 'Label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)  # make sure both label columns exist
    .reset_index()
)

# reverse alphabetical order
terms = sorted(term_summary['Term'].unique(), reverse=True)

insensitive_original = []
insensitive_gpt = []
not_insensitive_original = []
not_insensitive_gpt = []

for term in terms:
    term_data = term_summary[term_summary['Term'] == term]

    # A term may be missing from one source; its counts are then 0
    original = term_data[term_data['Source'] == 'Original']
    gpt = term_data[term_data['Source'] == 'ChatGPT']

    insensitive_original.append(original[1].values[0] if not original.empty else 0)
    not_insensitive_original.append(original[0].values[0] if not original.empty else 0)
    insensitive_gpt.append(gpt[1].values[0] if not gpt.empty else 0)
    not_insensitive_gpt.append(gpt[0].values[0] if not gpt.empty else 0)


y = [i * 3.5 for i in range(len(terms))]  #spacing between terms
height = 1.2

plt.figure(figsize=(15, 25))
# Insensitive bar: original count with the synthetic count stacked to its right
plt.barh(
    [p - height / 2 for p in y],
    insensitive_original,
    height=height,
    label="Insensitive (Source)",
    color='#e69f00',
    edgecolor='black'
)
plt.barh(
    [p - height / 2 for p in y],
    insensitive_gpt,
    height=height,
    left=insensitive_original,
    label="Insensitive (Synthetic)",
    color='#003f5c',
    edgecolor='black'
)

# Not-insensitive bar: same stacking, drawn just above the insensitive bar
plt.barh(
    [p + height / 2 for p in y],
    not_insensitive_original,
    height=height,
    label="Not Insensitive (Source)",
    color='#d1495b',
    edgecolor='black',
    alpha=0.9
)
plt.barh(
    [p + height / 2 for p in y],
    not_insensitive_gpt,
    height=height,
    left=not_insensitive_original,
    label="Not Insensitive (Synthetic)",
    color='#00876c',
    edgecolor='black',
    alpha=0.9
)

# The bars are stacked, so the axis has to fit original + synthetic per bar;
# the largest single component alone would clip the stacked bars.
stacked_totals = [
    source_count + synthetic_count
    for source_count, synthetic_count in zip(
        insensitive_original + not_insensitive_original,
        insensitive_gpt + not_insensitive_gpt,
    )
]
max_x = max(stacked_totals, default=1)
plt.xlim(0, max_x)
plt.xlabel("Number of Sentences", fontsize=14)
plt.ylabel("Terms", fontsize=14)
plt.yticks(ticks=y, labels=terms, fontsize=12)  # terms on the y-axis

# Dashed vertical guide every 5 sentences across the visible range
for x in range(0, int(max_x) + 1, 5):
    plt.axvline(x, color='gray', linestyle='--', alpha=0.5)

plt.legend(
    fontsize= 11,
    ncol=4,
    loc='upper left'
)
# Remove top and right border (spines)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.savefig('distribution_by_term_horizontaln.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the augmented dataset and count sentences per (Term, Source), one column per label
path = '/content/drive/MyDrive/Honours MiscData(Roshna)/25Augmented_annotationsV2.csv'
data = pd.read_csv(path)
term_summary = (
    data.groupby(['Term', 'Source', 'Label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)  # make sure both label columns exist
    .reset_index()
)

# reverse alphabetical order
terms = sorted(term_summary['Term'].unique(), reverse=True)

insensitive_original = []
insensitive_gpt = []
not_insensitive_original = []
not_insensitive_gpt = []

for term in terms:
    term_data = term_summary[term_summary['Term'] == term]

    # A term may be missing from one source; its counts are then 0
    original = term_data[term_data['Source'] == 'Original']
    gpt = term_data[term_data['Source'] == 'ChatGPT']

    insensitive_original.append(original[1].values[0] if not original.empty else 0)
    not_insensitive_original.append(original[0].values[0] if not original.empty else 0)
    insensitive_gpt.append(gpt[1].values[0] if not gpt.empty else 0)
    not_insensitive_gpt.append(gpt[0].values[0] if not gpt.empty else 0)

# --- Synthetic Percentage Calculation ---
total_synthetic = sum(insensitive_gpt) + sum(not_insensitive_gpt)
total_original = sum(insensitive_original) + sum(not_insensitive_original)
total_all = total_synthetic + total_original

# Guard against an empty dataset, which would otherwise divide by zero
synthetic_percentage = (total_synthetic / total_all) * 100 if total_all else 0.0
print(f"Synthetic sentences: {total_synthetic} ({synthetic_percentage:.2f}% of total)")
print(f"Original sentences: {total_original} ({100 - synthetic_percentage:.2f}% of total)")
# ----------------------------------------

# Plotting
y = [i * 3.5 for i in range(len(terms))]  # spacing between terms
height = 1.2

plt.figure(figsize=(15, 25))
# Insensitive bar: original count with the synthetic count stacked to its right
plt.barh(
    [p - height / 2 for p in y],
    insensitive_original,
    height=height,
    label="Insensitive (Source)",
    color='#e69f00',
    edgecolor='black'
)
plt.barh(
    [p - height / 2 for p in y],
    insensitive_gpt,
    height=height,
    left=insensitive_original,
    label="Insensitive (Synthetic)",
    color='#003f5c',
    edgecolor='black'
)

# Not-insensitive bar: same stacking, drawn just above the insensitive bar
plt.barh(
    [p + height / 2 for p in y],
    not_insensitive_original,
    height=height,
    label="Not Insensitive (Source)",
    color='#d1495b',
    edgecolor='black',
    alpha=0.9
)
plt.barh(
    [p + height / 2 for p in y],
    not_insensitive_gpt,
    height=height,
    left=not_insensitive_original,
    label="Not Insensitive (Synthetic)",
    color='#00876c',
    edgecolor='black',
    alpha=0.9
)

# The bars are stacked, so the axis has to fit original + synthetic per bar;
# the largest single component alone would clip the stacked bars.
stacked_totals = [
    source_count + synthetic_count
    for source_count, synthetic_count in zip(
        insensitive_original + not_insensitive_original,
        insensitive_gpt + not_insensitive_gpt,
    )
]
max_x = max(stacked_totals, default=1)
plt.xlim(0, max_x)
plt.xlabel("Number of Sentences", fontsize=14)
plt.ylabel("Terms", fontsize=14)
plt.yticks(ticks=y, labels=terms, fontsize=12)

# Dashed vertical guide every 5 sentences across the visible range
for x in range(0, int(max_x) + 1, 5):
    plt.axvline(x, color='gray', linestyle='--', alpha=0.5)

plt.legend(
    fontsize=11,
    ncol=4,
    loc='upper left'
)

# Remove top and right borders
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save plot
plt.savefig('distribution_by_term_horizontaln.png', dpi=300, bbox_inches='tight')
plt.show()
