In [162]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Placeholder for the integer class labels of the fine-tuning data.
# It starts empty; the real labels are derived from the dataset responses
# in later cells (the previous literal contained an Ellipsis object, which
# is not a valid class index).
labels = []



In [163]:
import pandas as pd


In [164]:
# Load the training split into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/train.csv")

In [165]:
# Show the first five rows as plain text.
print(df.head(n=5))

                                                post  \
0  Post: my recent stress has come from school. i...   
1  Post: i have been stressed about finding a way...   
2  Post: i have been stressed out by my daughter ...   
3  Post: People talking shit about my friends ups...   
4  Post: i am not sure how i feel about my new jo...   

                                          question  \
0   Question: This post shows the stress cause of    
1   Question: This post shows the stress cause of    
2   Question: This post shows the stress cause of    
3   Question: This post shows the stress cause of    
4   Question: This post shows the stress cause of    

                                            response  
0  school. Reasoning: The post explicitly mention...  
1  financial problem. Reasoning: The post mention...  
2  family issues. Reasoning: The post mentions th...  
3  social relationships. Reasoning: The post expl...  
4  work. Reasoning: The post mentions the poster'...  


In [166]:
# Drop the literal "Reasoning: " marker from every response.
# regex=False makes this a plain substring replacement regardless of the
# installed pandas version's default for `regex`.
df['response'] = df['response'].str.replace('Reasoning: ', '', regex=False)

# Print the modified DataFrame
print(df)

                                                   post  \
0     Post: my recent stress has come from school. i...   
1     Post: i have been stressed about finding a way...   
2     Post: i have been stressed out by my daughter ...   
3     Post: People talking shit about my friends ups...   
4     Post: i am not sure how i feel about my new jo...   
...                                                 ...   
5542  Post: i haven't been happy with us, and its on...   
5543               Post: Stress has made me irrational.   
5544  Post: im so stressed out, i just dont have tim...   
5545             Post: my friends wont be there for me.   
5546  Post: The face I heard before Your head trip's...   

                                             question  \
0      Question: This post shows the stress cause of    
1      Question: This post shows the stress cause of    
2      Question: This post shows the stress cause of    
3      Question: This post shows the stress cause of    
4     

In [168]:
def func(output_an):
    """
    Maps the content of 'output_an' to a numerical label based on specific keywords.

    The keyword that occurs EARLIEST in the text decides the label. The
    responses in this notebook start with the category name followed by free
    text, so a keyword that merely appears later in that free text must not
    override the leading category.

    Parameters:
    - output_an (str): The input string to be classified. Matching is
      case-insensitive and substring-based.

    Returns:
    - int: Numerical label corresponding to the category:
      school=0, financial=1, family=2, social=3, work=4, health=5,
      emotional=6, decision=7, other=8.
      Returns -1 when no keyword is found or when 'output_an' is not a
      string (e.g. a missing value read from a CSV).
    """
    # Keyword -> label pairs, kept in the original priority order.
    keyword_labels = (
        ('school', 0),
        ('financial', 1),
        ('family', 2),
        ('social', 3),
        ('work', 4),
        ('health', 5),
        ('emotional', 6),
        ('decision', 7),
        ('other', 8),
    )

    # Non-string cells (NaN, None, numbers) cannot be lower-cased or searched.
    if not isinstance(output_an, str):
        return -1

    output_lower = output_an.lower()

    best_label = -1  # -1 indicates no match
    best_position = len(output_lower)
    for keyword, label in keyword_labels:
        position = output_lower.find(keyword)
        # Strict '<' keeps the earlier-listed keyword if positions ever tie.
        if position != -1 and position < best_position:
            best_label = label
            best_position = position

    return best_label

# Example usage: func("school. The post mentions exams.") returns 0



In [167]:
# Collect the cleaned response texts as a plain Python list.
sequences = df['response'].tolist()


In [169]:
# Start with an empty list of integer labels.
labels = list()

In [170]:
# Build the label list in a single assignment so that re-running this cell
# does not append a second copy of every label to `labels`.
labels = [func(response_text) for response_text in sequences]

In [172]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertForSequenceClassification(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    This is a minimal, notebook-local implementation that returns raw logits.
    It shares its name with ``transformers.BertForSequenceClassification`` but
    is a different class.
    """

    def __init__(self, num_classes=2):
        """Load the pretrained 'bert-base-uncased' encoder and build the head.

        Parameters:
        - num_classes (int): Number of output classes (size of the logits).
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so the layer always matches the loaded checkpoint.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Compute classification logits for a batch of token ids.

        Parameters:
        - input_ids: Token id tensor passed to the encoder.
        - attention_mask: Optional mask forwarded to the encoder.
        - token_type_ids: Optional segment ids forwarded to the encoder.

        Returns:
        - Tensor of unnormalised logits produced by the linear head from the
          encoder's pooled output.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.classifier(pooled_output)

# Build the classifier with nine output classes, one per label id (0-8)
# that func can return.
model = BertForSequenceClassification(9)


In [173]:
# Tokenize input sequences
# NOTE(review): `sequences` holds the response text that `func` also reads to
# derive `labels`, so the model sees the label keyword in its own input.
# Confirm this is intended (rather than classifying df['post']).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_inputs = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')

# func returns -1 when no keyword matches; CrossEntropyLoss needs class
# indices >= 0, so fail early with a clear message instead of inside the loss.
label_tensor = torch.tensor(labels)
if (label_tensor < 0).any():
    raise ValueError("labels contains negative class ids (unmatched responses); "
                     "filter or relabel them before training")

# Create PyTorch dataset
dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], tokenized_inputs['token_type_ids'], label_tensor)

# Split the dataset into training and validation sets (80/20).
# A seeded generator makes the split reproducible across kernel restarts.
split_seed = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# DataLoader for training and validation sets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# The model itself was instantiated in an earlier cell.

# Set up optimizer and learning rate scheduler.
# `epochs` is defined once and reused so the scheduler length cannot drift
# from the number of epochs actually run.
epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Loss function
criterion = nn.CrossEntropyLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Use `batch_labels` (not `labels`) so the notebook-level `labels`
        # list is not overwritten by a tensor, which would break a re-run.
        input_ids, attention_mask, token_type_ids, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss}")

# Validation loop
model.eval()
val_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, token_type_ids, batch_labels = (t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss = criterion(outputs, batch_labels)
        val_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

avg_val_loss = val_loss / len(val_dataloader)
val_accuracy = correct / total

print(f"Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")


Epoch 1/3, Training Loss: 0.3694627137432585
Epoch 2/3, Training Loss: 0.031169368021136564
Epoch 3/3, Training Loss: 0.019980016995301778
Validation Loss: 0.014200728725908059, Validation Accuracy: 0.9981981981981982


In [174]:
# Load the held-out validation split; replace '/content/val.csv' with the actual path.
val_df = pd.read_csv(filepath_or_buffer='/content/val.csv')


In [175]:
# Drop the literal "Reasoning: " marker from every validation response.
# regex=False makes this a plain substring replacement regardless of the
# installed pandas version's default for `regex`.
val_df['response'] = val_df['response'].str.replace('Reasoning: ', '', regex=False)

# Print the modified DataFrame
print(val_df)

                                                  post  \
0    Post: the bad thing is that i have to go to sc...   
1    Post: anyways i am so sick of the constant dra...   
2    Post: My baby is sick and I feel bad that he i...   
3    Post: I always feel guilty when my family star...   
4    Post: Its made my relationships with people ve...   
..                                                 ...   
611  Post: I get the feeling that no matter where I...   
612  Post: I'm having a hard time getting a job. Th...   
613  Post: What and how am I going to do? Seriously...   
614           Post: I am very worried about my mother.   
615  Post: my job. it has 0 application for the shi...   

                                            question  \
0     Question: This post shows the stress cause of    
1     Question: This post shows the stress cause of    
2     Question: This post shows the stress cause of    
3     Question: This post shows the stress cause of    
4     Question: This po

In [186]:
# Rebuild the inputs and integer labels from the validation responses.
sequences = val_df['response'].tolist()
labels = [func(response_text) for response_text in sequences]

In [187]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from sklearn.metrics import classification_report

# Evaluate on the validation DataFrame loaded in an earlier cell
df_val = val_df

# The fine-tuned model is the `model` object trained in an earlier cell

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(data, max_length=128):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Parameters:
    - data (list[str]): Texts to tokenize.
    - max_length (int): Every sequence is padded or truncated to this length.

    Returns:
    - The tokenizer output; 'input_ids', 'attention_mask' and
      'token_type_ids' are read from it below.
    """
    return tokenizer(
        data,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True
    )

X_val = tokenize_data(df_val['response'].values.tolist())

# Prepare DataLoader (no shuffling, so predictions stay in row order)
val_dataset = TensorDataset(X_val['input_ids'], X_val['attention_mask'], X_val['token_type_ids'])
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Evaluate the model
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in val_loader:
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'token_type_ids': batch[2].to(device)}

        outputs = model(**inputs)
        # Softmax is monotonic, so the argmax of the raw logits is the same
        # class as the argmax of the probabilities.
        predictions = torch.argmax(outputs, dim=1)
        # .tolist() stores plain Python ints rather than numpy scalars.
        all_predictions.extend(predictions.cpu().tolist())

# Show predicted and true class ids
print(all_predictions)
print(labels)

# True labels are the class ids produced by func for the validation responses
true_labels = labels
assert len(true_labels) == len(all_predictions), (
    "labels and predictions differ in length; rebuild `labels` from val_df before evaluating"
)

# Print classification report
print(classification_report(true_labels, all_predictions))


[0, 3, 5, 2, 3, 1, 1, 1, 4, 3, 3, 2, 7, 1, 0, 2, 1, 2, 1, 0, 1, 1, 1, 3, 4, 4, 1, 0, 4, 3, 2, 4, 4, 1, 0, 4, 3, 8, 4, 2, 4, 0, 1, 6, 2, 1, 0, 2, 3, 5, 7, 0, 1, 5, 4, 0, 1, 7, 4, 6, 4, 2, 2, 4, 0, 2, 2, 2, 4, 0, 4, 3, 4, 5, 3, 3, 0, 4, 0, 5, 4, 8, 6, 3, 0, 1, 0, 8, 1, 0, 3, 1, 5, 0, 3, 8, 0, 4, 4, 5, 1, 7, 1, 0, 3, 8, 4, 5, 5, 0, 0, 2, 7, 6, 0, 0, 6, 2, 0, 6, 0, 0, 2, 4, 4, 7, 0, 0, 4, 2, 5, 4, 5, 0, 6, 0, 1, 1, 2, 0, 7, 3, 2, 3, 2, 8, 1, 1, 1, 3, 1, 0, 0, 4, 5, 7, 0, 4, 0, 0, 0, 2, 1, 3, 2, 0, 0, 3, 4, 0, 3, 0, 1, 3, 6, 0, 4, 5, 3, 0, 1, 6, 4, 4, 3, 3, 0, 3, 4, 3, 5, 2, 3, 4, 4, 1, 8, 1, 5, 1, 4, 1, 0, 2, 1, 2, 2, 2, 5, 1, 1, 4, 4, 7, 1, 2, 0, 2, 7, 6, 3, 0, 0, 1, 7, 2, 7, 1, 4, 4, 5, 6, 4, 1, 6, 3, 2, 1, 4, 1, 0, 3, 2, 0, 4, 4, 1, 0, 0, 1, 4, 1, 0, 4, 0, 0, 1, 6, 2, 0, 4, 3, 0, 1, 4, 4, 4, 0, 0, 0, 0, 5, 4, 5, 0, 5, 5, 4, 0, 2, 2, 4, 1, 4, 1, 0, 1, 6, 3, 3, 3, 0, 1, 1, 4, 3, 0, 0, 5, 8, 1, 4, 1, 2, 1, 3, 1, 7, 5, 0, 2, 0, 3, 4, 2, 1, 1, 7, 4, 3, 7, 1, 0, 0, 6, 8, 7, 2, 4, 4, 1, 2, 5, 

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# `labels` holds the true class ids for the validation set and
# `all_predictions` the predicted class ids collected in the evaluation cell
# (the previous name `all_preds` was never defined and raised a NameError).
accuracy = accuracy_score(labels, all_predictions)
f1 = f1_score(labels, all_predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")