In [4]:
import pandas as pd
df=pd.read_csv("/kaggle/input/train-smaple-with-code/train-sample_with_code.csv")

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Preprocess DataFrame
df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']] = df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']].fillna('')
df['Tags_combined'] = df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']].apply(lambda x: ' '.join(x), axis=1)
df = df[['Title', 'BodyMarkdown', 'Tags_combined', 'OpenStatus']].dropna()
df['OpenStatus'] = df['OpenStatus'].map(lambda x: 1 if x == "open" else 0)
df["text"] = df.apply(lambda row: f"Title: {row['Title']} Body: {row['BodyMarkdown']} Tags: {row['Tags_combined']}", axis=1)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["OpenStatus"].tolist(), test_size=0.2, random_state=42)

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Custom Dataset
class SOFDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SOFDataset(train_encodings, train_labels)
val_dataset = SOFDataset(val_encodings, val_labels)

# DataLoaders (batch size increased for multi-GPU use)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Load Pretrained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)  # Wrap for multi-GPU
model = model.to(device)

# Optimizer, Scheduler, Loss
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
criterion = nn.CrossEntropyLoss()

# Training Loop with Early Stopping
best_val_loss = float("inf")
patience = 3
counter = 0
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss.mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    val_preds, val_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = criterion(outputs.logits, batch["labels"])
            val_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_true.extend(batch["labels"].cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(val_true, val_preds)
    print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}")

    # Early Stopping Checkpoint
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), "best_fine_tuned_bert.pth")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered! Loading best model...")
            model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(model)
            model.load_state_dict(torch.load("/kaggle/working/best_fine_tuned_bert.pth"))
            model = model.to(device)
            break

# Final Evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(batch["labels"].cpu().numpy())

# Print Metrics
print("Fine-Tuned BERT Classification Report:")
print(classification_report(y_true, y_pred))

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs




Epoch 1: Train Loss = 0.4756, Val Loss = 0.4601, Val Acc = 0.7836




Epoch 2: Train Loss = 0.3918, Val Loss = 0.4534, Val Acc = 0.7958




Epoch 3: Train Loss = 0.2962, Val Loss = 0.5181, Val Acc = 0.7910




Fine-Tuned BERT Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80      0.79     14117
           1       0.79      0.78      0.79     13938

    accuracy                           0.79     28055
   macro avg       0.79      0.79      0.79     28055
weighted avg       0.79      0.79      0.79     28055



In [4]:
# Convert validation texts and original label lists back to a DataFrame
val_df = pd.DataFrame({
    "text": val_texts,
    "True_Label": y_true,
    "Predicted_Label": y_pred
})

# Re-split text back into Title, BodyMarkdown, Tags_combined for CSV output
val_df[["Title", "BodyMarkdown", "Tags_combined"]] = val_df["text"].str.extract(
    r"Title: (.*?) Body: (.*?) Tags: (.*)", expand=True)

# Filter misclassifications
misclassified_df = val_df[val_df["True_Label"] != val_df["Predicted_Label"]]

# Save misclassifications to CSV
misclassified_df.to_csv("misclassified_finetuned_bert.csv", index=False)
print("Misclassified examples saved to 'misclassified_finetuned_bert.csv'")


Misclassified examples saved to 'misclassified_finetuned_bert.csv'


In [8]:
import re

# Split the 'text' column into Title, BodyMarkdown, Tags_combined safely
def extract_parts(text):
    match = re.match(r"Title: (.*?) Body: (.*?) Tags: (.*)", text, flags=re.DOTALL)
    if match:
        return match.groups()
    else:
        return ("", "", "")

# Apply extraction
val_df[["Title", "BodyMarkdown", "Tags_combined"]] = val_df["text"].apply(lambda x: pd.Series(extract_parts(x)))


In [10]:
misclassified_df = val_df[val_df["True_Label"] != val_df["Predicted_Label"]]

# Save misclassifications to CSV
misclassified_df.to_csv("misclassified_finetuned_bert.csv", index=False)
print("Misclassified examples saved to 'misclassified_finetuned_bert.csv'")

Misclassified examples saved to 'misclassified_finetuned_bert.csv'


In [11]:
len(misclassified_df)

5863

In [15]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import warnings
from transformers import logging as hf_logging

# Suppress warnings
hf_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)

# Load tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load fine-tuned weights
state_dict = torch.load("/kaggle/working/best_fine_tuned_bert.pth", map_location=torch.device('cpu'))

# Fix DataParallel prefixes if any
new_state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
model.load_state_dict(new_state_dict)
model.eval()

# Function to predict
def predict(title, body, tags):
    combined_input = f"{title} [SEP] {body} [SEP] {tags}"
    inputs = tokenizer(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
    
    return prediction, probs.squeeze().tolist()

# User input
title_input = input("Enter the Title: ")
body_input = input("Enter the BodyMarkdown: ")
tags_input = input("Enter the Tags (space separated): ")

# Run prediction
label, probabilities = predict(title_input, body_input, tags_input)

# Output
print(f"\n🔮 Predicted Label: {label} ({'Closed' if label == 0 else 'Open'})")
print(f"📊 Probabilities: {probabilities}")


  state_dict = torch.load("/kaggle/working/best_fine_tuned_bert.pth", map_location=torch.device('cpu'))


Enter the Title:  stop ajax function in midway when other element is clicked
Enter the BodyMarkdown:  Hi have a page which has List data.   When I click on an element of the list, an Ajax function is called which populates some date in the right column.  Now wat's working:-  i click on element -> i see the loading.gif a few seconds -> data gets loaded.  When i click on other element midway, the first function completes and only then the second request is taken.  What I want:-  when i click and the ajax is loading, I click again in between, the previous function should stop, and my new request should be taken. 
Enter the Tags (space separated):  jquery ajax



🔮 Predicted Label: 1 (Open)
📊 Probabilities: [0.3762441873550415, 0.6237558722496033]


In [5]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from torch.nn import DataParallel
from tqdm import tqdm

# Check if multiple GPUs are available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpus = torch.cuda.device_count()
print(f"Using device: {device} with {n_gpus} GPUs")

# Load Pretrained BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = DataParallel(bert_model)  # Wrap for multi-GPU
bert_model.to(device)
bert_model.eval()  # Disable dropout etc.

# Clean and prepare data
df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']] = df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']].fillna('')
df['Tags_combined'] = df[['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']].apply(lambda x: ' '.join(x), axis=1)
df = df[['Title', 'BodyMarkdown', 'CodeSnippets', 'Tags_combined', 'OpenStatus']].dropna()
df["BiStatus"] = df["OpenStatus"].map(lambda x: 1 if x == "open" else 0)

# Function to batch process embeddings
def get_bert_embeddings(texts, batch_size=8):
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting BERT embeddings"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch.tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        embeddings.append(cls_embeddings)

    return np.vstack(embeddings)

# Batch embedding extraction for each feature
title_embs = get_bert_embeddings(df['Title'])
body_embs = get_bert_embeddings(df['BodyMarkdown'])
code_embs = get_bert_embeddings(df['CodeSnippets'])
tags_embs = get_bert_embeddings(df['Tags_combined'])

# Concatenate all embeddings
X = np.hstack([title_embs, body_embs, code_embs, tags_embs])
y = df["BiStatus"].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluate
print("Logistic Regression (BERT) Classification Report:")
print(classification_report(y_test, y_pred_lr))


Using device: cuda with 2 GPUs


Extracting BERT embeddings: 100%|██████████| 12000/12000 [12:46<00:00, 15.66it/s]
Extracting BERT embeddings: 100%|██████████| 12000/12000 [37:08<00:00,  5.38it/s]
Extracting BERT embeddings: 100%|██████████| 12000/12000 [38:04<00:00,  5.25it/s]
Extracting BERT embeddings: 100%|██████████| 12000/12000 [12:29<00:00, 16.00it/s]


Logistic Regression (BERT) Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      7998
           1       0.78      0.83      0.81     11201

    accuracy                           0.77     19199
   macro avg       0.76      0.75      0.76     19199
weighted avg       0.77      0.77      0.77     19199



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Identify misclassified indices
misclassified_indices = np.where(y_pred_lr != y_test)[0]

# Convert test split to DataFrame
# First, create a temporary test dataframe using the index from train_test_split
X_test_indices = df.iloc[X_train.shape[0]:].index  # assumes order is preserved

# Now collect misclassified rows using index from original df
misclassified_rows = []

for idx in misclassified_indices:
    row_idx = X_test_indices[idx]
    row = df.loc[row_idx]

    misclassified_rows.append({
        "Title": row["Title"],
        "BodyMarkdown": row["BodyMarkdown"],
        "CodeSnippets": row["CodeSnippets"],
        "Tags_combined": row["Tags_combined"],
        "TrueLabel": row["OpenStatus"],
        "PredictedLabel": int(y_pred_lr[idx])
    })

# Create DataFrame from misclassified rows
misclassified_df = pd.DataFrame(misclassified_rows)

# Save to CSV
misclassified_df.to_csv("bert_lr_misclassified_binary.csv", index=False)
print("Misclassified samples saved to 'bert_lr_misclassified.csv'")

Misclassified samples saved to 'bert_lr_misclassified.csv'


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


unique_labels = df["OpenStatus"].unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Map labels to numeric form
df["OpenStatusLabel"] = df["OpenStatus"].map(label_mapping)


X = np.hstack([title_embs, body_embs, code_embs, tags_embs])  # Shape: (num_samples, embedding_size * 4)
y = df["OpenStatusLabel"].values

# ====== TRAIN-TEST SPLIT ======
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ====== MODEL TRAINING ======
print("Training Logistic Regression for multi-class...")
lr_model = LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial')
lr_model.fit(X_train, y_train)

# ====== EVALUATION ======
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression (Multiclass BERT) Classification Report:")
print(classification_report(y_test, y_pred_lr, target_names=[reverse_label_mapping[i] for i in sorted(reverse_label_mapping)]))

# ====== MISCLASSIFIED SAMPLES ======

misclassified_indices = np.where(y_pred_lr != y_test)[0]
print(f"Total misclassified samples: {len(misclassified_indices)}")

# Find indices of X_test in original df (preserving row alignment)
X_test_indices = df.iloc[X_train.shape[0]:].index

misclassified_rows = []
for idx in misclassified_indices:
    row_idx = X_test_indices[idx]
    row = df.loc[row_idx]
    misclassified_rows.append({
        "Title": row["Title"],
        "BodyMarkdown": row["BodyMarkdown"],
        "CodeSnippets": row["CodeSnippets"],
        "Tags_combined": row["Tags_combined"],
        "TrueLabel": reverse_label_mapping[row["OpenStatusLabel"]],
        "PredictedLabel": reverse_label_mapping[y_pred_lr[idx]]
    })

misclassified_df = pd.DataFrame(misclassified_rows)
misclassified_df.to_csv("bertpretained_multiclass_misclassified.csv", index=False)
print("Saved misclassified samples to 'bert_multiclass_misclassified.csv'")


Using device: cuda
Training Logistic Regression for multi-class...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression (Multiclass BERT) Classification Report:
                     precision    recall  f1-score   support

               open       0.75      0.88      0.81     11201
      too localized       0.28      0.08      0.13      1019
          off topic       0.54      0.44      0.48      1920
not a real question       0.51      0.41      0.46      3367
   not constructive       0.56      0.51      0.53      1692

           accuracy                           0.68     19199
          macro avg       0.53      0.46      0.48     19199
       weighted avg       0.65      0.68      0.65     19199

Total misclassified samples: 6194
Saved misclassified samples to 'bert_multiclass_misclassified.csv'
