In [13]:
import pandas as pd

In [14]:
df = pd.read_excel('/content/Deposit - Unified_output.xlsx')

In [15]:
df=df[['Scenario Description','Function','Sub-Function','Feature']]

In [16]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np

# Load your DataFrame (df) here

# Extract features and labels from DataFrame
X = df['Scenario Description']
X = X.astype('str')
y = df[['Function', 'Sub-Function', 'Feature']]

# Ensure all possible classes are represented in the training data
all_classes = set(y['Function']).union(set(y['Sub-Function'])).union(set(y['Feature']))
y_combined = y.apply(lambda x: tuple(all_classes & set(x)), axis=1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_combined, test_size=0.2, random_state=42)

# Encode labels using MultiLabelBinarizer with all possible classes
label_encoder = MultiLabelBinarizer(classes=list(all_classes))
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define DistilBERT model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
num_labels = len(all_classes)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Tokenize data and create PyTorch Dataset
class ScenarioFunctionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            max_length=256  # Adjust based on your specific needs and resources
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

# Create datasets using encoded labels
train_dataset = ScenarioFunctionDataset(X_train, y_train_encoded, tokenizer)
test_dataset = ScenarioFunctionDataset(X_test, y_test_encoded, tokenizer)

# DataLoaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Adjust batch size for DistilBERT
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)  # Adjust batch size for DistilBERT

# Set up optimizer using torch.optim.AdamW with customized learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)  # Recommended learning rate for DistilBERT

# Fine-tune DistilBERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 1  # Adjust based on your dataset and resources
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}: Average training loss = {average_train_loss}")

# Evaluation on test set
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = (logits > 0.0).float()  # Use threshold for multi-label classification

        all_predictions.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Calculate test accuracy (example metric for illustration)
all_predictions = np.concatenate(all_predictions, axis=0)
all_labels = np.concatenate(all_labels, axis=0)
accuracy = accuracy_score(all_labels, all_predictions)

print(f"Test Accuracy: {accuracy:.4f}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Average training loss = 0.24930515051142782
Test Accuracy: 0.0000


**Save the model**

In [17]:
import torch
import json
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Save paths
model_save_path = "distilbert_multilabel_model.pt"
tokenizer_save_path = "distilbert_tokenizer/"
all_classes_save_path = "all_classes.json"

# Assuming all_classes is defined and contains all your classes
all_classes = list(all_classes)

# Save the model and tokenizer
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

# Save all_classes to a JSON file
with open(all_classes_save_path, 'w') as f:
    json.dump(all_classes, f)

In [None]:
!pip install streamlit

**Prediction Code**

In [18]:
import torch
import json
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# Load paths
model_load_path = "distilbert_multilabel_model.pt"
tokenizer_load_path = "distilbert_tokenizer/"
all_classes_load_path = "all_classes.json"

# Load all_classes
with open(all_classes_load_path, 'r') as f:
    all_classes = json.load(f)

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_load_path)
num_labels = len(all_classes)
model_name = 'distilbert-base-uncased'  # Ensure the model name is defined
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Load the state dictionary into the model
state_dict = torch.load(model_load_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict_labels(texts, model, tokenizer, device):
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = (logits > 0.0).float()

    return predictions.cpu().numpy()

# Example new input texts
Product = input('Enter Product Name: ')
Lifecycle = input('Enter the Lifecycle: ')
Scenario = input('Enter the test case scenario: ')
new_texts = [Product + ' ' + Lifecycle + ' ' + Scenario]  # Wrap in a list to keep consistent

# Predict labels
predicted_labels = predict_labels(new_texts, model, tokenizer, device)

# Fit the label_encoder with all_classes
label_encoder = MultiLabelBinarizer(classes=all_classes)
label_encoder.fit([all_classes])  # Fit with all possible classes

# Inverse transform the predicted labels
predicted_classes = label_encoder.inverse_transform(predicted_labels)

# Output predictions
for i, text in enumerate(new_texts):
    #print(f"Text: {text}")
    print(f"Predicted Labels: {predicted_classes[i]}")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter Product Name: h
Enter the Lifecycle: k
Enter the test case scenario: l
Predicted Labels: ('DEPOSIT',)


In [None]:
!pip install pyngrok
import os

**App.py for streamlit**

In [22]:
with open('app.py', 'w') as f:
    f.write('''
import streamlit as st
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Load paths and all_classes
model_load_path = "distilbert_multilabel_model.pt"
tokenizer_load_path = "distilbert_tokenizer/"
all_classes_load_path = "all_classes.json"

with open(all_classes_load_path, 'r') as f:
    all_classes = json.load(f)

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_load_path)
num_labels = len(all_classes)
model_name = 'distilbert-base-uncased'  # Ensure the model name is defined
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Load the state dictionary into the model
state_dict = torch.load(model_load_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Prediction function
def predict_labels(texts, model, tokenizer, device):
    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors='pt'
    )
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = (logits > 0.0).float()

    return predictions.cpu().numpy()

# Streamlit app interface
st.title("Multi-Label Classification with DistilBERT")

st.write("Enter the following details:")

# Input fields
product_name = st.text_input("Product Name")
lifecycle = st.text_input("Lifecycle")
test_scenario = st.text_area("Test Case Scenario")

# Predict button
if st.button("Predict"):
    if product_name and lifecycle and test_scenario:
        new_texts = [product_name + ' ' + lifecycle + ' ' + test_scenario]
        predicted_labels = predict_labels(new_texts, model, tokenizer, device)

        # Fit the label_encoder with all_classes
        label_encoder = MultiLabelBinarizer(classes=all_classes)
        label_encoder.fit([all_classes])  # Fit with all possible classes

        # Inverse transform the predicted labels
        predicted_classes = label_encoder.inverse_transform(predicted_labels)

        # Output predictions
        st.write("Predicted Labels:")
        for labels in predicted_classes:
            st.write(labels)
    else:
        st.write("Please fill in all fields to predict.")

    ''')


**Deploy streamlit**

In [None]:
from pyngrok import ngrok
import os

# Set your ngrok authtoken
ngrok.set_auth_token("2i3uBlQsW8ANME63cykolRshNKy_6uEocuvMbGp47QwH9ygB6")

# Run the Streamlit app in the background
os.system('app.py &')

# Create a public URL for the Streamlit app
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at: {public_url}")


Streamlit app is running at: NgrokTunnel: "https://be08-34-125-45-99.ngrok-free.app" -> "http://localhost:8501"


In [23]:
from pyngrok import ngrok
import subprocess

# Set your ngrok authtoken
ngrok.set_auth_token("2i3uBlQsW8ANME63cykolRshNKy_6uEocuvMbGp47QwH9ygB6")

# Run the Streamlit app in the background
process = subprocess.Popen(['streamlit', 'run', 'app.py'])

# Create a public URL for the Streamlit app
public_url = ngrok.connect(8501)
print(f"Streamlit app is running at: {public_url}")

# Optional: Keep the script running to maintain the ngrok connection
try:
    process.wait()
except KeyboardInterrupt:
    process.terminate()


Streamlit app is running at: NgrokTunnel: "https://1792-34-16-200-196.ngrok-free.app" -> "http://localhost:8501"
