# Machine Learning Model for Classifying & Critiquing Software Requirements

### Import required libraries (Install if any package is not present)

In [1]:
!pip install fasttext
!pip install openai==0.28



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import fasttext
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW,  Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import openai

### Data Preparation

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

def load_and_preprocess_data(file_path1, file_path2):
    """
    Load and preprocess data from two CSV files, then combine them into a single DataFrame.

    Args:
        file_path1 (str): Path to the first CSV file.
        file_path2 (str): Path to the second CSV file.

    Returns:
        pd.DataFrame: Combined and preprocessed DataFrame.
    """
    # Load data from CSV files
    data1 = pd.read_csv(file_path1)
    data2 = pd.read_csv(file_path2)

    # Select relevant columns and rename them for consistency
    data1 = data1[['Label', 'Document']]
    data1 = data1.rename(columns={'Label': 'Type', 'Document': 'Requirement'})

    # Concatenate the two datasets and remove duplicates
    df = pd.concat([data1, data2])
    df = df.drop_duplicates().reset_index(drop=True)

    # Standardize the 'Type' column
    df['Type'] = df['Type'].apply(lambda x: 'F' if x in ['F', 'FR'] else ('UX' if x in ['LF', 'US'] else 'NFR'))

    return df

# Example usage
file_path1 = '/content/drive/MyDrive/MLassignment-Vyshnavi/nfr.csv'
file_path2 = '/content/drive/MyDrive/MLassignment-Vyshnavi/software_requirements_extended.csv'
df = load_and_preprocess_data(file_path1, file_path2)
df


Unnamed: 0,Type,Requirement
0,NFR,The system shall refresh the display every 60 ...
1,UX,The application shall match the color of the s...
2,UX,If projected the data must be readable. On ...
3,NFR,The product shall be available during normal ...
4,UX,If projected the data must be understandable...
...,...,...
1004,F,There will be a designated phone number that u...
1005,F,Texts sent to that number will be sent to the ...
1006,F,"If a question is not understood by our API, th..."
1007,F,Upon the USB being plugged in the system shall...


### Count the number of occurrences of each unique value in the 'Type' column.

In [5]:
# This provides a quick overview of the distribution of different requirement types in the dataset.
df['Type'].value_counts()

Type
F      525
NFR    380
UX     104
Name: count, dtype: int64

### Download necessary NLTK resources

In [6]:
def download_nltk_resources():
    """
    Download necessary NLTK resources for text preprocessing:

    - 'punkt': Tokenizer that divides a text into a list of sentences or words.
    - 'stopwords': Collection of common words (e.g., 'the', 'is', 'in') that are usually removed in text preprocessing.
    - 'wordnet': Large lexical database of English, which can be used for finding synonyms, antonyms, and word definitions.

    Returns:
        None
    """
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

# Call the function to download NLTK resources
download_nltk_resources()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Data Cleaning: Text Preprocessing

In [7]:
# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define stop words
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans the input text by performing the following steps:

    1. Converts the text to lowercase.
    2. Removes punctuation and special characters.
    3. Tokenizes the text into individual words.
    4. Removes stop words and lemmatizes the remaining words.
    5. Joins the cleaned tokens back into a single string.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a single string
    cleaned_text = ' '.join(cleaned_tokens)

    return cleaned_text

# Apply the cleaning function to the 'Requirement' column
df['Cleaned_Requirement'] = df['Requirement'].apply(clean_text)

# Print cleaned data for verification
print(df[['Type', 'Cleaned_Requirement']])

     Type                                Cleaned_Requirement
0     NFR          system shall refresh display every second
1      UX  application shall match color schema set forth...
2      UX  projected data must readable x projection scre...
3     NFR  product shall available normal business hour l...
4      UX  projected data must understandable x projectio...
...   ...                                                ...
1004    F             designated phone number user send text
1005    F  text sent number sent api system reply user an...
1006    F  question understood api system send text conta...
1007    F  upon usb plugged system shall able deployed op...
1008    F  system shall able handle customer logged concu...

[1009 rows x 2 columns]


 ### Train Test splits

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Cleaned_Requirement'], df['Type'], test_size=0.3, random_state=42,stratify=df['Type'])
print(y_train.value_counts(),y_test.value_counts())


Type
F      367
NFR    266
UX      73
Name: count, dtype: int64 Type
F      158
NFR    114
UX      31
Name: count, dtype: int64


## Model Training

### Off-the-Shelf Models

### BERT Model Fine-Tuning and Classification

#### Split Data
- Split the data into training and testing sets.

#### Load BERT Model and Tokenizer
- Load the pre-trained BERT model and tokenizer for sequence classification with 3 output classes: F, NFR, UX.

#### Tokenize Text Data
- Tokenize the text data for both training and testing sets, padding sequences to a max length of 128.

#### Convert Labels to Integers
- Convert the labels to integers using a label map.

#### Create DataLoader
- Create DataLoader for both training and testing sets.

#### Fine-Tune BERT Model
- Fine-tune the BERT model on the training set for 10 epochs.

#### Evaluate the Fine-Tuned Model
- Evaluate the fine-tuned model on the testing set and collect predictions and true labels.

#### Results
- Print the accuracy, classification report, and confusion matrix.


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Cleaned_Requirement'], df['Type'], test_size=0.3, random_state=5, stratify=df['Type'])

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 output classes: F, NFR, UX
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text data
X_train_tokenized = tokenizer(list(X_train), padding=True, truncation=True, max_length=128, return_tensors='pt')
X_test_tokenized = tokenizer(list(X_test), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Convert labels to integers
label_map = {label: i for i, label in enumerate(df['Type'].unique())}
y_train_encoded = torch.tensor([label_map[label] for label in y_train])
y_test_encoded = torch.tensor([label_map[label] for label in y_test])

# Create DataLoader for training and testing sets
train_dataset = TensorDataset(X_train_tokenized['input_ids'], X_train_tokenized['attention_mask'], y_train_encoded)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = TensorDataset(X_test_tokenized['input_ids'], X_test_tokenized['attention_mask'], y_test_encoded)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Fine-tune the BERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(10):
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the fine-tuned model
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
test_labels = [label_map[label] for label in y_test]
# Convert integer labels back to original labels
label_map_inverse = {0: 'NFR', 1: 'UX', 2: 'F'}
predicted_labels = [label_map_inverse[label] for label in predictions]
true_labels = [label_map_inverse[label] for label in test_labels]

# Save the fine-tuned BERT model
model.save_pretrained("fine_tuned_bert_model")

# Save the tokenizer
tokenizer.save_pretrained("fine_tuned_bert_model/tokenizer")

# Evaluate the model's performance
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy * 100:.2f}%')
report = classification_report(true_labels, predicted_labels, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=['F', 'NFR', 'UX'])

print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 89.11%
Classification Report:
              precision    recall  f1-score   support

           F       0.92      0.89      0.90       158
         NFR       0.85      0.89      0.87       114
          UX       0.93      0.87      0.90        31

    accuracy                           0.89       303
   macro avg       0.90      0.89      0.89       303
weighted avg       0.89      0.89      0.89       303

Confusion Matrix:
[[141  16   1]
 [ 11 102   1]
 [  2   2  27]]


### Custom model training

### Fasttext Model Training


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Cleaned_Requirement'], df['Type'], test_size=0.3, random_state=5, stratify=df['Type'])
# Prepare training data in fastText format
X_train_array = np.array(X_train)
X_train_flattened = X_train_array.flatten()
train_data_fasttext = []
for text, label in zip(X_train_flattened, y_train):
    train_data_fasttext.append(f"__label__{label} {text}")

# Write the training data to a file
with open('train_data.txt', 'w') as f:
    for item in train_data_fasttext:
        f.write("%s\n" % item)

# Train the fastText model

model = fasttext.train_supervised(input='train_data.txt', epoch=100, lr=0.1, wordNgrams=2)

# Predict the labels for the test set
y_pred = [model.predict(text)[0][0].split('__label__')[1] for text in X_test]

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 86.14%
Classification Report:
              precision    recall  f1-score   support

           F       0.87      0.92      0.90       158
         NFR       0.84      0.88      0.86       114
          UX       0.88      0.48      0.62        31

    accuracy                           0.86       303
   macro avg       0.87      0.76      0.79       303
weighted avg       0.86      0.86      0.86       303

Confusion Matrix:
[[146  11   1]
 [ 13 100   1]
 [  8   8  15]]


### SVM Model Training and Classification with Doc2Vec

#### Define Parameter Grid
- Define a parameter grid for GridSearchCV to optimize hyperparameters for SVM.

#### Split Data and Train SVM Model
- Split the data into training and testing sets.
- Perform GridSearchCV to find the best hyperparameters for SVM.

#### Save the Trained Model
- Save the trained SVM model using joblib.

#### Make Predictions and Evaluate Model
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.

#### Results
- Print the best parameters found by GridSearchCV, best cross-validation score, accuracy, classification report, and confusion matrix.


In [20]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto']
}

tagged_documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df['Cleaned_Requirement'])]

# Initialize and train the Doc2Vec model using Distributed Memory (DM)
model_dm = Doc2Vec(vector_size=20, window=4, min_count=1, workers=4, epochs=100, dm=1)
model_dm.build_vocab(tagged_documents)
model_dm.train(tagged_documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs)

# Initialize and train the Doc2Vec model using Distributed Bag of Words (DBOW)
model_dbow = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100, dm=0)
model_dbow.build_vocab(tagged_documents)
model_dbow.train(tagged_documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

# Transform documents into vectors using both trained Doc2Vec models
document_vectors_dm = [model_dm.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]
document_vectors_dbow = [model_dbow.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]

# Concatenate the vectors from both models
document_vectors_combined = [list(dm_vec) + list(dbow_vec) for dm_vec, dbow_vec in zip(document_vectors_dm, document_vectors_dbow)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(document_vectors_combined, df['Type'], random_state=10, test_size=0.3,stratify=df['Type'])

grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
# Make predictions on the test set
y_pred = grid_search.predict(X_test)


joblib.dump(grid_search, "svm_model.pkl")

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Combined DM and DBOW Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)


Best parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score: 0.798851263610029
Combined DM and DBOW Accuracy: 82.18%
Accuracy: 0.8217821782178217
Classification Report:
              precision    recall  f1-score   support

           F       0.83      0.91      0.86       158
         NFR       0.82      0.76      0.79       114
          UX       0.79      0.61      0.69        31

    accuracy                           0.82       303
   macro avg       0.81      0.76      0.78       303
weighted avg       0.82      0.82      0.82       303

Confusion Matrix:
[[143  12   3]
 [ 25  87   2]
 [  5   7  19]]


Overall, the BERT model is performing well in predicting three types of software requirements, achieving an accuracy of 89%.

## Model Selection

#### Compare the performance of custom models and off-the-shelf models

For this project, both custom and off-the-shelf models were explored to predict software requirements accurately. The custom models included a variety of techniques such as SVM, logistic regression, Naive Bayes, LSTM, TF-IDF, Doc2Vec, and BOW. In contrast, the off-the-shelf models considered was BERT.

Among the custom models, fasttext showed notable performance. This combination leveraged the strengths of tasttext for classification and, resulting in satisfactory predictions.

For off-the-shelf models, BERT emerged as the top performer. BERT, with its deep learning architecture and pre-trained language model capabilities, provided superior performance in understanding the context and nuances of the software requirements.

#### Select the best-performing model based on evaluation metrics

After a thorough comparison, BERT was identified as the best-performing model overall. It excelled in predicting all types of software requirements with an impressive accuracy of 89.11%. The comprehensive evaluation metrics further highlighted BERT's effectiveness.

**Evaluation Metrics:**

**Accuracy:** 89.11%

**Classification Report:**

          precision    recall  f1-score   support

    F       0.92      0.89      0.90       158
    NFR     0.85      0.89      0.87       114
    UX      0.93      0.87      0.90        31

accuracy    -                       0.89       



### Generate Feedback for Software Requirements

This code uses OpenAI's GPT-3.5-turbo model to generate feedback for software requirements. It takes a requirement and its label as input and provides constructive feedback, suggestions for improvement, identifies potential issues, or seeks clarification if needed.



In [7]:
# Set your OpenAI API key
openai.api_key = 'sk-proj-x1jqzAcp5BOnYCfM3EQJT3BlbkFJNl9PwbgwNSGKZ7mppBDD'

def generate_feedback(requirement, label):
    input_text = f"Requirement: {requirement}\nLabel: {label}\nFeedback: Provide suggestions for improvement, identify any issues, or clarify if needed."

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert in software requirements and user experience. Provide constructive feedback for the given requirement."},
            {"role": "user", "content": input_text}
        ],
        max_tokens=150,
        n=1,
        stop=None,
        temperature=1
    )

    feedback = response.choices[0].message["content"].strip()
    return feedback

# Example usage
a = generate_feedback("The product shall be intuitive and self-explanatory", "User Experience")
print(a)


This requirement is a good starting point for ensuring a positive user experience, but it lacks specific details on how the product will achieve being intuitive and self-explanatory. To make this requirement more effective, consider adding measurable criteria or examples to clarify what "intuitive" means in the context of the product. 

For example, you could define specific user actions that should be straightforward with minimal guidance, or include target metrics for user onboarding success rates. Additionally, consider involving actual users in the testing or design process to gather feedback on whether the product meets their expectations for intuitiveness and self-explanatory features. This approach will help ensure that the user experience is truly intuitive and satisfying.


## Classification and Feedback Generation

In [18]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load the fine-tuned BERT model
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert_model')

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('fine_tuned_bert_model/tokenizer')

# Example usage
sentence = "The product shall be intuitive and self-explanatory."
inputs = tokenizer(sentence, return_tensors='pt')
outputs = model(**inputs)
predicted_label_id = torch.argmax(outputs.logits, dim=1).item()
predicted_label = model.config.id2label[predicted_label_id]
label_map = {'LABEL_0': 'NFR', 'LABEL_1': 'UX', 'LABEL_2': 'F'}
# print(f"Predicted Label: {label_map[predicted_label]}")

type_map = {'NFR': 'Non functional requirement', 'UX': 'User Experience', 'F': 'Functional requirement'}
label=label_map[predicted_label]
print(f"Predicted Label: {type_map[label]}({label_map[predicted_label]})")
# Generate feedback
a = generate_feedback(sentence, type_map[label])
print(f"Feedback: {a}")

Predicted Label: User Experience(UX)
Feedback: Feedback:
1. Define what "intuitive and self-explanatory" means in the context of your product. What specific actions, features, or elements should be intuitive for users? 
2. Consider conducting user testing or user research to validate if the product is indeed intuitive and self-explanatory for your target users. 
3. Provide clear onboarding instructions or tooltips to guide users through the product, especially for complex or new features. 
4. Use consistent design patterns and language throughout the product to enhance usability and familiarity for users. 
5. Seek feedback from users regularly to identify any areas of improvement in the product's intuitiveness and self-explanatory nature. 
6. Continuously iterate and refine the user experience based on user


In [19]:
sentence = "The product shall be available 99% of the time. Rationale"
inputs = tokenizer(sentence, return_tensors='pt')
outputs = model(**inputs)
predicted_label_id = torch.argmax(outputs.logits, dim=1).item()
predicted_label = model.config.id2label[predicted_label_id]
label_map = {'LABEL_0': 'NFR', 'LABEL_1': 'UX', 'LABEL_2': 'F'}
# print(f"Predicted Label: {label_map[predicted_label]}")

type_map = {'NFR': 'Non functional requirement', 'UX': 'User Experience', 'F': 'Functional requirement'}
label=label_map[predicted_label]
print(f"Predicted Label: {type_map[label]}({label_map[predicted_label]})")
# Generate feedback
a = generate_feedback(sentence, type_map[label])
print(f"Feedback: {a}")

Predicted Label: Non functional requirement(NFR)
Feedback: Feedback:
1. Define what "availability" means here - Does it mean the system is accessible to users or fully functional during that time?
2. Specify whether this availability target includes planned maintenance periods or only unexpected downtime.
3. Consider including performance metrics such as response time or uptime percentage in addition to availability.
4. Ensure that this requirement aligns with business needs and user expectations - 99% availability might be too low for critical applications.
5. Consider adding a contingency plan in case the requirement is not met, such as notifying users or providing alternative access during downtime.
6. Consider defining how availability will be measured and monitored to ensure this requirement is met consistently.


In [21]:
sentence = "A search described in requirement UCAR602 results in a displayed set of topics."
inputs = tokenizer(sentence, return_tensors='pt')
outputs = model(**inputs)
predicted_label_id = torch.argmax(outputs.logits, dim=1).item()
predicted_label = model.config.id2label[predicted_label_id]
label_map = {'LABEL_0': 'NFR', 'LABEL_1': 'UX', 'LABEL_2': 'F'}
# print(f"Predicted Label: {label_map[predicted_label]}")

type_map = {'NFR': 'Non functional requirement', 'UX': 'User Experience', 'F': 'Functional requirement'}
label=label_map[predicted_label]
print(f"Predicted Label: {type_map[label]}({label_map[predicted_label]})")
# Generate feedback
a = generate_feedback(sentence, type_map[label])
print(f"Feedback: {a}")

Predicted Label: Functional requirement(F)
Feedback: Feedback:
1. The requirement is quite vague and lacks specificity. It would be helpful to define what "search" involves - such as what can be searched for, where the search takes place, and what filters or options are available to the user.
2. It would be beneficial to clarify the expected scope and contents of the "set of topics" that are displayed as a result of the search. Are these topics related to the search query, sorted in a specific order, or selected based on some criteria?
3. Consider incorporating information on how users can interact with the displayed topics – for example, can they filter, sort, or further explore these topics?
4. Ensure that the requirement aligns with user needs and expectations, and that it contributes


## Conclusion

In the analysis conducted in this notebook, the best results were achieved using the BERT-based model for classifying software requirements into functional (F), non-functional (NFR), and user experience (UX) categories. The BERT model outperformed other models such as SVM, fasttext, LSTM, Naive bayes in terms of accuracy and overall performance.

The BERT model demonstrated superior performance due to its ability to capture contextual information and semantic meaning effectively, especially for the complex and varied language found in software requirements. By leveraging pre-trained language representations, BERT was able to understand the nuances of the requirements and classify them accurately.

Additionally, the BERT model benefited from fine-tuning on the specific classification task, which further improved its performance. The fine-tuning process allowed the model to adapt to the nuances of the dataset, resulting in better classification accuracy.

In conclusion, the BERT-based model proved to be the most effective for classifying software requirements in this analysis, showcasing its capabilities in natural language understanding and classification tasks. And feedback is generated for the software requirements.

### Other Experiments with Custom models

### LSTM Model Training and Classification with Tokenization

#### Tokenization
- Tokenize the cleaned requirements with a limit of 10,000 words.
- Convert the tokenized sequences to sequences of integers.
- Pad the sequences to ensure uniform length.

#### Split Data and Convert Labels
- Split the data into training and testing sets.
- Convert labels to integers using LabelEncoder.

#### Build and Compile LSTM Model
- Build an LSTM model with an embedding layer, LSTM layer, and dense softmax layer.
- Compile the model using 'adam' optimizer and 'sparse_categorical_crossentropy' loss.

#### Train the Model
- Train the model on the training data for 50 epochs with a batch size of 32.

#### Make Predictions and Evaluate Model
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.

#### Results
- Print the accuracy, classification report, and confusion matrix.


In [59]:
tokenizer = Tokenizer(num_words=10000)  # Use the top 10,000 words
tokenizer.fit_on_texts(df["Cleaned_Requirement"])
sequences = tokenizer.texts_to_sequences(df["Cleaned_Requirement"])

# Pad the sequences to ensure uniform length
max_sequence_length = 20  # Choose a suitable maximum length for your data
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, df['Type'], test_size=0.3, stratify=df['Type'])

# Convert labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Build LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_sequence_length))  # Adjust output_dim and input_length as needed
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2,activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 output classes: F, NFR, UX

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_encoded, epochs=50, batch_size=32, validation_data=(X_test, y_test_encoded))

# Convert labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test_encoded, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test_encoded, y_pred, labels=[0, 1, 2])  # Use integer labels

print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 82.84%
Classification Report:
              precision    recall  f1-score   support

           F       0.87      0.90      0.88       158
         NFR       0.78      0.84      0.81       114
          UX       0.76      0.42      0.54        31

    accuracy                           0.83       303
   macro avg       0.81      0.72      0.75       303
weighted avg       0.83      0.83      0.82

Observations: Although the accuracy is good, the model is not performing well for UX software requirements.

### Train a SVM Model with linear kernel using TF-IDF

**Classification:**
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.


In [52]:
# Train a SVM model
classifier = SVC(kernel='linear')
classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
# print(f"f1 score: {f1}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8316831683168316
Classification Report:
              precision    recall  f1-score   support

           F       0.85      0.92      0.88       158
         NFR       0.79      0.82      0.80       114
          UX       0.93      0.45      0.61        31

    accuracy                           0.83       303
   macro avg       0.86      0.73      0.76       303
weighted avg       0.84      0.83      0.82       303



### Train a SVM model with linear kernel using BOW

**Text Preprocessing:**
- Convert text data to numerical features using BOW (Bag Of Words).
- Train a SVM Model with linear kernel.

**Classification:**
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.




In [53]:
# Convert text data to numerical features using Bag-of-Words (CountVectorizer)
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)
classifier = SVC(kernel='linear')
classifier.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8085808580858086
Classification Report:
              precision    recall  f1-score   support

           F       0.85      0.91      0.88       158
         NFR       0.78      0.75      0.76       114
          UX       0.65      0.55      0.60        31

    accuracy                           0.81       303
   macro avg       0.76      0.73      0.75       303
weighted avg       0.80      0.81      0.81       303

Confusion Matrix:
[[143  15   0]
 [ 20  85   9]
 [  5   9  17]]


### Doc2Vec Model Training and Classification using Logistic Regression

#### Tag Documents for Doc2Vec
- Tag documents with indices for training.

#### Initialize and Train Doc2Vec Models
- Initialize and train two Doc2Vec models: Distributed Memory (DM) and Distributed Bag of Words (DBOW).

#### Transform Documents into Vectors
- Transform documents into vectors using both trained Doc2Vec models.

#### Concatenate Vectors
- Concatenate the vectors from both models.

#### Split Data and Train Logistic Regression Model
- Split the data into training and testing sets.
- Initialize and train a logistic regression model.

#### Make Predictions and Evaluate Model
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.

#### Results
- Print the accuracy, classification report, and confusion matrix.


In [54]:
# Tag documents for Doc2Vec
tagged_documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df['Cleaned_Requirement'])]

# Initialize and train the Doc2Vec model using Distributed Memory (DM)
model_dm = Doc2Vec(vector_size=20, window=4, min_count=1, workers=4, epochs=100, dm=1)
model_dm.build_vocab(tagged_documents)
model_dm.train(tagged_documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs)

# Initialize and train the Doc2Vec model using Distributed Bag of Words (DBOW)
model_dbow = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100, dm=0)
model_dbow.build_vocab(tagged_documents)
model_dbow.train(tagged_documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

# Transform documents into vectors using both trained Doc2Vec models
document_vectors_dm = [model_dm.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]
document_vectors_dbow = [model_dbow.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]

# Concatenate the vectors from both models
document_vectors_combined = [list(dm_vec) + list(dbow_vec) for dm_vec, dbow_vec in zip(document_vectors_dm, document_vectors_dbow)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(document_vectors_combined, df['Type'], test_size=0.2,stratify=df['Type'])

# Initialize and train the logistic regression model
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Combined DM and DBOW Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Combined DM and DBOW Accuracy: 76.24%
Accuracy: 0.7623762376237624
Classification Report:
              precision    recall  f1-score   support

           F       0.80      0.90      0.85       105
         NFR       0.77      0.67      0.72        76
          UX       0.47      0.43      0.45        21

    accuracy                           0.76       202
   macro avg       0.68      0.66      0.67       202
weighted avg       0.76      0.76      0.76       202

Confusion Matrix:
[[94  7  4]
 [19 51  6]
 [ 4  8  9]]


### Doc2Vec Model Training and Classification using SVM with rbf kernel

#### Tag Documents for Doc2Vec
- Tag documents with indices for training.

#### Initialize and Train Doc2Vec Models
- Initialize and train two Doc2Vec models: Distributed Memory (DM) and Distributed Bag of Words (DBOW).

#### Transform Documents into Vectors
- Transform documents into vectors using both trained Doc2Vec models.

#### Concatenate Vectors
- Concatenate the vectors from both models.

#### Split Data and Train SVM Model
- Split the data into training and testing sets.
- Initialize and train a SVM model with rbf kernel.

#### Make Predictions and Evaluate Model
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.

#### Results
- Print the accuracy, classification report, and confusion matrix.


In [73]:
# Tag documents for Doc2Vec
tagged_documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df['Cleaned_Requirement'])]

# Initialize and train the Doc2Vec model using Distributed Memory (DM)
model_dm = Doc2Vec(vector_size=20, window=4, min_count=1, workers=4, epochs=100, dm=1)
model_dm.build_vocab(tagged_documents)
model_dm.train(tagged_documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs)

# Initialize and train the Doc2Vec model using Distributed Bag of Words (DBOW)
model_dbow = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100, dm=0)
model_dbow.build_vocab(tagged_documents)
model_dbow.train(tagged_documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

# Transform documents into vectors using both trained Doc2Vec models
document_vectors_dm = [model_dm.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]
document_vectors_dbow = [model_dbow.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]

# Concatenate the vectors from both models
document_vectors_combined = [list(dm_vec) + list(dbow_vec) for dm_vec, dbow_vec in zip(document_vectors_dm, document_vectors_dbow)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(document_vectors_combined, df['Type'], test_size=0.3, random_state= 10,stratify=df['Type'])

# Initialize and train the logistic regression model
classifier = SVC(kernel='rbf',class_weight='balanced')
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Combined DM and DBOW Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Combined DM and DBOW Accuracy: 82.84%
Accuracy: 0.8283828382838284
Classification Report:
              precision    recall  f1-score   support

           F       0.89      0.85      0.87       158
         NFR       0.78      0.82      0.80       114
          UX       0.70      0.74      0.72        31

    accuracy                           0.83       303
   macro avg       0.79      0.80      0.80       303
weighted avg       0.83      0.83      0.83       303

Confusion Matrix:
[[134  20   4]
 [ 14  94   6]
 [  2   6  23]]


### SVM Model Training and Classification with Doc2Vec and Oversampling

##### Split Data and Perform Oversampling
- Split the data into training and testing sets.
- Perform oversampling using SMOTE to balance the data.

##### Train SVM Model with Balanced Data
- Initialize and train the SVM model with the balanced data.

##### Make Predictions and Evaluate Model
- Make predictions on the test set.
- Evaluate the model's performance using accuracy, classification report, and confusion matrix.

##### Results
- Print the class distribution after SMOTE oversampling, accuracy, classification report, and confusion matrix.


In [56]:
# Tag documents for Doc2Vec
tagged_documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(df['Cleaned_Requirement'])]

# Initialize and train the Doc2Vec model using Distributed Memory (DM)
model_dm = Doc2Vec(vector_size=20, window=4, min_count=1, workers=4, epochs=100, dm=1)
model_dm.build_vocab(tagged_documents)
model_dm.train(tagged_documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs)

# Initialize and train the Doc2Vec model using Distributed Bag of Words (DBOW)
model_dbow = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100, dm=0)
model_dbow.build_vocab(tagged_documents)
model_dbow.train(tagged_documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

# Transform documents into vectors using both trained Doc2Vec models
document_vectors_dm = [model_dm.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]
document_vectors_dbow = [model_dbow.infer_vector(doc.split()) for doc in df['Cleaned_Requirement']]

# Concatenate the vectors from both models
document_vectors_combined = [list(dm_vec) + list(dbow_vec) for dm_vec, dbow_vec in zip(document_vectors_dm, document_vectors_dbow)]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(document_vectors_combined, df['Type'], test_size=0.3,stratify=df['Type'])


# Instantiate the SMOTE
smote = SMOTE(random_state=42)
# Instantiate Random sampling
# ros = RandomOverSampler(random_state=42)

# Perform SMOTE oversampling
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution after oversampling
print("Class distribution after SMOTE oversampling:", Counter(y_resampled))

# Initialize and train the logistic regression model
classifier = SVC(kernel='rbf',class_weight='balanced')
classifier.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Combined DM and DBOW Accuracy: {accuracy * 100:.2f}%')
report = classification_report(y_test, y_pred, target_names=['F', 'NFR', 'UX'])
conf_matrix = confusion_matrix(y_test, y_pred, labels=["F", "NFR", "UX"])

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)

Class distribution after SMOTE oversampling: Counter({'NFR': 367, 'F': 367, 'UX': 367})
Combined DM and DBOW Accuracy: 80.20%
Accuracy: 0.801980198019802
Classification Report:
              precision    recall  f1-score   support

           F       0.88      0.82      0.85       158
         NFR       0.77      0.82      0.79       114
          UX       0.59      0.65      0.62        31

    accuracy                           0.80       303
   macro avg       0.75      0.76      0.75       303
weighted avg       0.81      0.80      0.80       303

Confusion Matrix:
[[130  20   8]
 [ 15  93   6]
 [  3   8  20]]
