In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the requirements dataset (path is relative to the notebook's working directory).
df = pd.read_csv('software_requirements_extended.csv')

In [None]:
# Preview the first rows to confirm the CSV parsed as expected.
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

**Handling missing values**

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Missing values per column as a percentage of all rows.
df.isnull().sum() / len(df) * 100

In [None]:
# Count rows that are exact duplicates across every column.
# NOTE(review): the de-duplication step that follows keys on 'Requirement' only,
# so it can remove more rows than this count suggests.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each distinct 'Requirement' text.
df = df.drop_duplicates(subset=['Requirement'])


In [None]:
# Rebuild a contiguous 0..n-1 index after rows were dropped; the old index is discarded.
df = df.reset_index(drop=True)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources used by the cleaning cells: the stop-word list,
# the Punkt tokenizer data ("punkt" and "punkt_tab") and WordNet for lemmatisation.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

**Text Cleaning**

In [None]:
# Lowercasing: create 'clean_text' as the lower-cased copy of the raw requirement.
df['clean_text'] = df['Requirement'].str.lower()
print("After Lowercasing")
print(df[['Requirement','clean_text']], "\n")

In [None]:
# Remove punctuation, numbers and special characters (anything that is not a
# letter or whitespace).
# The vectorised .str accessor replaces apply(re.sub): a missing requirement
# (NaN after lower-casing) becomes an empty string instead of raising
# TypeError inside re.sub.
df['clean_text'] = (
    df['clean_text']
    .fillna('')
    .astype(str)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)
print("After Removing Punctuation & Numbers")
print(df[['Requirement','clean_text']], "\n")

In [None]:
# Tokenization: split every cleaned sentence into word tokens with NLTK.
df['tokens'] = df['clean_text'].map(word_tokenize)
print("After Tokenization")
print(df.loc[:, ['Requirement', 'tokens']], "\n")

In [None]:
# Stopword removal: drop English stop words from every token list.
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].map(
    lambda token_list: [token for token in token_list if token not in stop_words]
)
print("After Stopword Removal")
print(df.loc[:, ['Requirement', 'tokens']], "\n")


In [None]:
# Lemmatization: reduce each token to its WordNet lemma (default part of speech).
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].map(
    lambda token_list: [lemmatizer.lemmatize(token) for token in token_list]
)
print("After Lemmatization")
print(df.loc[:, ['Requirement', 'tokens']], "\n")


In [None]:
# Join the processed tokens back into one space-separated string per requirement.
df['clean_text'] = df['tokens'].map(" ".join)
print("Final Cleaned Text")
print(df.loc[:, ['Requirement', 'clean_text']], "\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Number of requirements per original 'Type' value.
df['Type'].value_counts()

In [None]:
# Keep the fully preprocessed text (punctuation removal, stop-word filtering,
# lemmatisation) built by the text-cleaning cells. Rebuilding the column from the
# raw 'Requirement' text here would silently discard all of that work, so only
# the dtype is normalised.
df["clean_text"] = df["clean_text"].astype(str)


In [None]:
# Make sure the text column holds plain strings.
df['clean_text'] = df['clean_text'].astype(str)

# Binary target: 1 = functional ('FR' or 'F'), 0 = every other requirement type.
functional_types = ['FR', 'F']
df['label'] = df['Type'].isin(functional_types).astype('int64')

print(df[['Type', 'label', 'clean_text']].head())

In [None]:
from sklearn.preprocessing import LabelEncoder

# The binary 'label' column created earlier is the target used from here on.
# A LabelEncoder fitted on "Requirement" was deliberately left out of this cell:
# le = LabelEncoder(); df["label"] = le.fit_transform(df["Requirement"]) would overwrite that binary label.

In [None]:
# Do not rebuild 'clean_text' from the raw 'Requirement' column: that would throw
# away the punctuation / stop-word / lemmatisation cleaning done earlier in the
# notebook. Only make sure the column holds strings.
df["clean_text"] = df["clean_text"].astype(str)


In [None]:
# Drop rows that lack the cleaned text, the raw requirement or the binary label.
df = df.dropna(subset=["clean_text", "Requirement", "label"])


In [None]:
# Seaborn count plot of the binary target (1 = functional, 0 = non-functional).
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="label", palette="coolwarm", ax=ax)
ax.set_title("Class Distribution (Functional=1, Non-Functional=0)")
plt.show()

In [None]:
#matplotlib
import matplotlib.pyplot as plt

# Bar chart of the number of rows per binary label, drawn through pandas plotting.
label_counts = df["label"].value_counts()
plt.figure(figsize=(8, 5))
ax = label_counts.plot.bar(color="skyblue")
ax.set_title("Class Distribution")
ax.set_xlabel("Labels")
ax.set_ylabel("Count")
plt.show()


**Count Plot**

In [None]:
# Count plot of the original requirement types.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Type', palette="Set2", ax=ax)
ax.set_title("Distribution of Requirement Types (FR vs NFR)")
ax.set_xlabel("Requirement Type")
ax.set_ylabel("Count")
plt.show()


**Histogram**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Measure length on the cleaned text when it exists, otherwise on the raw requirement.
if 'clean_text' in df.columns:
    col = 'clean_text'
else:
    col = 'Requirement'

# Words per row; astype(str) keeps a missing value from breaking the split.
df['text_length'] = df[col].astype(str).str.split().str.len()

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['text_length'], bins=30, kde=True, color="purple", ax=ax)
ax.set_title(f"Sentence Length Distribution ({col})")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
#Most Frequent Words
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Lower-case every requirement and split on whitespace into one flat word list.
all_words = ' '.join(df['Requirement'].astype(str)).lower().split()

# Discard English stop words.
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda word: word not in stop_words, all_words))

# The 15 most frequent remaining words together with their counts.
word_frequencies = Counter(filtered_words)
common_words = word_frequencies.most_common(15)


In [None]:
# Horizontal bar chart of the most frequent words (word on y, count on x).
words, counts = zip(*common_words)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=list(counts), y=list(words), palette="mako", ax=ax)
ax.set_title("Top 15 Most Frequent Words in Requirements")
ax.set_xlabel("Count")
ax.set_ylabel("Word")
plt.show()

In [None]:
# Word cloud over every stop-word-filtered word in the corpus.
wordcloud = WordCloud(width=800, height=400, background_color="white")
wordcloud.generate(" ".join(filtered_words))

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud of Software Requirements")
plt.show()


In [None]:
# Concatenate the cleaned text of each class (label 1 = functional, 0 = non-functional).
func_text = " ".join(df.loc[df["label"] == 1, "clean_text"])
non_func_text = " ".join(df.loc[df["label"] == 0, "clean_text"])

# Word cloud for the functional requirements.
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate(func_text)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud - Functional Requirements")
plt.show()

In [None]:
# Word cloud for the non-functional requirements.
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate(non_func_text)
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("WordCloud - Non-Functional Requirements")
plt.show()

In [None]:
# Interquartile range of the word counts.
Q1, Q3 = df['text_length'].quantile(0.25), df['text_length'].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when its length lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (df['text_length'] < lower_fence) | (df['text_length'] > upper_fence)

# Independent copy without the outlier rows; df itself is left untouched.
df_no_outliers = df.loc[~outliers].copy()

print("Original Data shape:", df.shape)
print("Data without Outliers shape:", df_no_outliers.shape)

In [None]:
# Normalization
from sklearn.preprocessing import MinMaxScaler
# Min-max scale text_length, fitted on the outlier-free rows only.
scaler = MinMaxScaler()
length_column = df_no_outliers[['text_length']]
df_no_outliers['text_length_normalized'] = scaler.fit_transform(length_column)

print(df_no_outliers)

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler

# Standardise text_length, fitted on the outlier-free rows only.
scaler = StandardScaler()
length_column = df_no_outliers[['text_length']]
df_no_outliers['text_length_scaled'] = scaler.fit_transform(length_column)

print(df_no_outliers)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Feature engineering: a small TF-IDF representation capped at 20 terms.
vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
vectorizer.fit(df['clean_text'])
X_tfidf = vectorizer.transform(df['clean_text'])
feature_names = vectorizer.get_feature_names_out()

print("TF-IDF Features:\n", feature_names)
print("\nTF-IDF Matrix:\n", X_tfidf.toarray())

In [None]:
# Extra hand-made features: words and characters per cleaned requirement.
df["word_count"] = df["clean_text"].str.split().str.len()
df["char_count"] = df["clean_text"].str.len()

print(df[["clean_text", "word_count", "char_count"]].head())


In [None]:
# Feature selection: keep the 10 TF-IDF terms with the highest chi-squared score
# against the original 'Type' column.
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X_tfidf, df['Type'])
X_selected = selector.transform(X_tfidf)

support_mask = selector.get_support()
selected_features = feature_names[support_mask]
print("\nSelected Features (Chi2):", selected_features)

In [None]:
# Fit a random forest on the TF-IDF terms to rank them by impurity-based importance.
model = RandomForestClassifier(random_state=42)
model.fit(X_tfidf, df['Type'])

importances = model.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)

print("\nRandomForest Feature Importances:\n", importance_df)

In [None]:
# Features + labels: cleaned text as input and the original 'Type' column
# (not the binary 'label') as the target.
X = df["clean_text"]
y = df["Type"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Bag of Words.
# Split the text first so the vocabulary is learned from the training rows only;
# fitting CountVectorizer on the full corpus leaks test-set vocabulary into training.
text_train_bow, text_test_bow, y_train_bow, y_test_bow = train_test_split(
    df['clean_text'], df['Type'], test_size=0.3, random_state=42
)

bow = CountVectorizer(stop_words="english")
X_train_bow = bow.fit_transform(text_train_bow)  # sparse matrix
X_test_bow = bow.transform(text_test_bow)

# Whole corpus encoded with the training vocabulary, kept for inspection.
X_bow = bow.transform(df['clean_text'])

# Optional inspection of the dense matrix:
# X_bow_df = pd.DataFrame(X_bow.toarray(), columns=bow.get_feature_names_out())
# print(X_bow_df.head())

print("Bag of Words features created and data split.")

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the current TF-IDF matrix against the binary label.
split_parts = train_test_split(X_tfidf, df["label"], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Larger TF-IDF space: up to 30000 uni/bi/tri-grams with sublinear term frequency.
tfidf_params = dict(
    max_features=30000,
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_params)

X_tfidf = tfidf.fit_transform(df["clean_text"])

# Dense copy as a DataFrame, for inspection only.
tfidf_columns = tfidf.get_feature_names_out()
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_columns)

print(X_tfidf_df.head())


In [None]:
# Step 1: drop rows missing either the text or its type, so X and y stay aligned.
df = df.dropna(subset=["clean_text", "Type"])

# Step 2: build the features and the binary labels (1 = functional type, else 0).
X = df["clean_text"]
y_binary = df["Type"].isin(functional_types).astype("int64").to_numpy()

print("Length of X:", len(X))
print("Length of y_binary:", len(y_binary))

# Step 3: stratified 70/30 train-test split on the raw text.
split_parts = train_test_split(
    X, y_binary, test_size=0.3, random_state=42, stratify=y_binary
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Inspect the held-out texts.
X_test

In [None]:
# Inspect the training texts.
X_train

In [None]:
# Inspect the held-out labels.
y_test

In [None]:
# Inspect the training labels.
y_train

**ML Model**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd

# 1. Hold out the test rows BEFORE vectorising. Stratifying keeps the
#    functional / non-functional ratio the same in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=0, stratify=df["label"]
)

# 2. Learn the TF-IDF vocabulary and IDF weights from the training rows only
#    (unigrams + bigrams). Fitting on the full corpus would leak test-set
#    statistics into the features and inflate the scores.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus encoded with the training vocabulary; kept because later cells
# read X_tfidf.
X_tfidf = tfidf.transform(df["clean_text"])

# 3. Train Logistic Regression model
lr_model = LogisticRegression(C=10, penalty="l2", solver="lbfgs", max_iter=2000)
lr_model.fit(X_train, y_train)

# 4. Predictions
y_pred_lr = lr_model.predict(X_test)

# 5. Evaluation metrics (weighted averages over both classes)
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr, average='weighted')
rec_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_lr = f1_score(y_test, y_pred_lr, average='weighted')
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Aliases under the *_log names that the comparison table and radar chart read.
y_pred_log = y_pred_lr
acc_log = acc_lr
prec_log = prec_lr
rec_log = rec_lr
f1_log = f1_lr

# 6. Display Results
print("Logistic Regression (Optimized) Performance:")
print(f" Accuracy  : {acc_lr:.4f}")
print(f" Precision : {prec_lr:.4f}")
print(f" Recall    : {rec_lr:.4f}")
print(f"F1-Score  : {f1_lr:.4f}")
print(" Confusion Matrix:\n", cm_lr)


In [None]:
#Cross-Validation (Example with Logistic Regression)
from sklearn.model_selection import cross_val_score
import numpy as np

# Vectorise inside a pipeline so that every CV fold fits TF-IDF on its own
# training part only. Scoring a matrix that was vectorised on all rows lets each
# fold see vocabulary / IDF statistics from its validation rows.
from sklearn.pipeline import make_pipeline

log_reg_cv = LogisticRegression(max_iter=1000)
cv_pipeline = make_pipeline(
    TfidfVectorizer(max_features=10000, ngram_range=(1, 2)),
    log_reg_cv,
)
scores = cross_val_score(cv_pipeline, df["clean_text"], df["label"], cv=5, scoring="accuracy")

print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out the test rows first (stratified on the label), then fit TF-IDF
# (unigrams + bigrams) on the training rows only, so no test-set vocabulary or
# IDF statistics leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus in the training vocabulary; kept so the X_tfidf name stays defined.
X_tfidf = tfidf.transform(df["clean_text"])

# Naive Bayes with tuned alpha
nb_model = MultinomialNB(alpha=0.3)  # try 0.1, 0.3, 0.5
nb_model.fit(X_train, y_train)

# Prediction
y_pred_nb = nb_model.predict(X_test)

# Evaluation (weighted averages over both classes)
print("\nNaive Bayes Performance:")
acc_nb = accuracy_score(y_test, y_pred_nb)
prec_nb = precision_score(y_test, y_pred_nb, average='weighted')
rec_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

print("Accuracy:", round(acc_nb, 4))
print("Precision:", round(prec_nb, 4))
print("Recall:", round(rec_nb, 4))
print("F1 Score:", round(f1_nb, 4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out the test rows first (stratified on the label), then fit TF-IDF on the
# training rows only, so no test-set vocabulary or IDF statistics leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus in the training vocabulary; kept so the X_tfidf name stays defined.
X_tfidf = tfidf.transform(df["clean_text"])

# Random Forest model (fixed settings, no hyperparameter grid)
rf_model = RandomForestClassifier(
    n_estimators=300,     # number of trees
    max_depth=40,         # maximum depth of each tree
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Prediction
y_pred_rf = rf_model.predict(X_test)

# Evaluation (weighted averages over both classes)
print("\nRandom Forest Performance :")
acc_rf = accuracy_score(y_test, y_pred_rf)
prec_rf = precision_score(y_test, y_pred_rf, average='weighted')
rec_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("Accuracy:", round(acc_rf, 4))
print("Precision:", round(prec_rf, 4))
print("Recall:", round(rec_rf, 4))
print("F1 Score:", round(f1_rf, 4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hold out the test rows first (stratified on the label), then fit TF-IDF on the
# training rows only, so the held-out rows never influence the features.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus in the training vocabulary; kept so the X_tfidf name stays defined.
X_tfidf = tfidf.transform(df["clean_text"])

# Hyperparameter tuning
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5]
}

# A fixed random_state makes the search repeatable: without it the forests differ
# on every run and the "best" parameters can change between executions.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3,
                       scoring="accuracy", n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("Best Parameters:", grid_rf.best_params_)
print("Best Accuracy:", grid_rf.best_score_)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out the test rows first (stratified on the label), then fit TF-IDF
# (unigrams + bigrams) on the training rows only, so no test-set vocabulary or
# IDF statistics leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=0, stratify=df["label"]
)

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus in the training vocabulary; kept so the X_tfidf name stays defined.
X_tfidf = tfidf.transform(df["clean_text"])

# Model
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# Prediction
y_pred_svm = svm_model.predict(X_test)

# Evaluation (weighted averages over both classes)
print("\nSVM (LinearSVC with TF-IDF) Performance:")
acc_svm = accuracy_score(y_test, y_pred_svm)
prec_svm = precision_score(y_test, y_pred_svm, average='weighted')
rec_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

print("Accuracy:", round(acc_svm, 4))
print("Precision:", round(prec_svm, 4))
print("Recall:", round(rec_svm, 4))
print("F1 Score:", round(f1_svm, 4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode labels ('label' is already 0/1; the encoder keeps the integer contract
# XGBoost expects explicit).
le = LabelEncoder()
y_enc = le.fit_transform(df["label"])

# Hold out the test rows first (stratified on the label), then fit TF-IDF
# (unigrams + bigrams) on the training rows only, so no test-set vocabulary or
# IDF statistics leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y_enc, test_size=0.2, random_state=0, stratify=y_enc
)

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Whole corpus in the training vocabulary; kept so the X_tfidf name stays defined.
X_tfidf = tfidf.transform(df["clean_text"])

# Model (simple setup, no heavy tuning).
# "logloss" is the binary log-loss; "mlogloss" is the multi-class variant and does
# not match this two-class target. The deprecated use_label_encoder flag is dropped.
xgb_model = XGBClassifier(eval_metric="logloss", random_state=42)
xgb_model.fit(X_train, y_train)

# Prediction
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation (weighted averages over both classes)
print("\nXGBoost Performance with TF-IDF:")
acc_xgb = accuracy_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
rec_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')

print("Accuracy:", round(acc_xgb, 4))
print("Precision:", round(prec_xgb, 4))
print("Recall:", round(rec_xgb, 4))
print("F1 Score:", round(f1_xgb, 4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


In [None]:
#Model Comparison Table
import pandas as pd

# One row per classical model: (name, accuracy, precision, recall, F1).
model_rows = [
    ("Logistic Regression", acc_log, prec_log, rec_log, f1_log),
    ("Random Forest", acc_rf, prec_rf, rec_rf, f1_rf),
    ("Naive Bayes", acc_nb, prec_nb, rec_nb, f1_nb),
    ("SVM", acc_svm, prec_svm, rec_svm, f1_svm),
    ("XGBoost", acc_xgb, prec_xgb, rec_xgb, f1_xgb),
]
column_names = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]

# Pivot the rows into the column -> values mapping the DataFrame is built from.
results = {
    column: [row[position] for row in model_rows]
    for position, column in enumerate(column_names)
}

results_df = pd.DataFrame(results)
print(results_df)


**DL Model**

In [None]:
# Libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical # Import to_categorical
import numpy as np

# Tokenization & padding settings: vocabulary size and fixed sequence length.
max_words, max_len = 2000, 50

# Fit the Keras tokenizer on the cleaned text and map each text to integer ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Pad / truncate every sequence to max_len.
X = pad_sequences(sequences, maxlen=max_len)

# Target: the binary 'label' column (0 or 1) as a 1-D array. No one-hot encoding
# is needed for sparse_categorical_crossentropy or for a sigmoid output with
# binary_crossentropy.
y = df['label'].to_numpy()

In [None]:
# Library
from sklearn.model_selection import train_test_split
import numpy as np

# Binary 'label' column as the split target.
y = df['label'].to_numpy()

# Integer view of the labels, used only to report the class balance.
# If 'label' could ever hold values other than 0 and 1, filter them out first.
binary_labels = y
stratify_labels = binary_labels.astype(int)

# Report the class counts before splitting.
unique_labels, counts = np.unique(stratify_labels, return_counts=True)
class_counts = dict(zip(unique_labels, counts))
print("Class counts for stratification:", class_counts)

# Stratified 70/30 split of the padded sequences.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = split_parts

print("X_train_lstm shape:", X_train_lstm.shape)
print("y_train_lstm shape:", y_train_lstm.shape)

In [None]:
# Libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# LSTM model with one sigmoid output unit (binary classification).
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

lstm_model.compile(
    optimizer=Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Encode labels first. The encoder also supplies le.classes_, whose length sizes
# the softmax output layer of the LSTM defined further down.
le = LabelEncoder()
y_encoded = le.fit_transform(df["label"])

# Convert text to padded integer sequences with the tokenizer fitted earlier.
sequences = tokenizer.texts_to_sequences(df["clean_text"])
X_seq = pad_sequences(sequences, maxlen=max_len)

In [None]:
# Train-test split (80/20). Stratify on the encoded label so both partitions
# keep the same class ratio, consistent with the other deep-learning splits in
# this notebook.
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_seq, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("X_train_lstm shape:", X_train_lstm.shape)
print("y_train_lstm shape:", y_train_lstm.shape)

In [None]:
# LSTM model with a softmax output: one unit per class found by the label encoder.
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64),
    Dense(32, activation="relu"),
    Dense(len(le.classes_), activation="softmax"),
])

# Integer class labels pair with sparse categorical cross-entropy.
lstm_model.compile(
    optimizer=Adam(0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the LSTM; validation_split=0.1 holds out 10% of the training rows for
# per-epoch validation.
history = lstm_model.fit(
    X_train_lstm, y_train_lstm,
    epochs=10,
    batch_size=32,
    validation_split=0.1
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

# Class probabilities for the held-out sequences.
y_pred_lstm_proba = lstm_model.predict(X_test_lstm)

# Softmax output (several columns) -> arg-max class; sigmoid output (one column)
# -> threshold at 0.5.
is_softmax_output = y_pred_lstm_proba.shape[-1] > 1
y_pred_lstm = (
    np.argmax(y_pred_lstm_proba, axis=1)
    if is_softmax_output
    else (y_pred_lstm_proba > 0.5).astype(int)
)

# Collapse one-hot test labels to class indices; 1-D labels are used as they are.
labels_are_one_hot = len(y_test_lstm.shape) > 1 and y_test_lstm.shape[-1] > 1
y_test_lstm_binary = np.argmax(y_test_lstm, axis=1) if labels_are_one_hot else y_test_lstm

# Evaluation (weighted averages over both classes)
acc = accuracy_score(y_test_lstm_binary, y_pred_lstm)
prec = precision_score(y_test_lstm_binary, y_pred_lstm, average='weighted', zero_division=0)
rec = recall_score(y_test_lstm_binary, y_pred_lstm, average='weighted')
f1 = f1_score(y_test_lstm_binary, y_pred_lstm, average='weighted')
cm = confusion_matrix(y_test_lstm_binary, y_pred_lstm)

print("\nLSTM Model Performance:")
for metric_caption, metric_value in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1 Score:", f1),
):
    print(metric_caption, round(metric_value, 4))
print("Confusion Matrix:\n", cm)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling, and
# therefore the reported scores, are repeatable from run to run.
np.random.seed(0)
tf.random.set_seed(0)

# Step 1: Prepare data
texts = df["clean_text"].astype(str).tolist()

# Encode labels
le = LabelEncoder()
labels = le.fit_transform(df["label"])

# Tokenize (words outside the vocabulary map to the "<OOV>" token)
max_words = 10000
max_len = 100  # you can tune this
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=0, stratify=labels
)

# Step 2: Build BiLSTM model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
# One softmax unit per class found by the label encoder.
model.add(Dense(len(le.classes_), activation="softmax"))

# Compile (integer labels -> sparse categorical cross-entropy)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Step 3: Train
history = model.fit(
    X_train, y_train,
    epochs=10,          # you can try 15-20 for better accuracy
    batch_size=32,
    validation_split=0.1,
    verbose=2
)

# Step 4: Evaluate on the held-out split
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nBiLSTM Accuracy: {accuracy:.4f}")


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predict class probabilities and take the most likely class per row.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Evaluation (weighted averages over both classes)
weighted = dict(average='weighted')
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted)
rec = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
cm = confusion_matrix(y_test, y_pred)

print("\nBiLSTM Performance:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Plain feed-forward network with a sigmoid output, trained on X_train as-is.
# NOTE(review): X_train here appears to be the padded token-id matrix from the
# BiLSTM cell, so the Dense layers see raw integer ids rather than learned
# embeddings - confirm this is intended. This assignment also replaces the BiLSTM
# previously bound to `model`.
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train.shape[1]),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=2)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# 1-D CNN text classifier: embedding -> convolution -> global max pool -> dense
# -> sigmoid output for binary classification.
cnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    Conv1D(128, 5, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),
])

cnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Step 1: sigmoid probabilities for the held-out rows.
y_pred_prob = cnn_model.predict(X_test)

# Step 2: threshold at 0.5 to get 0/1 class labels.
y_pred = (y_pred_prob > 0.5).astype(int)

# Step 3: metrics (weighted averages over both classes).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

# Step 4: report.
print("CNN Model Performance:")
for metric_caption, metric_value in (
    ("Accuracy :", accuracy),
    ("Precision:", precision),
    ("Recall   :", recall),
    ("F1 Score :", f1),
):
    print(metric_caption, round(metric_value, 4))
print("Confusion Matrix:\n", cm)


In [None]:
import pandas as pd

# Deep-learning scores as (accuracy, precision, recall, F1).
# NOTE(review): these numbers are typed in by hand, not read from the evaluation
# cells, so they will not follow a re-run of the models.
lstm_scores = (0.8827, 0.8834, 0.8827, 0.8828)
bilstm_scores = (0.8622, 0.8628, 0.8622, 0.8623)
cnn_scores = (0.8418, 0.8472, 0.8418, 0.8419)

acc_lstm, prec_lstm, rec_lstm, f1_lstm = lstm_scores
acc_bilstm, prec_bilstm, rec_bilstm, f1_bilstm = bilstm_scores
acc_cnn, prec_cnn, rec_cnn, f1_cnn = cnn_scores

# Comparison table, rows in the order LSTM -> BiLSTM -> CNN.
score_rows = (lstm_scores, bilstm_scores, cnn_scores)
dl_results = {"Model": ["LSTM", "BiLSTM", "CNN"]}
for position, column in enumerate(["Accuracy", "Precision", "Recall", "F1-Score"]):
    dl_results[column] = [row[position] for row in score_rows]

dl_results_df = pd.DataFrame(dl_results)
print("\nDeep Learning Model Comparison Table:")
print(dl_results_df)


In [None]:
from sklearn.metrics import roc_curve, auc, RocCurveDisplay, confusion_matrix

In [None]:
#Radar Chart (Performance Metrics: Accuracy, Precision, Recall, F1)
def plot_radar(model_name, accuracy, precision, recall, f1):
    """Draw a radar (polar) chart of four scores for one model and show it.

    Args:
        model_name: Text used for the legend entry and the chart title.
        accuracy, precision, recall, f1: The four values to plot; the radial
            axis is fixed to the range 0..1.

    Returns:
        None. A new 6x6 figure is created and displayed with plt.show().
    """
    axis_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    scores = [accuracy, precision, recall, f1]

    # One spoke per metric, evenly spaced around the circle.
    spoke_angles = np.linspace(0, 2 * np.pi, len(axis_names), endpoint=False).tolist()

    # Repeat the first point at the end so the outline closes on itself.
    closed_scores = scores + scores[:1]
    closed_angles = spoke_angles + spoke_angles[:1]

    fig, ax = plt.subplots(figsize=(6,6), subplot_kw=dict(polar=True))
    ax.plot(closed_angles, closed_scores, 'o-', linewidth=2, label=model_name)
    ax.fill(closed_angles, closed_scores, alpha=0.25)
    ax.set_thetagrids(np.degrees(spoke_angles), axis_names)
    ax.set_ylim(0,1)
    plt.title(f'{model_name} Performance Radar Chart')
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Plot the Logistic Regression scores that were measured on the held-out split
# (acc_log / prec_log / rec_log / f1_log) instead of hand-typed placeholder
# numbers, so the chart always matches the model that was actually trained.
plot_radar("Logistic Regression", acc_log, prec_log, rec_log, f1_log)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Model names and their scores, read from the variables computed earlier.
models = ["Logistic Regression", "Random Forest", "Naive Bayes", "SVM", "XGBoost", "LSTM", "BiLSTM", "CNN"]

accuracy  = [acc_log, acc_rf, acc_nb, acc_svm, acc_xgb, acc_lstm, acc_bilstm, acc_cnn]
precision = [prec_log, prec_rf, prec_nb, prec_svm, prec_xgb, prec_lstm, prec_bilstm, prec_cnn]
recall    = [rec_log, rec_rf, rec_nb, rec_svm, rec_xgb, rec_lstm, rec_bilstm, rec_cnn]
f1        = [f1_log, f1_rf, f1_nb, f1_svm, f1_xgb, f1_lstm, f1_bilstm, f1_cnn]

# One list per metric, in the same order as metric_labels.
metrics = [accuracy, precision, recall, f1]
metric_labels = ["Accuracy", "Precision", "Recall", "F1 Score"]

# One spoke per metric; the first angle is repeated so each outline closes.
angles = np.linspace(0, 2 * np.pi, len(metric_labels), endpoint=False).tolist()
angles += angles[:1]

# Radar chart
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# The loop variable is called model_name (not model) so that it does not overwrite
# the trained Keras network bound to `model` by the deep-learning cells.
for i, model_name in enumerate(models):
    values = [m[i] for m in metrics]

    # Skip models with missing results
    if None in values:
        continue

    values += values[:1]
    ax.plot(angles, values, label=model_name)
    ax.fill(angles, values, alpha=0.1)

# Label each spoke with its metric name.
ax.set_xticks(angles[:-1])
ax.set_xticklabels(metric_labels)

plt.title("Model Performance Comparison (Auto-Updated Radar Chart)", size=14)
plt.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
plt.show()


In [None]:
#ROC Curve (Binary Classification)
from sklearn.metrics import roc_curve, auc

def plot_roc(y_true, y_scores, model_name):
    """Plot the ROC curve for one binary classifier and show it.

    Args:
        y_true: True binary labels, passed straight to roc_curve.
        y_scores: Scores for the positive class, passed straight to roc_curve.
        model_name: Text appended to the chart title.

    Returns:
        None. A new 6x6 figure is created and displayed with plt.show().
    """
    false_pos_rate, true_pos_rate, _cutoffs = roc_curve(y_true, y_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = f'ROC curve (area = {area_under_curve:.2f})'

    plt.figure(figsize=(6,6))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc='lower right')
    plt.show()


In [None]:
# The logistic-regression ROC needs TF-IDF features (not the LSTM test set), so the
# vectoriser and the train/test split are rebuilt here before scoring.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df["clean_text"])

X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_tfidf,
    df["label"],
    test_size=0.2,
    random_state=0,
)

# Refit on this split so the model and the test matrix share one feature space.
lr_model.fit(X_train_lr, y_train_lr)

# Column 1 of predict_proba is used as the positive-class score.
y_true_lr_roc_binary = y_test_lr
y_scores_lr = lr_model.predict_proba(X_test_lr)[:, 1]

fpr, tpr, thresholds = roc_curve(y_true_lr_roc_binary, y_scores_lr)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='blue',
    lw=2,
    label=f'ROC curve (AUC = {roc_auc:.4f})',
)
# Dashed diagonal: the reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend(loc='lower right')
plt.show()


In [None]:
# ROC curve for the LSTM model.
# Step 1: reduce the test labels to a flat 0/1 vector, whatever encoding they arrive in.
_lstm_labels = np.asarray(y_test_lstm)
if _lstm_labels.ndim > 1 and _lstm_labels.shape[-1] > 1:
    # One-hot rows: the index of the largest entry is the class id.
    y_test_lstm_binary = np.argmax(_lstm_labels, axis=1)
elif np.all(np.isin(_lstm_labels, [0, 1])):
    # Already binary; ravel() also flattens an (n, 1) column into shape (n,).
    y_test_lstm_binary = _lstm_labels.ravel()
else:
    # Integer labels with more than two values: treat class 1 as positive and
    # every other class as negative.
    print("Attempting to convert multiclass integer labels to binary (0 or 1)...")
    y_test_lstm_binary = (_lstm_labels.ravel() == 1).astype(int)
    print("Unique values after attempted conversion:", np.unique(y_test_lstm_binary))

# Step 2: positive-class scores. A softmax head returns one column per class
# (column 1 is used as the positive class); a single-unit sigmoid head returns
# one column that already is the positive-class probability, where `[:, 1]`
# would raise IndexError.
_lstm_probs = np.asarray(lstm_model.predict(X_test_lstm))
if _lstm_probs.ndim > 1 and _lstm_probs.shape[1] > 1:
    y_scores_lstm = _lstm_probs[:, 1]
else:
    y_scores_lstm = _lstm_probs.ravel()

print("Shape of y_test_lstm_binary:", y_test_lstm_binary.shape)
print("Unique values in y_test_lstm_binary:", np.unique(y_test_lstm_binary))
print("Shape of y_scores_lstm:", y_scores_lstm.shape)
print("First 5 values of y_scores_lstm:", y_scores_lstm[:5])

plot_roc(y_test_lstm_binary, y_scores_lstm, "LSTM")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def plot_roc_curve(model, X_test, y_test, model_name, functional_types=("FR", "F")):
    """Add one model's functional-vs-rest ROC curve to the current matplotlib figure.

    Labels listed in ``functional_types`` are treated as the positive class (1),
    every other label as negative (0). The score used for the curve is taken
    from the model output that corresponds to those same labels, looked up via
    ``model.classes_``, rather than from a fixed column index, so the score and
    the binarised labels always refer to the same class.

    Args:
        model: Fitted classifier exposing ``predict_proba`` or ``decision_function``.
        X_test: Feature matrix handed to the model.
        y_test: True labels, one per row of ``X_test``.
        model_name: Legend text for this curve.
        functional_types: Label values counted as "functional" (positive).

    Raises:
        ValueError: If the number of labels and scores differ.
    """
    positive = tuple(functional_types)

    # Binarise the labels: functional = 1, everything else = 0.
    y_test_binary = pd.Series(y_test).apply(lambda x: 1 if x in positive else 0)

    # Output columns follow the order of model.classes_; find the positive ones.
    classes = list(getattr(model, "classes_", []))
    pos_cols = [i for i, c in enumerate(classes) if c in positive]

    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X_test))
        # Total probability of the positive labels; when none can be matched
        # (e.g. integer-encoded classes) fall back to the historical column 1.
        y_scores = proba[:, pos_cols].sum(axis=1) if pos_cols else proba[:, 1]
    else:
        y_scores = np.asarray(model.decision_function(X_test))
        if y_scores.ndim == 1:
            # Binary margin: positive values favour classes_[1]. Flip the sign
            # when the functional label is classes_[0] instead.
            if pos_cols == [0]:
                y_scores = -y_scores
        else:
            # One margin per class: use the strongest functional margin.
            y_scores = y_scores[:, pos_cols].max(axis=1) if pos_cols else y_scores[:, 1]

    if len(y_test_binary) != len(y_scores):
        raise ValueError("True labels and predicted scores must have the same number of samples.")

    fpr, tpr, _ = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f"{model_name} (AUC={roc_auc:.2f})")


# Fit both classifiers on the current X_train / y_train split.
# max_iter is raised for LogisticRegression so the solver has room to converge;
# the forest is seeded so the ROC comparison is reproducible across runs.
lr_model = LogisticRegression(max_iter=500).fit(X_train, y_train)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Overlay both ROC curves on one figure.
plt.figure(figsize=(8, 6))
plot_roc_curve(lr_model, X_test, y_test, "Logistic Regression")
plot_roc_curve(rf_model, X_test, y_test, "Random Forest")

# Dashed diagonal: the reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()


In [None]:
# Print the distinct values found in the bag-of-words test labels and in the
# logistic-regression predictions.
# NOTE(review): in this part of the notebook `y_test_bow` and `y_pred_lr` are
# first assigned in a later cell, so a fresh top-to-bottom run may raise
# NameError here unless an earlier cell also defines them — confirm.
print(set(y_test_bow))
print(set(y_pred_lr))


In [None]:
#Confusion Matrix
def plot_confusion(y_true, y_pred, model_name, class_names=None):
    """Show a confusion-matrix heatmap for one model.

    Args:
        y_true: Ground-truth labels, passed to ``confusion_matrix``.
        y_pred: Predicted labels, passed to ``confusion_matrix``.
        model_name: Text appended to the plot title.
        class_names: Optional tick labels, one per class. When omitted, a 2x2
            matrix keeps the 'Non-functional' / 'Functional' captions and any
            other size falls back to seaborn's automatic tick labels, so a
            multi-class matrix is never captioned with two binary names.
    """
    cm = confusion_matrix(y_true, y_pred)
    if class_names is None:
        class_names = ['Non-functional', 'Functional'] if cm.shape == (2, 2) else 'auto'

    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Turn the label column (e.g. "Functional", "Non-Functional") into integer class ids.
le = LabelEncoder()
y_encoded = le.fit_transform(df["label"])

# Hold out 20% of the bag-of-words rows for evaluation.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    X_bow,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training part and predict the held-out rows.
lr = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train_bow)
y_pred_lr = lr.predict(X_test_bow)

# Confusion matrix with the encoder's original class names on the axes.
cm = confusion_matrix(y_test_bow, y_pred_lr)
disp = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()


In [None]:
# Draw the confusion matrix for the LSTM predictions with the plot_confusion helper.
# NOTE(review): assumes y_test_lstm and y_pred_lstm are 1-D class labels (not
# one-hot rows or probabilities) — confirm against the cell that creates them.
plot_confusion(y_test_lstm, y_pred_lstm, "LSTM")

In [None]:
# Prediction pipeline: TF-IDF features feeding a logistic-regression classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# The vectoriser is fitted on every row of df["clean_text"]; the train/test
# split happens afterwards, on the already-vectorised matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"])

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

def predict_text(text, model, vectorizer, label_encoder):
    """Classify one piece of text and return its decoded label.

    Args:
        text: The raw string to classify.
        model: Object whose ``predict`` accepts the vectoriser output.
        vectorizer: Object whose ``transform`` accepts a list of strings.
        label_encoder: Object whose ``inverse_transform`` maps predictions
            back to label values.

    Returns:
        The first (and only) decoded label.
    """
    features = vectorizer.transform([text])
    predicted_ids = model.predict(features)
    decoded_labels = label_encoder.inverse_transform(predicted_ids)
    return decoded_labels[0]

# Smoke-test the prediction helper on one hand-written sentence.
# NOTE(review): the sentence is passed as-is, while the vectoriser above was
# fitted on df["clean_text"]; presumably inputs should go through the same
# cleaning steps first — confirm.
sample_text = "This is a test requirement"
print("Prediction:", predict_text(sample_text, log_reg, tfidf_vectorizer, le))


In [None]:
import pickle

# Reload the CSV and use the Requirement column, cast to str, as the modelling text.
df = pd.read_csv("software_requirements_extended.csv").assign(
    clean_text=lambda frame: frame["Requirement"].astype(str)
)

# Fit the TF-IDF vocabulary on every row and keep the resulting matrix for the model cells.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
X_tfidf = tfidf.fit_transform(df["clean_text"])

# Persist the fitted vectoriser next to the notebook.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("‚úÖ TF-IDF Vectorizer saved")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class ids from the Type column, then store them as df["label"].
le = LabelEncoder().fit(df["Type"])
df["label"] = le.transform(df["Type"])

# Persist the fitted encoder so predictions can be mapped back to Type values.
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

print("‚úÖ Label Encoder saved")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle

# 80/20 split of the TF-IDF matrix against the encoded labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training part and pickle the fitted model.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

with open("logistic_regression.pkl", "wb") as f:
    pickle.dump(log_reg, f)

print("‚úÖ Logistic Regression model saved")


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes on the current training split and pickle it.
nb = MultinomialNB().fit(X_train, y_train)

with open("naive_bayes.pkl", "wb") as f:
    pickle.dump(nb, f)

print("‚úÖ Naive Bayes model saved")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit the random forest on the current training split and pickle it.
# The forest is seeded (same seed as the train/test split) so re-running the
# notebook reproduces the same saved model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

with open("random_forest.pkl", "wb") as f:
    pickle.dump(rf, f)

print("‚úÖ Random Forest model saved")


In [None]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the current training split and pickle it.
svm = LinearSVC().fit(X_train, y_train)

with open("svm.pkl", "wb") as f:
    pickle.dump(svm, f)

print("‚úÖ SVM model saved")


In [None]:
from xgboost import XGBClassifier

# Fit gradient-boosted trees on the current training split, scored with
# multi-class log loss, and pickle the fitted model.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
).fit(X_train, y_train)

with open("xgb_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

print("‚úÖ XGBoost model saved")


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# max_words caps the tokenizer vocabulary; max_len is defined here for the
# neural-network cells that follow.
max_words, max_len = 2000, 50

# Build the word index from the requirement texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["clean_text"])

# Persist the fitted tokenizer.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("‚úÖ Tokenizer saved")


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# LSTM model
#
# The Embedding layer consumes integer word ids, so the training data must be
# padded token sequences. X_train / y_train at this point hold the TF-IDF split
# from the logistic-regression cell, which is why dedicated *_seq variables are
# built here instead of reusing them.
X_seq = pad_sequences(tokenizer.texts_to_sequences(df["clean_text"]), maxlen=max_len)
y_seq = df["label"].to_numpy()
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42
)

n_classes = len(le.classes_)

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(32, activation="relu"))
if n_classes <= 2:
    # Two classes: a single sigmoid unit with binary cross-entropy.
    lstm_model.add(Dense(1, activation="sigmoid"))
    lstm_loss = "binary_crossentropy"
else:
    # More than two encoded classes: one softmax unit per class, trained on the
    # integer labels directly.
    lstm_model.add(Dense(n_classes, activation="softmax"))
    lstm_loss = "sparse_categorical_crossentropy"

lstm_model.compile(optimizer=Adam(0.001),
                   loss=lstm_loss,
                   metrics=["accuracy"])

# Train on the sequence split; 10% of the training rows are held back for validation.
history = lstm_model.fit(
    X_train_seq, y_train_seq,
    epochs=5,
    batch_size=32,
    validation_split=0.1
)

# Save the model in the native .keras format.
lstm_model.save("lstm_model.keras")

print("‚úÖ LSTM model saved as lstm_model.keras")


In [None]:
# BiLSTM classifier: one softmax output per encoded class.
# Bidirectional and Dropout are imported here because the neighbouring Keras
# import cell only brings in Embedding, LSTM and Dense.
from tensorflow.keras.layers import Bidirectional, Dropout

bilstm_model = Sequential()
bilstm_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
bilstm_model.add(Bidirectional(LSTM(64, return_sequences=False)))
bilstm_model.add(Dropout(0.3))
bilstm_model.add(Dense(64, activation="relu"))
bilstm_model.add(Dense(len(le.classes_), activation="softmax"))

bilstm_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# This cell rebinds `bilstm_model` to a fresh network, so it has to be trained
# here; otherwise the save cell that follows would write untrained weights.
# The inputs are padded token-id sequences built with the fitted tokenizer.
X_seq_bilstm = pad_sequences(tokenizer.texts_to_sequences(df["clean_text"]), maxlen=max_len)
X_train_bilstm, X_test_bilstm, y_train_bilstm, y_test_bilstm = train_test_split(
    X_seq_bilstm, df["label"].to_numpy(), test_size=0.2, random_state=42
)
bilstm_history = bilstm_model.fit(
    X_train_bilstm, y_train_bilstm,
    epochs=5,
    batch_size=32,
    validation_split=0.1
)


In [None]:
# Write the CNN and BiLSTM networks to disk in the native Keras format.
cnn_model.save("cnn_model.keras")
print("‚úÖ CNN model saved")

bilstm_model.save("bilstm_model.keras")
print("‚úÖ BiLSTM model saved")


# List the working directory so the new files can be checked by eye.
import os
print("Files in current directory:", os.listdir())

# Send both model files to the local machine (Colab only).
from google.colab import files
for _model_file in ("cnn_model.keras", "bilstm_model.keras"):
    files.download(_model_file)


**Gen AI**

In [None]:

# STEP 0: Install dependencies
# ----------------------------
!pip install -q transformers pandas openpyxl reportlab tqdm coverage pytest sentencepiece matplotlib

# ----------------------------
# STEP 1: Imports
# ----------------------------
import pandas as pd, os, re, tempfile, subprocess, traceback
from tqdm import tqdm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4
import matplotlib.pyplot as plt
from transformers import pipeline, set_seed
from google.colab import files

# Fix the transformers random seed so generation is repeatable.
set_seed(42)

# ----------------------------
# STEP 2: Upload dataset
# ----------------------------
print("‚û°Ô∏è Upload your CSV file (software_requirements_extended.csv).")
uploaded = files.upload()
# The first uploaded file name is taken as the dataset.
CSV_FILE = list(uploaded)[0]
print("Uploaded:", CSV_FILE)

# ----------------------------
# STEP 3: Load dataset
# ----------------------------
df = pd.read_csv(CSV_FILE)

# Pick the requirement column: prefer the known names, otherwise fall back to
# the first text column in the file.
req_col = next((name for name in ("Requirement", "requirement") if name in df.columns), None)
if req_col is None:
    # Accept both classic object columns and pandas' dedicated string dtypes.
    text_cols = [
        c for c in df.columns
        if df[c].dtype == object or pd.api.types.is_string_dtype(df[c].dtype)
    ]
    if not text_cols:
        # Fail with an explanation instead of a bare IndexError on text_cols[0].
        raise ValueError(
            f"No 'Requirement' column and no text column found in {CSV_FILE!r}; "
            f"columns are: {list(df.columns)}"
        )
    req_col = text_cols[0]
print("Using requirement column:", req_col)

# Rows with no requirement text cannot be processed.
df = df.dropna(subset=[req_col])
print("Total requirements:", len(df))

# ----------------------------
# STEP 4: Load models
# ----------------------------
# Hugging Face checkpoints: one for code generation, one for test generation.
CODE_MODEL = "Salesforce/codegen-350M-multi"
TEST_MODEL = "google/flan-t5-small"

print("‚è≥ Loading models (may take 30-90s)...")
# NOTE(review): temperature is set without do_sample=True; it may have no
# effect on generation in that case — confirm.
code_generator = pipeline(
    "text-generation",
    model=CODE_MODEL,
    max_new_tokens=200,
    temperature=0.2,
)
test_generator = pipeline(
    "text2text-generation",
    model=TEST_MODEL,
    max_length=128,
    do_sample=False,
)
print("‚úÖ Models loaded.")

# ----------------------------
# STEP 5: Sanitizer helpers
# ----------------------------
import re
def sanitize_code(s):
    """Strip chat/markdown noise from model output and keep only the code part.

    Steps, in order:
      1. Remove markdown code fences (with or without a ``python`` tag).
      2. Remove http(s) URLs.
      3. Remove lines that consist only of a ``#`` comment.
      4. Cut everything from the first "explanation:", "answer:" or "output:"
         marker (case-insensitive) onwards.
      5. If a function definition is present, drop whatever precedes the first
         ``def name(``. The match is on the start of the signature only, so
         return annotations (``def f(x) -> int:``) and signatures spread over
         several lines are recognised as well.

    Args:
        s: Raw generated text, or None.

    Returns:
        The cleaned text with surrounding whitespace stripped; "" for None.
    """
    if s is None:
        return ""
    s = re.sub(r"```(?:python)?", "", s)
    s = re.sub(r"https?://\S+", "", s)
    s = re.sub(r"(^\s*#.*\n)+", "", s, flags=re.M)
    s = re.split(r"(?i)explanation:|answer:|output:", s)[0]
    first_def = re.search(r"\bdef\s+\w+\s*\(", s)
    if first_def:
        return s[first_def.start():].strip()
    return s.strip()

# ----------------------------
# STEP 6: Core generate+test
# ----------------------------
def _count_outcome(report, outcome_pattern):
    """Return the number pytest printed before an outcome word (e.g. "3 passed"), else 0."""
    found = re.search(rf"(\d+)\s+{outcome_pattern}\b", report)
    return int(found.group(1)) if found else 0


def generate_and_validate(requirement, timeout=25):
    """Generate code and a pytest test for one requirement, run them, and report.

    The code model produces an implementation, the test model produces a test
    for it, both are written to a temporary directory, and the tests are run
    under ``coverage``.

    Args:
        requirement: Requirement text to turn into code.
        timeout: Seconds allowed for the pytest run before it counts as a failure.

    Returns:
        dict with keys ``requirement``, ``generated_code``, ``generated_tests``,
        ``pytest_stdout``, ``pytest_stderr``, ``coverage`` (e.g. "85%"),
        ``passed`` and ``failed`` (test counts; collection errors count as failed).
    """
    import sys  # local import: used to run coverage with the kernel's own interpreter

    code_prompt = f"Respond ONLY with valid Python code. Requirement:\n{requirement}"
    # return_full_text=False keeps only the continuation; by default the
    # text-generation pipeline echoes the prompt at the start of the output,
    # which would otherwise end up in the generated source file.
    code_raw = code_generator(code_prompt, return_full_text=False)[0]['generated_text']
    code = sanitize_code(code_raw)
    if len(code.strip()) == 0:
        code = "# ERROR: empty generation"

    test_prompt = f"Write 1 pytest unit test function(s) for the following code. Only output the test functions:\n{code}"
    test_raw = test_generator(test_prompt)[0]['generated_text']
    tests = sanitize_code(test_raw)
    if len(tests.strip()) == 0:
        # Fallback smoke test: at least check that the generated module imports.
        tests = """
def test_module_imports():
    import generated_code
    assert True
"""

    passed, failed, coverage_pct = 0, 0, "0%"
    pytest_stdout, pytest_stderr = "", ""

    with tempfile.TemporaryDirectory() as tmpdir:
        code_path = os.path.join(tmpdir, "generated_code.py")
        test_path = os.path.join(tmpdir, "test_generated_code.py")
        with open(code_path, "w", encoding="utf-8") as f:
            f.write(code)
        with open(test_path, "w", encoding="utf-8") as f:
            f.write("import pytest\nfrom generated_code import *\n\n")
            f.write(tests)

        try:
            # "python -m coverage" targets the interpreter running this notebook
            # rather than whichever `coverage` script happens to be on PATH.
            coverage_cmd = [sys.executable, "-m", "coverage"]
            subprocess.run(coverage_cmd + ["erase"], check=False, cwd=tmpdir)
            proc = subprocess.run(
                coverage_cmd + ["run", "-m", "pytest", "-q", test_path],
                capture_output=True, text=True, cwd=tmpdir, timeout=timeout
            )
            pytest_stdout, pytest_stderr = proc.stdout, proc.stderr
            cov = subprocess.run(coverage_cmd + ["report", "-m"], capture_output=True, text=True, cwd=tmpdir)

            out = pytest_stdout + pytest_stderr
            passed = _count_outcome(out, "passed")
            # Collection/import errors (e.g. a syntax error in the generated
            # code) are reported by pytest as "N error(s)", not "N failed".
            failed = _count_outcome(out, "failed") + _count_outcome(out, "errors?")
            if proc.returncode != 0 and passed == 0 and failed == 0:
                # The run went wrong without a countable summary line.
                failed = 1

            # Require line start or a path separator before the file name so
            # "test_generated_code.py" can never be mistaken for the module row.
            match = re.search(
                r"(?:^|[\\/])generated_code\.py\s+\d+\s+\d+\s+(\d+%)",
                cov.stdout,
                flags=re.M,
            )
            if match:
                coverage_pct = match.group(1)
        except subprocess.TimeoutExpired:
            passed, failed, coverage_pct = 0, 1, "0%"
        except Exception:
            # Record the traceback as comment lines so the stored code stays
            # valid Python.
            tb = traceback.format_exc()
            code += "\n# RUNTIME EXCEPTION:\n" + "\n".join("# " + line for line in tb.splitlines())
            passed, failed, coverage_pct = 0, 1, "0%"

    return {
        "requirement": requirement,
        "generated_code": code,
        "generated_tests": tests,
        "pytest_stdout": pytest_stdout,
        "pytest_stderr": pytest_stderr,
        "coverage": coverage_pct,
        "passed": passed,
        "failed": failed
    }

# ----------------------------
# STEP 7: Run on dataset (sample or full)
# ----------------------------
SAMPLE_N = 5

# Never ask for more rows than the dataset holds (sample() raises otherwise),
# and stop early with a clear message when there is nothing to process.
sample_size = min(SAMPLE_N, len(df))
if sample_size == 0:
    raise ValueError("No requirements available to process after dropping empty rows.")

# Cast to str so slicing for the progress message works for any column dtype.
sample_reqs = df[req_col].astype(str).sample(sample_size, random_state=42).tolist()
results = []
for r in sample_reqs:
    print("Processing:", r[:80], "...")
    res = generate_and_validate(r, timeout=25)
    results.append(res)
    print(" -> done | passed:", res['passed'], "| coverage:", res['coverage'])

# One row per requirement; a requirement counts as 'Pass' when at least one test passed.
results_df = pd.DataFrame(results)
results_df['status'] = results_df['passed'].apply(lambda x: 'Pass' if x > 0 else 'Fail')
results_df.to_csv("generated_code_results.csv", index=False)
results_df.to_excel("generated_code_results.xlsx", index=False)

# ----------------------------
# STEP 8: Graphs (Optimized)
# ----------------------------
import matplotlib.pyplot as plt
import os
from google.colab import files

# Coverage arrives as strings such as "85%"; strip the sign and convert to int.
coverage_values = results_df['coverage'].str.replace('%', '', regex=False).astype(int)

# value_counts() orders by frequency, so colours are looked up per status
# instead of by position; otherwise 'Fail' is drawn green whenever it is the
# larger group.
status_counts = results_df['status'].value_counts()
status_palette = {'Pass': 'green', 'Fail': 'red'}
status_colors = [status_palette.get(status, 'gray') for status in status_counts.index]

# 1) Pass/fail bar chart
plt.figure(figsize=(6,4))
status_counts.plot(kind='bar', color=status_colors)
plt.title("Pass vs Fail Counts")
plt.xlabel("Status"); plt.ylabel("Count")
plt.tight_layout()
plt.savefig("pass_fail_bar.png")
plt.close()  # close the figure so the next chart starts clean

# 2) Coverage histogram
plt.figure(figsize=(6,4))
plt.hist(coverage_values, bins=10, edgecolor='black', color='skyblue')
plt.title("Coverage Distribution")
plt.xlabel("Coverage %"); plt.ylabel("Count")
plt.tight_layout()
plt.savefig("coverage_hist.png")
plt.close()

# 3) Pass/fail pie chart
plt.figure(figsize=(5,5))
status_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=status_colors)
plt.title("Pass/Fail Ratio")
plt.ylabel("")
plt.tight_layout()
plt.savefig("pass_fail_pie.png")
plt.close()

print("‚úÖ All graphs saved as PNG.")

# ----------------------------
# STEP 9: PDF Report (Optimized)
# ----------------------------
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4

# Paragraph text is parsed as XML-like markup, so dynamic text (file name,
# requirement sentences) is escaped before being embedded; a stray "<" or "&"
# in a requirement would otherwise break the PDF build.
from xml.sax.saxutils import escape

pdf_path = "proposal_and_results_summary.pdf"
styles = getSampleStyleSheet()
doc = SimpleDocTemplate(pdf_path, pagesize=A4)
elements = []

# Header and headline numbers.
elements.append(Paragraph("Generative AI ‚Äî Code & Test Generation Report", styles["Title"]))
elements.append(Spacer(1,12))
elements.append(Paragraph(f"Dataset: {escape(str(CSV_FILE))}", styles["Normal"]))
elements.append(Paragraph(f"Requirements processed: {len(results)}", styles["Normal"]))
elements.append(Paragraph(f"Pass Count: {results_df['passed'].sum()}", styles["Normal"]))
elements.append(Paragraph(f"Fail Count: {results_df['failed'].sum()}", styles["Normal"]))
elements.append(Spacer(1,12))

# Charts: only those that were actually written to disk are included.
elements.append(Paragraph("Visual Insights", styles["Heading2"]))
for img in ["pass_fail_bar.png", "coverage_hist.png", "pass_fail_pie.png"]:
    if os.path.exists(img):
        elements.append(Image(img, width=400, height=250))
        elements.append(Spacer(1,12))

# One line per requirement; truncate first, then escape, so an escaped
# entity is never cut in half.
elements.append(Paragraph("Per-requirement Summary:", styles["Heading2"]))
for r in results:
    requirement_preview = escape(str(r['requirement'])[:80])
    brief = f"- {requirement_preview}... | Passed: {r['passed']} | Coverage: {r['coverage']}"
    elements.append(Paragraph(brief, styles["Code"]))
elements.append(Spacer(1,12))

doc.build(elements)
print("‚úÖ PDF report generated:", pdf_path)

# ----------------------------
# STEP 10: Download Files
# ----------------------------
# Offer every artefact that exists on disk for download; missing ones are skipped.
for f in (
    "generated_code_results.csv",
    "generated_code_results.xlsx",
    pdf_path,
    "pass_fail_bar.png",
    "coverage_hist.png",
    "pass_fail_pie.png",
):
    if not os.path.exists(f):
        continue
    files.download(f)

print("‚úÖ All files ready for download!")

