In [None]:
# Install the BERTopic library for topic modeling and clustering user reviews
!pip install bertopic

In [None]:
# Import the BERTopic model 
from bertopic import BERTopic

In [None]:
# Load the user review dataset into a pandas DataFrame for further processing.
# pandas is imported here because this is the first cell that uses `pd`;
# relying on a later import cell makes a fresh top-to-bottom run fail with NameError.
import pandas as pd

Cluster_Data = pd.read_csv('/kaggle/input/mediumfull/dataset (1).csv')

# Pre-processing the dataset

In [None]:
# Download the WordNet corpus needed for lemmatization with NLTK
!python3 -m nltk.downloader wordnet
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Define a dictionary for expanding common English contractions in text.
# Keys are contractions; values are their expansions. When a contraction has
# several readings, the value lists every alternative separated by " / "
# (e.g. "he's" -> "he has / he is").
# Most keys are lowercase, but the first-person ones ("I'd", "I'll", "I'm",
# "I've") are capitalized, so any lookup must account for letter case.
contractions_dict = {
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'll": "he shall / he will",
    "he's": "he has / he is",
    "I'd": "I had / I would",
    "I'll": "I shall / I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it's": "it has / it is",
    "let's": "let us",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "she'd": "she had / she would",
    "she'll": "she shall / she will",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that has / that is",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'll": "they shall / they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'll": "we shall / we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what's": "what has / what is",
    "when's": "when has / when is",
    "where's": "where has / where is",
    "who'd": "who had / who would",
    "who'll": "who shall / who will",
    "who's": "who has / who is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you had / you would",
    "you'll": "you shall / you will",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
# Apply contraction expansion to the 'content' column to standardize text for better analysis
def expand_contractions(text, contractions_dict):
    """
    Replace contractions in ``text`` with their expansions.

    The text is split on whitespace and each word is looked up in
    ``contractions_dict`` without regard to letter case (on both the word and
    the dictionary keys), so capitalized keys such as "I'm" are matched too.
    A typographic apostrophe (U+2019) in a word is treated as a plain "'".

    Parameters
    ----------
    text : str
        Text to process. Any non-string value (e.g. NaN from a missing
        DataFrame cell) is treated as empty text.
    contractions_dict : dict[str, str]
        Maps a contraction to its expansion. A value may hold several
        alternatives separated by "/"; every alternative is inserted, in order.

    Returns
    -------
    str
        The words of ``text`` joined by single spaces, with every recognized
        contraction replaced by the words of its expansion(s). Words that are
        not recognized are kept unchanged. Returns "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Normalize the keys once so the lookup below is truly case-insensitive.
    lookup = {key.lower(): value for key, value in contractions_dict.items()}

    new_words = []
    for word in text.split():
        normalized = word.lower().replace("\u2019", "'")
        expansion = lookup.get(normalized)
        if expansion is None:
            new_words.append(word)
            continue
        # Split each alternative into words so the padding around "/" does
        # not leak into the output as repeated spaces.
        for alternative in expansion.split("/"):
            new_words.extend(alternative.split())
    return " ".join(new_words)

# Expand contractions in every review, passing the dictionary as the extra positional argument
Cluster_Data["content"] = Cluster_Data["content"].apply(expand_contractions, args=(contractions_dict,))

In [None]:
# Import necessary libraries for data manipulation, text preprocessing, and lemmatization
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Build the WordNet lemmatizer and the English stopword set used by the cleaning steps
lemma = WordNetLemmatizer()
stop = set(stopwords.words('english'))

In [None]:
# Drop unnecessary columns
columns_to_drop = ['score', 'appVersion', 'repliedAt', 'replyContent', 'at', 'reviewCreatedVersion', 'thumbsUpCount']
Cluster_Data = Cluster_Data.drop(columns=columns_to_drop, errors='ignore')
print(f"Dropped unnecessary columns. Remaining columns: {list(Cluster_Data.columns)}")

# Drop rows where 'sentiment' is equal to 'POSITIVE'
initial_count = len(Cluster_Data)
Cluster_Data = Cluster_Data[Cluster_Data['sentiment'] != 'POSITIVE']
removed_count = initial_count - len(Cluster_Data)
print(f"Removed {removed_count} positive sentiment reviews. Remaining reviews: {len(Cluster_Data)}")

# Drop rows where the number of words in 'content' is less than or equal to 3.
# .str.len() yields NaN for missing content and NaN > 3 is False, so such rows
# are dropped here instead of raising a TypeError.
initial_count = len(Cluster_Data)
word_counts = Cluster_Data['content'].str.split().str.len()
# .copy() makes the filtered frame independent, so the column assignments
# below do not trigger SettingWithCopyWarning.
Cluster_Data = Cluster_Data[word_counts > 3].copy()
removed_count = initial_count - len(Cluster_Data)
print(f"Removed {removed_count} short reviews (<= 3 words). Remaining reviews: {len(Cluster_Data)}")

# Convert reviews to lowercase and split into words
Cluster_Data['NewReviews'] = Cluster_Data['content'].str.lower().str.split()
print("Converted content to lowercase and split into words.")

# Remove stopwords
Cluster_Data['NewReviews'] = Cluster_Data['NewReviews'].apply(lambda x: [item for item in x if item not in stop])
print("Removed stopwords from reviews.")


def _clean_and_lemmatize(words):
    """Strip non-letters from each word, lemmatize every resulting token, and join them with single spaces."""
    tokens = []
    for word in words:
        # Non-letters become spaces, so one raw word (e.g. "app,crashes") can
        # yield several tokens; split again so the lemmatizer sees clean words.
        tokens.extend(lemma.lemmatize(token) for token in re.sub('[^A-Za-z]', ' ', word).split())
    # Join with a space: an empty separator would glue all words together.
    return ' '.join(tokens)


# Lemmatize the reviews
Cluster_Data['Cleaned_reviews'] = Cluster_Data['NewReviews'].apply(_clean_and_lemmatize)
print("Lemmatized the reviews.")

# Remove duplicate reviews
initial_count = len(Cluster_Data)
Cluster_Data = Cluster_Data.drop_duplicates(subset=['Cleaned_reviews'], keep='first')
removed_count = initial_count - len(Cluster_Data)
print(f"Removed {removed_count} duplicate reviews. Remaining reviews: {len(Cluster_Data)}")

# Final DataFrame output
print(f"Final dataset contains {len(Cluster_Data)} reviews.")

In [None]:
# Display the first rows of the dataset (.head() with its default row count)
# to verify the preprocessing steps
Cluster_Data.head()

In [None]:
# Prepare and load the cleaned review text data into a list for clustering.
# Rows lacking cleaned text are dropped from the frame itself (not only from
# the list) so that `reviews` stays aligned row-for-row with Cluster_Data when
# per-document topic labels are attached to it later.
Cluster_Data = Cluster_Data.dropna(subset=['Cleaned_reviews'])
reviews = Cluster_Data['Cleaned_reviews'].tolist()
print(f"Final dataset contains {len(reviews)} reviews.")

# Clustering the reviews

In [None]:
# Initialize and fit BERTopic model on the review data to generate clusters.
# UMAP is stochastic, so a fixed random_state is supplied to make the topic
# assignments repeatable across runs. The other UMAP arguments mirror the
# values BERTopic documents as its defaults.
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
topic_model = BERTopic(umap_model=umap_model)
topics, probs = topic_model.fit_transform(reviews)

In [None]:
# Label every review with its topic id, write the labelled frame to disk, then preview it
Cluster_Data['Topic'] = topics
labelled_csv_path = 'medium_dataset_with_topics.csv'
Cluster_Data.to_csv(labelled_csv_path, index=False)
preview = Cluster_Data.head()
print(preview)

In [None]:
# Display the topic summary table returned by the fitted topic model
topic_model.get_topic_info()

In [None]:
# Install additional libraries for topic visualization and dimensionality reduction
!pip install -U plotly pandas umap-learn

In [None]:
# A notebook cell renders only its last expression automatically, so every
# figure is shown explicitly; otherwise the first two would be discarded.

# Visualize the topics in 2D space
topic_model.visualize_topics().show()

# Visualize word distributions per topic
topic_model.visualize_barchart().show()

# Visualize the documents and their assigned topics in 2D space
topic_model.visualize_documents(reviews).show()

In [None]:
# Render the topic map and export it as an HTML file for reviewing topic distributions
topics_figure = topic_model.visualize_topics()
topics_figure.write_html("topics_visualization.html")

In [None]:
# Lift the row-display limit, then print how often each topic occurs according to the BERTopic model
pd.set_option('display.max_rows', None)
topic_freq = topic_model.get_topic_freq()
print(topic_freq)

In [None]:
# Import libraries for dimensionality reduction and silhouette score calculation
import umap
from sklearn.metrics import silhouette_score

In [None]:
# Encode the reviews with the topic model's underlying embedding backend
sentence_encoder = topic_model.embedding_model.embedding_model
embeddings = sentence_encoder.encode(reviews, show_progress_bar=True)

# Per-document topic assignments
cluster_labels = topic_model.get_document_info(reviews)['Topic']

# Project the embeddings down to 3 dimensions with a fixed seed
reducer = umap.UMAP(n_components=3, random_state=42)
umap_embeddings = reducer.fit_transform(embeddings)

# Keep only documents that belong to a real topic (-1 marks outliers)
valid_indices = []
filtered_labels = []
for position, label in enumerate(cluster_labels):
    if label == -1:
        continue
    valid_indices.append(position)
    filtered_labels.append(cluster_labels[position])
filtered_embeddings = umap_embeddings[valid_indices]

# Silhouette Score over the reduced embeddings as a clustering-quality measure
silhouette_avg = silhouette_score(filtered_embeddings, filtered_labels)
print(f"Silhouette Score (UMAP Embeddings): {silhouette_avg}")

In [None]:
# Order the reviews by ascending topic id and write the result to a new CSV file
sorted_df = Cluster_Data.sort_values('Topic')
sorted_df.to_csv('medium_sorted_reviews_by_topic.csv', index=False)

In [None]:
# Load the sorted dataset back into Cluster_Data, replacing the in-memory frame.
# NOTE(review): the file is written with a relative path but read here with an
# absolute one; presumably they match only when the working directory is
# /kaggle/working — confirm.
import pandas as pd
Cluster_Data = pd.read_csv('/kaggle/working/medium_sorted_reviews_by_topic.csv')

In [None]:
# Keep only reviews assigned to a real topic (-1 marks noise points)
is_noise = Cluster_Data['Topic'] == -1
filtered_data = Cluster_Data[~is_noise]

# Persist the noise-free subset
filtered_data.to_csv("filtered_dataset_without_noise.csv", index=False)

print(f"Number of remaining reviews: {len(filtered_data)}")

In [None]:
# Draw a 10% sample from every topic for evaluation
def _sample_ten_percent(topic_rows):
    """Return 10% of one topic's rows, drawn with a fixed seed (adjust it if a different draw is needed)."""
    return topic_rows.sample(frac=0.1, random_state=42)


rows_by_topic = filtered_data.groupby('Topic', group_keys=False)
sampled_data = rows_by_topic.apply(_sample_ten_percent)

# Write the sample to a CSV file
sampled_data.to_csv("sampled_10_percent_per_topic.csv", index=False)

print(f"Number of sampled reviews: {len(sampled_data)}")


In [None]:
# Split the sampled data into 4 (near-)equal parts for group evaluation
import numpy as np

# Split the row positions rather than the frame itself: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes,
# which recent pandas versions deprecate. The part sizes are the same.
position_chunks = np.array_split(np.arange(len(sampled_data)), 4)
split_data = [sampled_data.iloc[chunk] for chunk in position_chunks]

# Save each part to a separate CSV file
for i, part in enumerate(split_data, start=1):
    part.to_csv(f"sampled_data_part_{i}.csv", index=False)
    print(f"Saved part {i} with {len(part)} reviews to 'sampled_data_part_{i}.csv'")


# Requirements Extraction

In [None]:
# Load the filtered (noise-free) dataset from /kaggle/input/finalclusters,
# replacing whatever Cluster_Data held before this cell
import pandas as pd
Cluster_Data = pd.read_csv("/kaggle/input/finalclusters/filtered_dataset_without_noise.csv")

# Display the first few rows of the dataset
Cluster_Data.head()

In [None]:
# Installing the latest version of the OpenAI library for generating requirements
pip install --upgrade openai

In [None]:
# Build a {topic: text} mapping in which each topic's reviews are joined by single spaces
clustered_reviews = Cluster_Data.groupby('Topic')['content'].agg(' '.join).to_dict()

In [None]:
# Define a function that generates formal software requirements from grouped review text.
# It uses a prompt specifying detailed rules for requirement generation; the results are later added to the dataset.
requirements_list = []

def generate_formal_requirements(reviews_text, topic_id):
    """
    Ask the chat model for formal software requirements covering one topic's reviews.

    Parameters
    ----------
    reviews_text : str
        Review text for the topic; interpolated verbatim at the end of the prompt.
    topic_id : int or str
        Topic identifier; interpolated into the first line of the prompt.

    Returns
    -------
    str
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the response carries no text content.

    Notes
    -----
    Relies on a module-level ``client`` exposing ``chat.completions.create``.
    NOTE(review): no visible cell creates ``client`` — presumably an
    ``openai.OpenAI`` instance; confirm it is defined before this is called.
    """
    prompt = f"""
    Generate a list of concise and formal software requirements based on the following reviews (Topic ID: {topic_id}).
Each requirement should follow these rules:
1.⁠ ⁠*The subject* should be the software or a specific feature.
2.⁠ ⁠*The predicate* should describe a condition, action, or result, and must be:
    - Feasible
    - Necessary
    - Unambiguous
    - Testable
3.⁠ ⁠Use the following language conventions:
    - *Shall, **will, and **must* indicate mandatory requirements.
    - *May* and *should* indicate optional requirements.
4.⁠ ⁠The requirements should directly address the core functionality, performance, stability, and usability needs mentioned in the reviews.
5. The requirements should be distinct.
6.Avoid conflicting requirements.
7. All the mentioned requirements should be covered.

The requirements should be clearly written and easy to understand. Avoid including additional details, categories, or prioritization.

*Output Format*:
1.⁠ ⁠Clearly separate the *Representative Requirement* and *Individual Requirements* sections.
2.⁠ ⁠The format must be:

    Representative Requirement:
    - Requirement that represents the cluster.

    Individual Requirements:
    1. Requirement 1
    2. Requirement 2
    3. Requirement 3

Ensure the output strictly follows this format, even if the reviews focus on a single feature or topic. Avoid including additional commentary or rephrasing the structure.
Reviews:
{reviews_text}
    """

    response = client.chat.completions.create(
        model="gpt-4o", 
        messages=[
            # NOTE(review): confirm that the "developer" role is accepted for this model.
            {"role": "developer", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3  # Low temperature for more consistent (not strictly deterministic) responses
    )

    # Extract the content of the first choice's message. The content can be
    # None (no text returned), which would make a direct .strip() call fail.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Process the topics in batches of 10: for every batch, generate requirements
# with the custom function and write the batch to its own CSV file
clustered_reviews = Cluster_Data.groupby('Topic')['content'].apply(lambda x: " ".join(x)).to_dict()
batch_size = 10
topics = sorted(clustered_reviews)

for batch_number, start in enumerate(range(0, len(topics), batch_size), start=1):
    batch_topics = topics[start:start + batch_size]

    # All reviews of a cluster go into one cell
    contents = [clustered_reviews[topic] for topic in batch_topics]
    generated = [generate_formal_requirements(clustered_reviews[topic], topic) for topic in batch_topics]

    # One row per topic in this batch
    batch_data = pd.DataFrame({
        "Topic": batch_topics,
        "Content": contents,
        "Generated Requirements": generated,
    })

    # Each batch goes to a separate, 1-based numbered file
    batch_data.to_csv(f"clustered_reviews_with_requirements_batch_{batch_number}.csv", index=False)

print("Generated requirements for every 10 clusters and saved each batch to separate files.")

In [None]:
# Merge the per-batch CSV files into one dataset (final dataset)
import os
import pandas as pd

# Folder that holds the batch files
folder_path = "/kaggle/input/batches/batches"

# Read every CSV file in the folder; anything else is skipped
dfs = [
    pd.read_csv(os.path.join(folder_path, name))
    for name in os.listdir(folder_path)
    if name.endswith(".csv")
]

# Stack the batches into one frame and order the rows by ascending topic id
merged_df = pd.concat(dfs, ignore_index=True).sort_values(by="Topic", ascending=True)

# Write the sorted frame to a new CSV file
merged_df.to_csv("clusters_with_requirements.csv", index=False)

print("All batch files have been merged into 'clusters_with_requirements.csv'.")