<a href="https://colab.research.google.com/github/Surbhiiiiiii/Customer-Review-Analysis-Using-Topic-Modelling/blob/main/customer_review_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import json
import nltk
# !pip install gensim

import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
import os

# Install dependencies (if running in a new environment)
# !pip install gensim nltk

# Download the NLTK resources used by preprocess(): the tokenizer data
# ('punkt_tab' / 'punkt'), the stop-word list and the WordNet data.
for _resource in ('punkt_tab', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# OpenRouter API key. It is read from the OPENROUTER_API_KEY environment
# variable so the secret does not have to be stored in the notebook; when the
# variable is unset the value is the empty string, exactly as before.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# Function to generate AI response using OpenRouter API
def generate_ai_response(prompt):
    """Send ``prompt`` to the OpenRouter chat-completions API and return the reply.

    Args:
        prompt: Text sent as the single user message of the conversation.

    Returns:
        The ``content`` of the first returned choice, or the string
        "Error: Failed to retrieve AI response." when the request fails or the
        response body does not have the expected structure.
    """
    failure_message = "Error: Failed to retrieve AI response."

    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            data=json.dumps({
                "model": "mistralai/mixtral-8x7b-instruct",  # Choose a powerful OpenRouter model
                "messages": [{"role": "user", "content": prompt}]
            }),
            # Without a timeout a stalled connection would block the caller forever.
            timeout=60,
        )
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failure
        # instead of aborting the loop that calls this function.
        return failure_message

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError):
        # KeyError/TypeError: unexpected payload (e.g. an error object);
        # IndexError: empty "choices"; ValueError: body is not valid JSON.
        return failure_message

# Read the labelled review file and keep the text of every line whose
# stripped form ends with '0'.
file_path = '/content/yelp_labelled.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    lines = list(f)

# The review text is everything before the first tab of the line.
negative_reviews = [
    raw.partition('\t')[0]
    for raw in lines
    if raw.rstrip().endswith('0')
]

# Preprocessing function
def preprocess(text):
    """Convert raw review text into a list of lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not alphanumeric
    or that are English stop words are dropped, and the remaining tokens are
    lemmatized with ``WordNetLemmatizer``.
    """
    english_stops = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize
    return [
        lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in english_stops
    ]

# Tokenize every negative review that is not blank.
processed_feedback = list(map(preprocess, filter(str.strip, negative_reviews)))

# Build the token dictionary, then encode each review as a bag of words.
dictionary = corpora.Dictionary(processed_feedback)
corpus = list(map(dictionary.doc2bow, processed_feedback))

# # Train LDA model
# num_topics = 5
# lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

# # Topic descriptions
# topic_descriptions = {
#     0: "Poor product quality and usability issues.",
#     1: "Dissatisfaction with pricing and value for money.",
#     2: "Complaints about battery life and charging problems.",
#     3: "Issues with customer service and post-purchase support.",
#     4: "Concerns over product durability and long-term performance."
# }
# import gensim
# import gensim.corpora as corpora

# Fit the LDA model on the 'corpus' / 'dictionary' prepared above.
num_topics = 5
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

# Describe each topic by its five top words.
topic_descriptions = {
    topic_no: "Topic related to: " + ", ".join(term for term, _weight in word_weights)
    for topic_no, word_weights in lda_model.show_topics(
        num_topics=num_topics, num_words=5, formatted=False
    )
}

# Show the generated descriptions.
for topic_id in topic_descriptions:
    print(f"Topic {topic_id}: {topic_descriptions[topic_id]}")

print("\nExtracted Topics and AI Suggestions:\n")

# One API request per topic; the answer is printed under the topic heading.
for topic_id, topic_text in topic_descriptions.items():
    print(f"\n🔹 **Topic {topic_id}:** {topic_text}")

    prompt = f"""
    **Customer Complaints:** {topic_text}

    📌 **What are the top 3 problems customers face?**
    📌 **What specific changes should the company make?**
    📌 **How can the company prevent these issues permanently?**
    Keep responses **short and structured**.
    """

    print(f"💡 **AI Suggestion:** {generate_ai_response(prompt)}\n")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Topic 0: Topic related to: service, food, bad, place, slow
Topic 1: Topic related to: like, place, time, got, minute
Topic 2: Topic related to: back, probably, food, waited, soon
Topic 3: Topic related to: back, place, time, go, would
Topic 4: Topic related to: food, good, ever, way, never

Extracted Topics and AI Suggestions:


🔹 **Topic 0:** Topic related to: service, food, bad, place, slow
💡 **AI Suggestion:**  1. Top 3 problems customers face:
   - Poor service: Customers often complain about slow service, lack of staff attention, and unprofessional behavior from employees.
   - Inadequate food quality: Complaints about the taste, freshness, and presentation of food are common.
   - Unclean or unpleasant environment: Customers report issues with the cleanliness of facilities, uncomfortable seating, and poor ambiance.

2. Specific changes the company should make:
   - Improve customer service by training staff to be more attentive, responsive, and friendly.
   - Regularly inspect an

In [None]:
# Uninstall the listed packages, then reinstall most of them at pinned versions.
# NOTE(review): gensim and pyLDAvis are removed here but are not reinstalled by
# the two install commands below; gensim is installed by a separate
# `!pip install gensim` cell — presumably that cell has to be run afterwards.
!pip uninstall -y numpy gensim pyLDAvis numba tensorflow tensorflow-text tf-keras cudf-cu12 cuml-cu12 dask-cuda distributed-ucxx-cu12 raft-dask-cu12 dask-cudf-cu12
# Pinned NumPy / Numba / TensorFlow packages.
!pip install numpy==1.26.4 numba==0.60.0 tensorflow==2.18.0 tensorflow-text==2.18.1 tf-keras==2.18.0
# Pinned cudf / cuml / dask packages.
!pip install cudf-cu12==25.2 cuml-cu12==25.2 dask-cuda==25.2 distributed-ucxx-cu12==0.42 raft-dask-cu12==25.2 dask-cudf-cu12==25.2


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mFound existing installation: numba 0.60.0
Uninstalling numba-0.60.0:
  Successfully uninstalled numba-0.60.0
Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Found existing installation: tensorflow-text 2.18.1
Uninstalling tensorflow-text-2.18.1:
  Successfully uninstalled tensorflow-text-2.18.1
Found existing installation: tf_keras 2.18.0
Uninstalling tf_keras-2.18.0:
  Successfully uninstalled tf_keras-2.18.0
Found existing installation: cudf-cu12 25.2.1
Uninstalling cudf-cu12-25.2.1:
  Successfully uninstalled cudf-cu12-25.2.1
Found existing installation: cuml-cu12 25.2.1
Uninstalling cuml-cu12-25.2.1:
  Successfully uninstalled cuml-cu12-25.2.1
Found existing installation: dask-cuda 25.2.0
Uninstalling dask-cuda-25.2.0:
  Successfully uninstalled dask-cuda-25.2.0
Found existing installation: distribu

In [None]:
import gradio as gr
import pandas as pd
import requests
import json
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
import os

# Ensure required NLTK resources are available.
# 'punkt_tab' is downloaded as well, matching the first cell of this notebook;
# NOTE(review): recent NLTK releases appear to need it for word_tokenize, so a
# fresh runtime that only runs this cell would otherwise fail — confirm
# against the installed NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# OpenRouter API key, read from the OPENROUTER_API_KEY environment variable so
# the secret is not stored in the notebook; empty string when unset (as before).
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
def generate_ai_response(prompt):
    """Ask the OpenRouter chat-completions API to answer ``prompt``.

    Args:
        prompt: Text sent as the single user message.

    Returns:
        The ``content`` of the first choice in the response, or
        "Error: Failed to retrieve AI response." if the request fails or the
        response cannot be parsed into the expected structure.
    """
    failure_message = "Error: Failed to retrieve AI response."

    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            data=json.dumps({
                "model": "mistralai/mixtral-8x7b-instruct",
                "messages": [{"role": "user", "content": prompt}]
            }),
            # A timeout keeps a stalled connection from freezing the UI callback.
            timeout=60,
        )
    except requests.RequestException:
        # Network failures are turned into the same error string the caller
        # already displays for malformed responses.
        return failure_message

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError):
        # Covers error payloads, an empty "choices" list and non-JSON bodies.
        return failure_message

def preprocess(text):
    """Tokenize ``text`` and return its lemmatized, filtered tokens.

    Tokens come from the lower-cased text; a token is kept only if it is
    alphanumeric and not an English stop word, and every kept token is
    lemmatized before being returned.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

def analyze_trends(df):
    """Plot the number of reviews per sentiment class and save it as a PNG.

    Args:
        df: DataFrame with a 'Sentiment' column in which 0 marks a negative
            and 1 a positive review.

    Returns:
        A ``(message, image_path)`` tuple: a fixed status message and the path
        of the saved plot, "sentiment_analysis.png".
    """
    negative_label = "Negative Reviews"
    positive_label = "Positive Reviews"

    # Translate the numeric classes into their display names up front and pin
    # the category order. Relabelling the ticks after plotting only works when
    # both classes are present: with a single class (e.g. one manually entered
    # review) the two labels no longer match the ticks, which — depending on
    # the matplotlib/seaborn version — raises ValueError or labels the wrong bar.
    sentiment_labels = df['Sentiment'].map({0: negative_label, 1: positive_label})

    sns.set_style("whitegrid")
    plt.figure(figsize=(8, 5))
    sns.countplot(
        x=sentiment_labels,
        order=[negative_label, positive_label],
        palette=['red', 'green'],
    )
    plt.title("Review Sentiment Distribution")
    plt.savefig("sentiment_analysis.png")
    plt.close()  # release the figure so repeated calls do not accumulate figures
    return "Sentiment trend analysis completed. See the plot below.", "sentiment_analysis.png"

def process_input(file, text):
    """Analyse uploaded or typed reviews and return the three Gradio outputs.

    Negative reviews (Sentiment == 0) are preprocessed, grouped into LDA
    topics, and each topic is sent to ``generate_ai_response``.

    Args:
        file: Path of an uploaded ``.csv`` or ``.txt`` file, or ``None``.
            A CSV must provide 'Review' and 'Sentiment' columns; a TXT file is
            read as ``review<TAB>label`` lines.
        text: A single review typed by the user. Used only when ``file`` is
            ``None`` and treated as a negative review.

    Returns:
        A 3-tuple ``(suggestions_text, sentiment_summary, plot_path)``. When
        the input cannot be analysed the first element is an explanatory
        message and ``plot_path`` is ``None``.
    """
    if file is not None:
        file_extension = os.path.splitext(file)[1].lower()

        # Handle CSV File
        if file_extension == ".csv":
            try:
                # Load at most the first 5000 rows, skipping malformed lines.
                df = pd.read_csv(file, encoding="utf-8", nrows=5000, on_bad_lines="skip")
            except (OSError, ValueError) as exc:
                # ValueError covers pandas parser errors and bad encodings.
                return f"Could not read the uploaded file: {exc}", "", None
            if "Review" not in df.columns or "Sentiment" not in df.columns:
                return "The CSV file must contain 'Review' and 'Sentiment' columns.", "", None

        # Handle Text File
        elif file_extension == ".txt":
            try:
                with open(file, "r", encoding="utf-8") as f:
                    lines = f.readlines()
            except (OSError, ValueError) as exc:
                return f"Could not read the uploaded file: {exc}", "", None

            # Split on the LAST tab so reviews that contain tabs still yield
            # exactly two fields; lines without a tab or with a non-integer
            # label are skipped instead of aborting the whole upload.
            rows = []
            for line in lines:
                review, separator, label = line.strip().rpartition("\t")
                if not separator:
                    continue
                try:
                    sentiment = int(label)
                except ValueError:
                    continue
                rows.append((review, sentiment))
            df = pd.DataFrame(rows, columns=["Review", "Sentiment"])

        else:
            # Every return carries three values because the click handler is
            # wired to three output components.
            return "Unsupported file format. Please upload a CSV or TXT file.", "", None

    elif text:
        df = pd.DataFrame([[text, 0]], columns=["Review", "Sentiment"])

    else:
        return "Please provide either a file or a text review.", "", None

    # Filter only negative reviews (Sentiment == 0)
    df_negative = df[df["Sentiment"] == 0]

    if df_negative.empty:
        # No plot is produced on this path, so no image path is returned.
        return "No negative reviews found.", "Sentiment trend analysis completed.", None

    # Only the negative reviews feed the topic model, so only they are preprocessed.
    processed_reviews = [preprocess(str(review)) for review in df_negative["Review"]]

    # Topic Modeling on Negative Reviews
    dictionary = corpora.Dictionary(processed_reviews)
    if len(dictionary) == 0:
        # Nothing survived preprocessing; LDA cannot be trained without terms.
        return "No usable words found in the negative reviews.", "", None

    corpus = [dictionary.doc2bow(review) for review in processed_reviews]
    num_topics = 5
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    topic_descriptions = {
        i: ", ".join(word for word, _ in topic)
        for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=5, formatted=False)
    }

    # Sentiment Analysis Plot (uses all reviews, not just the negative ones)
    sentiment_analysis, plot_path = analyze_trends(df)

    # Generate AI suggestions, one request per negative topic
    results = []
    for topic_id, topic_text in topic_descriptions.items():
        prompt = f"Customer Complaints: {topic_text}\n\nTop 3 problems customers face?\nWhat changes should the company make?\nHow to prevent these issues permanently?"
        ai_suggestion = generate_ai_response(prompt)
        results.append(f"Topic {topic_id}: {topic_text}\nAI Suggestion: {ai_suggestion}\n")

    output_text = "\n".join(results)

    return output_text, sentiment_analysis, plot_path


# Stylesheet applied to the Gradio page.
_PAGE_CSS = """
body {
    background-color: #fdf6ff;
    font-family: 'Comic Sans MS', cursive, sans-serif;
}
h2 {
    color: #7d5ba6;
    font-weight: bold;
}
.gr-button-primary {
    background: linear-gradient(135deg, #ffb6b9, #fae3d9);
    color: #333;
    border: none;
    font-weight: bold;
    border-radius: 8px;
}
.gr-button-primary:hover {
    background: linear-gradient(135deg, #fae3d9, #ffb6b9);
}
.container {
    padding: 25px;
    border-radius: 15px;
    background: #ffffff;
    box-shadow: 0 8px 20px rgba(0,0,0,0.1);
}
label {
    font-weight: bold;
    color: #5d3a9b;
}
textarea, input {
    border-radius: 10px !important;
    border: 1px solid #e0c3fc !important;
    padding: 10px !important;
    background: #fff5f9 !important;
}
"""

# Page layout: a title row, then an input column next to a results column.
demo = gr.Blocks(css=_PAGE_CSS)
with demo:
    with gr.Row():
        gr.Markdown("<h2 style='text-align:center;'>✨ Customer Review Analysis ✨</h2>")

    with gr.Row():
        # Left column: file upload, free-text review and the submit button.
        with gr.Column(elem_classes="container"):
            file_input = gr.File(
                label="📂 Upload a CSV/TXT file with 'Review' and 'Sentiment'",
                type="filepath",
            )
            text_input = gr.Textbox(
                label="💬 Or manually enter a review:",
                placeholder="E.g., The product quality was disappointing...",
                lines=3,
            )
            submit_button = gr.Button("🎀 Generate Suggestions", variant="primary")

        # Right column: the three values returned by process_input.
        with gr.Column(elem_classes="container"):
            output = gr.Textbox(
                label="🌸 AI Suggestions (Based on Topics)",
                interactive=False,
                lines=10,
            )
            sentiment_output = gr.Textbox(label="📊 Sentiment Analysis Summary", interactive=False)
            trend_plot = gr.Image(label="📈 Sentiment Distribution Plot")

    submit_button.click(
        process_input,
        inputs=[file_input, text_input],
        outputs=[output, sentiment_output, trend_plot],
    )

demo.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1a1434875e3e48e5a1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Install gradio, pandas and gensim, which the Gradio app cells of this
# notebook import.
!pip install gradio
!pip install pandas
!pip install gensim

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
import gradio as gr
import pandas as pd
import requests
import json
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import corpora
import os

# Ensure required NLTK resources are available.
# 'punkt_tab' is included to match the first cell of this notebook, which
# downloads it next to 'punkt'.
# NOTE(review): recent NLTK releases appear to require it for word_tokenize —
# confirm against the installed NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# OpenRouter API key: taken from the OPENROUTER_API_KEY environment variable
# when it is set, otherwise the empty string used previously.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

def generate_ai_response(prompt):
    """Return the OpenRouter chat-completion reply for ``prompt``.

    Args:
        prompt: Text sent as the single user message.

    Returns:
        The ``content`` of the first choice, or the string
        "Error: Failed to retrieve AI response." when the HTTP request fails
        or the response lacks the expected fields.
    """
    failure_message = "Error: Failed to retrieve AI response."

    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            data=json.dumps({
                "model": "mistralai/mixtral-8x7b-instruct",
                "messages": [{"role": "user", "content": prompt}]
            }),
            # Bound the wait so one stalled request cannot hang the whole run.
            timeout=60,
        )
    except requests.RequestException:
        # Connection problems and timeouts yield the error string rather than
        # an exception inside the UI callback.
        return failure_message

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing keys, an empty "choices" list, or a body that is not JSON.
        return failure_message

def preprocess(text):
    """Return the lemmatized tokens of ``text`` after filtering.

    The lower-cased text is tokenized; non-alphanumeric tokens and English
    stop words are removed, then each remaining token is lemmatized.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    ignored = set(stopwords.words('english'))
    kept = (
        candidate
        for candidate in word_tokenize(text.lower())
        if candidate.isalnum() and candidate not in ignored
    )
    return list(map(lemmatize, kept))

def analyze_trends(df):
    """Summarise the sentiment split of ``df`` and save a count plot of it.

    Args:
        df: DataFrame with a 'Sentiment' column in which 0 marks a negative
            and 1 a positive review.

    Returns:
        A ``(summary, image_path)`` tuple: a three-line text with the total,
        positive and negative counts (with percentages) and the path of the
        saved plot, "sentiment_analysis.png".
    """
    negative_label = "Negative Reviews"
    positive_label = "Positive Reviews"

    sentiment_counts = df['Sentiment'].value_counts()
    total_reviews = sentiment_counts.sum()
    negative = sentiment_counts.get(0, 0)
    positive = sentiment_counts.get(1, 0)
    # Guard the division for an empty frame.
    negative_pct = (negative / total_reviews) * 100 if total_reviews > 0 else 0
    positive_pct = (positive / total_reviews) * 100 if total_reviews > 0 else 0

    summary = (
        f"Total Reviews: {total_reviews}\n"
        f"Positive Reviews: {positive} ({positive_pct:.2f}%)\n"
        f"Negative Reviews: {negative} ({negative_pct:.2f}%)"
    )

    # Translate the numeric classes into display names and pin the category
    # order. Relabelling the ticks after plotting only works when both classes
    # are present: with a single class (e.g. one manually entered review) the
    # two labels no longer match the ticks, which — depending on the
    # matplotlib/seaborn version — raises ValueError or labels the wrong bar.
    sentiment_labels = df['Sentiment'].map({0: negative_label, 1: positive_label})

    sns.set_style("whitegrid")
    plt.figure(figsize=(8, 5))
    sns.countplot(
        x=sentiment_labels,
        order=[negative_label, positive_label],
        palette=['red', 'green'],
    )
    plt.title("Review Sentiment Distribution")
    plt.savefig("sentiment_analysis.png")
    plt.close()  # release the figure so repeated calls do not accumulate figures

    return summary, "sentiment_analysis.png"


def process_input(file, text, custom_prompt):
    """Analyse uploaded or typed reviews and return the three Gradio outputs.

    Negative reviews (Sentiment == 0) are preprocessed, grouped into LDA
    topics, and each topic is sent to ``generate_ai_response`` together with
    the user's prompt.

    Args:
        file: Path of an uploaded ``.csv`` or ``.txt`` file, or ``None``.
            A CSV must provide 'Review' and 'Sentiment' columns; a TXT file is
            read as ``review<TAB>label`` lines.
        text: A single review typed by the user. Used only when ``file`` is
            ``None`` and treated as a negative review.
        custom_prompt: Instruction appended to every topic before it is sent
            to the model. When blank, a default set of questions is used.

    Returns:
        A 3-tuple ``(suggestions_text, sentiment_summary, plot_path)``. When
        the input cannot be analysed the first element is an explanatory
        message and ``plot_path`` is ``None``.
    """
    if file is not None:
        file_extension = os.path.splitext(file)[1].lower()

        # Handle CSV File
        if file_extension == ".csv":
            try:
                # Load at most the first 5000 rows, skipping malformed lines.
                df = pd.read_csv(file, encoding="utf-8", nrows=5000, on_bad_lines="skip")
            except (OSError, ValueError) as exc:
                # ValueError covers pandas parser errors and bad encodings.
                return f"Could not read the uploaded file: {exc}", "", None
            if "Review" not in df.columns or "Sentiment" not in df.columns:
                return "The CSV file must contain 'Review' and 'Sentiment' columns.", "", None

        # Handle Text File
        elif file_extension == ".txt":
            try:
                with open(file, "r", encoding="utf-8") as f:
                    lines = f.readlines()
            except (OSError, ValueError) as exc:
                return f"Could not read the uploaded file: {exc}", "", None

            # Split on the LAST tab so reviews that contain tabs still yield
            # exactly two fields; lines without a tab or with a non-integer
            # label are skipped instead of aborting the whole upload.
            rows = []
            for line in lines:
                review, separator, label = line.strip().rpartition("\t")
                if not separator:
                    continue
                try:
                    sentiment = int(label)
                except ValueError:
                    continue
                rows.append((review, sentiment))
            df = pd.DataFrame(rows, columns=["Review", "Sentiment"])

        else:
            return "Unsupported file format. Please upload a CSV or TXT file.", "", None

    elif text:
        df = pd.DataFrame([[text, 0]], columns=["Review", "Sentiment"])

    else:
        return "Please provide either a file or a text review.", "", None

    df_negative = df[df["Sentiment"] == 0]

    if df_negative.empty:
        # No plot is produced on this path, so no image path is returned.
        return "No negative reviews found.", "Sentiment trend analysis completed.", None

    # Only the negative reviews feed the topic model, so only they are preprocessed.
    processed_reviews = [preprocess(str(review)) for review in df_negative["Review"]]

    dictionary = corpora.Dictionary(processed_reviews)
    if len(dictionary) == 0:
        # Nothing survived preprocessing; LDA cannot be trained without terms.
        return "No usable words found in the negative reviews.", "", None

    corpus = [dictionary.doc2bow(review) for review in processed_reviews]
    num_topics = 5
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    topic_descriptions = {
        i: ", ".join(word for word, _ in topic)
        for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=5, formatted=False)
    }

    # The summary and plot describe all reviews, not just the negative ones.
    sentiment_analysis, plot_path = analyze_trends(df)

    # A blank prompt would send the model a bare topic with no question, so
    # fall back to a default set of questions.
    instructions = custom_prompt
    if not instructions or not instructions.strip():
        instructions = (
            "Top 3 problems customers face?\n"
            "What changes should the company make?\n"
            "How to prevent these issues permanently?"
        )

    results = []
    for topic_id, topic_text in topic_descriptions.items():
        prompt = f"Topic: {topic_text}\n{instructions}"
        ai_suggestion = generate_ai_response(prompt)
        results.append(f"Topic {topic_id}: {topic_text}\nAI Suggestion: {ai_suggestion}\n")

    output_text = "\n".join(results)
    return output_text, sentiment_analysis, plot_path

# Stylesheet applied to the Gradio page.
_PAGE_CSS = """
body {
    background: linear-gradient(135deg, #f7a1d7, #f0c7fa);
    font-family: 'Comic Sans MS', cursive, sans-serif;
    background-attachment: fixed;
    background-size: cover;
}
h2 {
    color: #7d5ba6;
    font-weight: bold;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3);
}
.gr-button-primary {
    background: linear-gradient(135deg, #ffb6b9, #fae3d9);
    color: #333;
    border: none;
    font-weight: bold;
    border-radius: 8px;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.gr-button-primary:hover {
    background: linear-gradient(135deg, #fae3d9, #ffb6b9);
    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
}
.container {
    padding: 25px;
    border-radius: 15px;
    background: rgba(255, 255, 255, 0.9);  /* Semi-transparent white background */
    box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
}
label {
    font-weight: bold;
    color: #5d3a9b;
}
textarea, input {
    border-radius: 10px !important;
    border: 1px solid #e0c3fc !important;
    padding: 10px !important;
    background: #fff5f9 !important;
    transition: background-color 0.3s ease;
}
textarea:focus, input:focus {
    background: #f4d0e1 !important;
    border-color: #d3a0d0 !important;
}
"""

# Page layout: a title row, then an input column next to a results column.
demo = gr.Blocks(css=_PAGE_CSS)
with demo:
    with gr.Row():
        gr.Markdown("<h2 style='text-align:center;'>✨ Customer Review Analysis ✨</h2>")

    with gr.Row():
        # Left column: file upload, free-text review, custom prompt, submit button.
        with gr.Column(elem_classes="container"):
            file_input = gr.File(
                label="📂 Upload a CSV/TXT file with 'Review' and 'Sentiment'",
                type="filepath",
            )
            text_input = gr.Textbox(
                label="💬 Or manually enter a review:",
                placeholder="E.g., The product quality was disappointing...",
                lines=3,
            )
            prompt_input = gr.Textbox(
                label="🔮 Enter a custom prompt for the AI:",
                placeholder="E.g., What are the top problems customers are facing?",
                lines=3,
            )
            submit_button = gr.Button("🎀 Generate Suggestions", variant="primary")

        # Right column: the three values returned by process_input.
        with gr.Column(elem_classes="container"):
            output = gr.Textbox(
                label="🌸 AI Suggestions (Based on Topics)",
                interactive=False,
                lines=10,
            )
            sentiment_output = gr.Textbox(label="📊 Sentiment Analysis Summary", interactive=False)
            trend_plot = gr.Image(label="📈 Sentiment Distribution Plot")

    submit_button.click(
        process_input,
        inputs=[file_input, text_input, prompt_input],
        outputs=[output, sentiment_output, trend_plot],
    )

demo.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://92000b597f6ed4dcef.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


