<a href="https://colab.research.google.com/github/RizkyWidodo-project/IBMGraniteCourse/blob/main/United_Health_Provider_Review_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages

In [1]:
!pip install langchain_community
!pip install replicate

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [17]:
from langchain_community.llms import Replicate
import os
from google.colab import userdata

# Fetch the Replicate credential via Colab's `userdata` helper and export it
# under the environment variable of the same name.
api_token = userdata.get('REPLICATE_API_TOKEN')
os.environ["REPLICATE_API_TOKEN"] = api_token

# Model identifier passed to the Replicate client
model = "ibm-granite/granite-3.2-8b-instruct"

# LLM client used by the prompting cell; the output-token limit is set to 1000.
output = Replicate(
    model=model,
    replicate_api_token=api_token,
    model_kwargs={"max_tokens": 1000},
)

# Model Query

## Load dataset

In [21]:
import pandas as pd

# Read the review CSV. If the file has not been uploaded, report it and
# continue with an empty review list.
try:
    reviews_df = pd.read_csv('United_Review100.csv')
except FileNotFoundError:
    print("Error:Review File not found. Please upload the file.")
    customer_reviews = []
else:
    # One prompt will be built per entry of the 'Review' column
    customer_reviews = reviews_df['Review'].tolist()
    # Show the first rows as a quick sanity check of what was loaded
    print("Preview of the dataset:")
    display(reviews_df.head())

Preview of the dataset:


Unnamed: 0,No,Review
0,1,Have not been able to access my account for a ...
1,2,I have been trying to obtain the 2023 fee Sche...
2,3,Dental and prescription cover is good. Vision ...
3,4,If you are looking into using All Savers Insur...
4,5,I have had United for several years. In Januar...


## Prompting

In [22]:
# Generation settings, built once because they are identical for every review.
# NOTE(review): these are handed to the client as a single `parameters` keyword.
# The recorded responses are far longer than 5 tokens, so `max_tokens` (and
# presumably the other entries) does not appear to be applied. Confirm how the
# Replicate wrapper treats extra keyword arguments before relying on them.
parameters = {
    "top_k": 1,
    "top_p": 0.5,
    "max_tokens": 5,
    "min_tokens": 0,
    "random_seed": None,
    "repetition_penalty": 1.5,
    "stopping_criteria": "length",
    "stopping_sequence": None,
}

# One model call per review; each stored entry starts with a "Review N:" header
# (1-based) followed by the raw model output.
responses = []
for review_number, review in enumerate(customer_reviews, start=1):
    # Prompt for a single review: overall sentiment plus three fixed aspects
    Multitask_prompt = f"""
Classify the review as positive, negative, or mixed without any further explanation.
Next, identify relevant categories (only for customer service, waiting time, medical-related service satisfaction) and its each sentiment as positive, negative, or mixed
Present your response based on this format:
General Sentiment:
Aspect Sentiment:
- customer service
- waiting time
- medical-related satisfaction

Review {review_number}: {review}
"""
    # Invoke the model with the prompt for this single review
    response = output.invoke(Multitask_prompt, parameters=parameters)
    responses.append(f"Review {review_number}:\n{response}\n")

# Print all the responses
for response in responses:
    print(response)

Review 1:
General Sentiment: Negative

Aspect Sentiment:
- customer service: Negative
- waiting time: Negative
- medical-related satisfaction: N/A (not applicable)

Review 2:
General Sentiment: Negative

Aspect Sentiment:
- customer service: Negative
- waiting time: Mixed (implied, as the user hasn't received a response despite multiple attempts)
- medical-related satisfaction: Not applicable (this review does not discuss medical-related service satisfaction)

Review 3:
General Sentiment: Negative

Aspect Sentiment:
- customer service: Mixed (nice, but unable to address major issues)
- waiting time: Negative (long wait for in-person care)
- medical-related satisfaction: Negative (dissatisfaction with vision and medical coverage, high deductibles, and virtual care recommendations)

Review 4:
General Sentiment: Negative

Aspect Sentiment:
- customer service: Negative
- waiting time: N/A
- medical-related satisfaction: Negative

Review 5:
General Sentiment: Negative

Aspect Sentiment:
- c

# Turn the Responses into a Table

In [23]:
import pandas as pd
import re

# Start from empty containers so that re-running this cell never carries over
# rows from a previous run.
data, current_review_data = [], {}

def extract_simple_sentiment(sentiment_text):
    """Reduce a free-text sentiment answer to one canonical label.

    The model often appends an explanation to its answer, for example
    "Mixed (positive on dental, negative on vision)". The label is therefore
    taken from the keyword that appears *earliest* in the text, so words used
    inside a trailing explanation cannot override the actual answer.

    Args:
        sentiment_text: The text following a "...: " field in a model response
            (a string), or a missing value (None / NaN).

    Returns:
        One of 'Positive', 'Negative', 'Mixed' or 'Not mentioned'. Missing
        values and text without any known keyword map to 'Not mentioned'.
    """
    if pd.isna(sentiment_text):
        return 'Not mentioned'

    text = sentiment_text.lower()

    # Keyword -> label, listed in the priority used to break (theoretical) ties.
    keyword_labels = (
        ('positive', 'Positive'),
        ('negative', 'Negative'),
        ('mixed', 'Mixed'),
        ('not mentioned', 'Not mentioned'),
        ('n/a', 'Not mentioned'),
        ('not applicable', 'Not mentioned'),
    )

    # Position of every keyword that occurs in the text
    hits = [(text.find(keyword), label) for keyword, label in keyword_labels if keyword in text]
    if not hits:
        return 'Not mentioned'  # Default for anything else

    # The earliest keyword is the answer; later ones belong to the explanation.
    return min(hits, key=lambda hit: hit[0])[1]


# Fixed layout of the result table. Declaring it up front keeps the DataFrame
# (and the exported CSV) schema stable even when the responses are empty or no
# response mentions one of the aspects.
_ASPECT_COLUMNS = ("customer service", "waiting time", "medical-related satisfaction")
_RESULT_COLUMNS = ["Review", "General Sentiment", *_ASPECT_COLUMNS]


def _parse_response(response):
    """Turn one "Review N:" response string into a row for the result table.

    Args:
        response: One entry of `responses`: a "Review N:" header followed by
            the raw model output.

    Returns:
        A dict with one entry per result column. Any field that cannot be found
        in the response is left as 'Not mentioned'.
    """
    record = dict.fromkeys(_RESULT_COLUMNS, 'Not mentioned')

    # Extract review number (the header is at the very start of the string)
    review_match = re.match(r"Review (\d+):", response)
    if review_match:
        record["Review"] = f"Review {review_match.group(1)}"

    # Extract General Sentiment
    general_sentiment_match = re.search(r"General Sentiment: (.+)", response)
    if general_sentiment_match:
        record["General Sentiment"] = extract_simple_sentiment(general_sentiment_match.group(1).strip())

    # Extract Aspect Sentiment: one "- <aspect>: <answer>" line per aspect
    aspect_sentiment_matches = re.findall(r"- (customer service|waiting time|medical-related satisfaction): (.+)", response)
    for aspect, sentiment in aspect_sentiment_matches:
        record[aspect.strip()] = extract_simple_sentiment(sentiment.strip())

    return record


# Each response holds exactly one review, so every response becomes one row.
data = [_parse_response(response) for response in responses]

# Create a DataFrame with the fixed column order
df_sentiments = pd.DataFrame(data, columns=_RESULT_COLUMNS)

# Display the DataFrame
display(df_sentiments)

Unnamed: 0,Review,General Sentiment,customer service,waiting time,medical-related satisfaction
0,Review 1,Negative,Negative,Negative,Not mentioned
1,Review 2,Negative,Negative,Mixed,Not mentioned
2,Review 3,Negative,Mixed,Negative,Negative
3,Review 4,Negative,Negative,Not mentioned,Negative
4,Review 5,Negative,Negative,Not mentioned,Negative
...,...,...,...,...,...
95,Review 96,Negative,Negative,Mixed,Negative
96,Review 97,Negative,Not mentioned,Not mentioned,Negative
97,Review 98,Positive,Positive,Not mentioned,Positive
98,Review 99,Positive,Positive,Not mentioned,Positive


In [24]:
# Export the sentiment table to CSV, leaving out the DataFrame index.
df_sentiments.to_csv(path_or_buf='sentiment_analysis_results.csv', index=False)