# Social Review Analysis

#### This notebook analyses social media reviews to produce:
* the overall sentiment
* the key insights
* a short summary

for each month and each category (product model and review source).



In [None]:
%pip install openai

#### Libraries

In [None]:
import os
import time
from itertools import count

from openai import AzureOpenAI, RateLimitError, OpenAIError, APIError
from pyspark.sql import Row
from pyspark.sql import functions as F

#### Load the Social Review Table

In [None]:
# Read every column of the Ops_Silver.Social_Reviews table and render the result.
silver_reviews_sql = "SELECT * FROM AdventureWorks_Lakehouse.Ops_Silver.Social_Reviews"
df_raw = spark.sql(silver_reviews_sql)
display(df_raw)

In [None]:
# Keep only the rows in which no column is null.
df = df_raw.na.drop(how="any")

# Print the row count before and after the drop, side by side.
rows_before, rows_after = df_raw.count(), df.count()
print(rows_before, rows_after)

In [None]:
# Print the first rows of the cleaned DataFrame as plain text (show() defaults to 20 rows).
df.show()

In [None]:
# Keep rows whose PostedYearMonth lies between the literals "2013-01" and "2014-06",
# both ends included.
in_window = F.col("PostedYearMonth").between("2013-01", "2014-06")
df_filtered = df.filter(in_window)

# One output row per (month, product model, review source), carrying all of that
# group's review texts in a single array column, ordered by month and model id.
group_keys = ["PostedYearMonth", "ProductModelName", "ProductModelId", "ReviewSource"]
df_grouped = (
    df_filtered
    .groupBy(*group_keys)
    .agg(F.collect_list("Review").alias("Reviews_List"))
    .orderBy("PostedYearMonth", "ProductModelId")
)
df_grouped.show()

#### Format Reviews

In [None]:
def format_reviews(reviews):
    """Label each review with its 1-based position.

    Args:
        reviews: Iterable of review texts.

    Returns:
        A list with one string per review, in input order. Each string is
        "Review <n>: <text>." followed by a newline character.
    """
    numbered = []
    for position, text in enumerate(reviews, start=1):
        numbered.append(f"Review {position}: {text}.\n")
    return numbered

#### Insert reviews as a column

In [None]:
# Wrap the Python formatter as a Spark UDF. The return type is given as the DDL string
# "array<string>" instead of F.ArrayType(F.StringType()): those classes belong to
# pyspark.sql.types and are reachable through `F` only because that module happens to
# import them, which is not something to depend on.
format_reviews_udf = F.udf(format_reviews, returnType="array<string>")
df_formatted = df_grouped.withColumn("FormattedReviews", format_reviews_udf(F.col("Reviews_List")))

# Join the labelled reviews of each group into one string (single space between
# entries), then drop the two array columns that are no longer needed.
df_final = (
    df_formatted
    .withColumn("Reviews_Concatenated", F.concat_ws(" ", F.col("FormattedReviews")))
    .drop("Reviews_List", "FormattedReviews")
)
display(df_final)

#### Functions for LLM calls

In [None]:
# Azure OpenAI connection settings.
# Each value is read from an environment variable when one is set, so real credentials do
# not have to be typed into (and saved with) the notebook. The placeholder strings remain
# as the fallback and can still be edited by hand.
ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "Your Endpoint")
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "Your API")

In [None]:
# General GPT call function
def estimate_productModel(system_message, review, max_retries=5, initial_wait_time=20):
    """Run one chat completion against Azure OpenAI and return the reply text.

    Args:
        system_message: Instruction text sent as the "system" message.
        review: Text sent as the "user" message.
        max_retries: Maximum number of attempts made before giving up.
        initial_wait_time: Seconds to wait after the first failed attempt; the
            wait doubles after each further failed attempt.

    Returns:
        The message content of the first choice (also printed), or the string
        "N.A." when no attempt succeeds.

    Rate-limit errors, API errors that carry no HTTP status, and HTTP 408/409/5xx
    responses are retried with the doubling wait. Any other API error (for example
    400, 401 or 404) ends the call immediately, because repeating the identical
    request would fail the same way. No wait happens after the final attempt.
    """
    API_VERSION = "2024-02-01"
    MODEL_NAME = "gpt-4-turbo-2024-04-09"

    client = AzureOpenAI(
        azure_endpoint=ENDPOINT,
        api_key=API_KEY,
        api_version=API_VERSION,
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": review},
    ]

    wait_time = initial_wait_time
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
            )
            response = completion.choices[0].message.content
            print(response)
            return response

        except RateLimitError as e:
            # Must be listed before APIError, of which it is a subclass.
            reason = f"Rate limit error (429) encountered: {e}."

        except APIError as e:
            # Only errors that can plausibly clear up on their own are worth the wait.
            status_code = getattr(e, "status_code", None)
            retryable = (
                status_code is None
                or status_code in (408, 409)
                or status_code >= 500
            )
            if not retryable:
                print(f"Non-retryable API error ({status_code}) encountered: {e}")
                break
            status_label = status_code if status_code is not None else "no HTTP status"
            reason = f"API error ({status_label}) encountered: {e}."

        except OpenAIError as e:
            print("An unexpected OpenAI error occurred:", e)
            break

        # Reached only after a retryable failure.
        if attempt == max_retries:
            # Nothing is left to retry, so do not sleep before returning.
            print(f"{reason} Giving up after {max_retries} attempts.")
            break

        print(f"{reason} Retrying {attempt}/{max_retries} after {wait_time} seconds...")
        time.sleep(wait_time)
        wait_time *= 2

    return "N.A."

In [None]:
# Sentiment GPT call function
def llm_sentiment(review):
    """Ask the model for the overall sentiment of a block of reviews.

    Args:
        review: Text containing the reviews to judge.

    Returns:
        Whatever estimate_productModel returns for the sentiment prompt.
    """
    # Adjacent literals replace the backslash continuations inside a single literal:
    # a continuation keeps the next source line's indentation as part of the string,
    # which put runs of spaces into the prompt. The last sentence also spelled the third
    # label "Neutrual", contradicting the "Neutral" label used earlier in the same
    # prompt; it now names the label consistently.
    system_message = (
        "Please review the list of review message and decide if the overall sentiment is "
        "*Positive*, *Neutral*, or *Negative*. "
        "If there are reviews contains equal amount of positive and negative, the sentiment is Neutral. "
        "If the reviews has more positive than negative, its Positive. "
        "If the reviews has less positive than negative, its Negative. "
        "Please only reply with Positive, Negative or Neutral."
    )
    return estimate_productModel(system_message, review)

In [None]:
# Insight GPT call function
def llm_insight(review):
    """Ask the model for a short overall insight across a block of reviews.

    Args:
        review: Text containing the reviews to analyse.

    Returns:
        Whatever estimate_productModel returns for the insight prompt.
    """
    prompt_sentences = (
        "Please provide insight within the list of reviews.",
        "Its mainly to extract what are the key points within the reviews.",
        "Please do not comment the reviews respectively but ensure providing the overall insight.",
        "Please restrain the response within 2 sentence.",
    )
    # The four-space separator is part of the prompt text as it is sent today; keep it
    # so the request stays unchanged.
    return estimate_productModel("    ".join(prompt_sentences), review)

In [None]:
# Summary GPT call function
def llm_summary(review):
    """Ask the model for a short high-level summary of a block of reviews.

    Args:
        review: Text containing the reviews to summarise.

    Returns:
        Whatever estimate_productModel returns for the summary prompt.
    """
    # NOTE(review): "senstence" and "ovarall" are misspelled, and each sentence after the
    # first is preceded by four spaces. Both are reproduced exactly so the text sent to
    # the model is unchanged.
    system_message = (
        "Please provide high-level summary within the list of reviews."
        "    Please restrain the response within 2 senstence."
        "    Please do not comment the reviews respectively but ensure providing the overall summary."
        "    This should include the extracted insight and the ovarall sentiment."
    )
    return estimate_productModel(system_message, review)

In [None]:
# Run the three LLM analyses for every row of df_final and append the results to the
# gold table in batches.
#
# NOTE(review): IDs restart at 1 on every run and the table is written in "append" mode,
# so re-running this cell adds a second copy of the rows with repeated IDs.
output_data = []     # every result row produced by this run; read by the cell below
pending_rows = []    # rows produced since the last write to the table
id_counter = count(1)
batch_size = 60
target_table = "AdventureWorks_Lakehouse.Ops_Gold.Fact_AiSocialReviewAnalysis"

# (label printed before the call, value stored in LlmKey, function that calls the model)
analyses = [
    ("average_sentiment", "AverageSentiment", llm_sentiment),
    ("extracted_insights", "ExtractedInsights", llm_insight),
    ("Summary", "Summary", llm_summary),
]

for row in df_final.collect():
    year_month = row['PostedYearMonth']
    product_model_name = row['ProductModelName']
    product_model_id = row['ProductModelId']
    review_source = row['ReviewSource']
    review = row['Reviews_Concatenated']

    print ("review")
    print (review)

    # One result row per analysis, all sharing the group's identifying columns.
    for label, llm_key, analyse in analyses:
        print (label)
        llm_value = analyse(review)
        result_row = Row(
            ID=next(id_counter),
            YearMonth=year_month,
            ProductModelId=product_model_id,
            ProductModelName=product_model_name,
            ReviewSource=review_source,
            LlmKey=llm_key,
            LlmValue=llm_value
        )
        output_data.append(result_row)
        pending_rows.append(result_row)

    if len(pending_rows) >= batch_size:
        # Convert to DataFrame and save the current batch
        batch_df = spark.createDataFrame(pending_rows)
        batch_df.write.mode("append").saveAsTable(target_table)

        # Only the write buffer is emptied; output_data keeps the whole run so the
        # following cell can build a DataFrame of all results rather than just the
        # leftover of the last batch (or nothing at all).
        pending_rows.clear()
        print("Batch saved successfully.")

# Save any remaining entries if they don't make up a full batch
if pending_rows:
    batch_df = spark.createDataFrame(pending_rows)
    batch_df.write.mode("append").saveAsTable(target_table)
    pending_rows.clear()
    print("Final batch saved successfully.")

In [None]:
# Build a DataFrame from whatever rows output_data holds when this cell runs.
# No schema is passed, so column types are inferred from the Row objects.
# NOTE(review): with an empty list there is nothing to infer a schema from, so this call
# presumably fails — confirm output_data is non-empty before relying on this cell.
df_output = spark.createDataFrame(output_data)

In [None]:
# Print the first rows of df_output as plain text (show() defaults to 20 rows).
df_output.show()

In [None]:
# Read back at most 1000 rows of the gold table and render them.
# NOTE(review): this rebinds `df`, which earlier cells use for the cleaned reviews;
# re-running those earlier cells after this one would pick up this table instead.
gold_preview_sql = "SELECT * FROM AdventureWorks_Lakehouse.Ops_Gold.fact_aisocialreviewanalysis LIMIT 1000"
df = spark.sql(gold_preview_sql)
display(df)

In [None]:
# Count the rows of `df` as currently bound. When the notebook is run top to bottom that
# is the gold-table preview from the previous cell, which its LIMIT caps at 1000 rows.
df.count()