In [3]:
import os
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from collections import Counter

# IBM COS credentials
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id='Ca8LvwXldHXgjKjvBwqFs1Gbta7TMbuk-KJK5KxKy_WD',
    ibm_auth_endpoint='https://iam.cloud.ibm.com/oidc/token',
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.us-south.cloud-object-storage.appdomain.cloud'
)

# Load data
bucket = 'bucket-qa4k0okrcix3rwg'
object_key = 'data revised.csv'

body = cos_client.get_object(Bucket=bucket, Key=object_key)['Body']
if not hasattr(body, "iter"):
    body.iter = types.MethodType(iter, body)

data = pd.read_csv(body)
data = data.dropna(subset=["Review", "Rating"])
data['Review'] = data['Review'].astype(str)

# Filter and prepare data
data = data[data['Rating'].isin([1, -1])]
X = data['Review']
y = data['Rating']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluation
y_pred = model.predict(X_test_vec)
model_accuracy = accuracy_score(y_test, y_pred)

# Sentiment Summary
sentiment_counts = Counter(data['Rating'])

# Strengths and Pain Points
strengths = data[data['Rating'] == 1]['Review']
pain_points = data[data['Rating'] == -1]['Review']

# Keyword Insights
keywords = ["room service", "food quality", "customer service", "clean", "location"]
keyword_insights = {key: 0 for key in keywords}
for review in data['Review']:
    review_lower = review.lower()
    for keyword in keywords:
        if keyword in review_lower:
            keyword_insights[keyword] += 1

# Output
print("Model Accuracy:", model_accuracy)
print("\nCustomer Sentiment Summary:")
for sentiment, count in sentiment_counts.items():
    print(f"{'Positive' if sentiment == 1 else 'Negative'}: {count}")

print("\nStrengths (sample reviews):")
print(strengths.sample(3).tolist() if not strengths.empty else "No positive reviews.")

print("\nPain Points (sample reviews):")
print(pain_points.sample(3).tolist() if not pain_points.empty else "No negative reviews.")

print("\nKeyword Insights:")
for keyword, count in keyword_insights.items():
    print(f"{keyword.capitalize()}: {count} mentions")

Model Accuracy: 0.9434735117422174

Customer Sentiment Summary:
Positive: 15093
Negative: 3214

Strengths (sample reviews):
['great place stay stayed simpsons nights, friendly welcome great advice, room comfortable public rooms character, good breakfast, loved walk city botanical gardens took -11 minutes stay area sydney miss stayed city hotel, interesting places eat nearby, highly recommended fun better value large corporate hotel,  ', "make sure request room -108, place d'armes gorgeous places quarter make sure request room -108 huge fabulous view courtyard/pool, make sure stay walk door dinner muriel makes romantic weekend, total shout frank night manager doll,  ", 'nice hotel great location great place stay certainly choose london bridge hotel trip london, rooms stayed different occasions clean staff helpful friendly continental breakfast sufficient nice fruit salad fresh fruit not tinned kind real treat, suggestion regard rooms reason giving four-star not five-star rating check ma

In [4]:
import csv

# Predict sentiment for all reviews
data['Predicted Sentiment'] = model.predict(vectorizer.transform(data['Review']))
data['Predicted Sentiment'] = data['Predicted Sentiment'].map({1: 'Positive', -1: 'Negative'})

# Define topic keywords
keywords = ["room service", "food quality", "customer service", "clean", "location"]

# Function to extract mentioned topics
def extract_topics(review):
    review_lower = review.lower()
    mentioned = [keyword for keyword in keywords if keyword in review_lower]
    return ", ".join(mentioned) if mentioned else "None"

# Apply topic tagging
data['Topics Mentioned'] = data['Review'].apply(extract_topics)

# Save to CSV
output_path = "review_sentiment_topic_report.csv"
data.to_csv(output_path, index=False)

print(f"✅ CSV report generated: {output_path}")

✅ CSV report generated: review_sentiment_topic_report.csv


In [5]:
import pandas as pd

csv_preview = pd.read_csv("review_sentiment_topic_report.csv")

print("Preview of the generated CSV:")
print(csv_preview.head())

Preview of the generated CSV:
                                              Review  Rating  \
0  nice hotel expensive parking got good deal sta...       1   
1  ok nothing special charge diamond member hilto...      -1   
2  unique, great stay, wonderful time hotel monac...       1   
3  great stay great stay, went seahawk game aweso...       1   
4  love monaco staff husband stayed hotel crazy w...       1   

  Predicted Sentiment Topics Mentioned  
0            Positive  clean, location  
1            Negative            clean  
2            Positive         location  
3            Positive              NaN  
4            Positive              NaN  
