In [1]:
import pandas as pd

# Load the raw review export.
# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
REVIEWS_CSV = r'C:\Users\drewr\OneDrive\Escritorio\Google Cloud Training\Sentiment Analisis Python.csv'
column_names = ['ReviewID', 'CustomerID', 'ProductID', 'ReviewDate', 'Rating', 'ReviewText']
df = pd.read_csv(
    REVIEWS_CSV,
    sep=';',
    quotechar='"',
    escapechar='\\',
    header=None,  # no header row is taken from the file; column names come from `names`
    names=column_names,
    engine='python',
)

# Report how many values are missing per column, before any rows are removed
print("Missing values in each column:", df.isna().sum(), sep="\n")

# Keep only rows that have review text, then parse ReviewDate into datetimes
df = (
    df.dropna(subset=['ReviewText'])
      .assign(ReviewDate=lambda reviews: pd.to_datetime(reviews['ReviewDate']))
)

# Preview the cleaned frame (the trailing expression is rendered as the cell output)
print("\nFirst 5 rows of the DataFrame:")
df.head()

Missing values in each column:
ReviewID      0
CustomerID    0
ProductID     0
ReviewDate    0
Rating        0
ReviewText    0
dtype: int64

First 5 rows of the DataFrame:


Unnamed: 0,ReviewID,CustomerID,ProductID,ReviewDate,Rating,ReviewText
0,1,77,18,2023-12-23,3,"Average experience, nothing special."
1,2,80,19,2024-12-25,5,The quality is top-notch.
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper."
4,5,64,2,2023-07-16,3,"Average experience, nothing special."


In [2]:
%pip install textblob
from textblob import TextBlob

# Function to get sentiment polarity
def get_sentiment(text):
    """Return the polarity component of TextBlob's sentiment for ``text``.

    TextBlob documents polarity as a float in [-1.0, 1.0], where negative
    values indicate negative wording and positive values positive wording.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every review text and store the polarity in a new 'Sentiment' column
df = df.assign(Sentiment=df['ReviewText'].apply(get_sentiment))

# Preview the frame with the added column (rendered as the cell output)
print("\nDataFrame with Sentiment Analysis:")
df.head()

Note: you may need to restart the kernel to use updated packages.

DataFrame with Sentiment Analysis:


Unnamed: 0,ReviewID,CustomerID,ProductID,ReviewDate,Rating,ReviewText,Sentiment
0,1,77,18,2023-12-23,3,"Average experience, nothing special.",0.103571
1,2,80,19,2024-12-25,5,The quality is top-notch.,1.0
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.,0.333333
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper.",0.7
4,5,64,2,2023-07-16,3,"Average experience, nothing special.",0.103571


In [3]:
def hybrid_sentiment_category(row, threshold=0.1):
    """Label one review by combining its star rating with its text polarity.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Rating'`` and ``'Sentiment'``;
        ``df.apply(hybrid_sentiment_category, axis=1)`` passes each
        DataFrame row.
    threshold : float, optional
        Magnitude a polarity score must reach to count as clearly positive
        or clearly negative. Defaults to 0.1. Expected to be non-negative.

    Returns
    -------
    str
        One of 'Positive', 'Positive (Weak)', 'Negative', 'Negative (Weak)',
        'Neutral (Check)' or 'Neutral'.

    Notes
    -----
    Ratings >= 4 and ratings <= 2 compare the score inclusively (>=, <=).
    Every other rating takes the middle branch, which compares strictly, so
    a score exactly equal to +/-threshold is 'Neutral' there.
    NOTE(review): confirm whether this boundary asymmetry is intended.
    """
    rating = row['Rating']
    sentiment = row['Sentiment']

    if rating >= 4:
        if sentiment >= threshold:
            # High rating and positive wording agree
            return 'Positive'
        if sentiment <= -threshold:
            # High rating but negative wording: flag for manual review
            return 'Neutral (Check)'
        # Wording is not decisive, so the rating decides
        return 'Positive (Weak)'

    if rating <= 2:
        if sentiment <= -threshold:
            # Low rating and negative wording agree
            return 'Negative'
        if sentiment >= threshold:
            # Low rating but positive wording: flag for manual review
            return 'Neutral (Check)'
        # Wording is not decisive, so the rating decides
        return 'Negative (Weak)'

    # Middle ratings (e.g. 3): the wording alone decides, using strict comparisons
    if sentiment > threshold:
        return 'Positive'
    if sentiment < -threshold:
        return 'Negative'
    return 'Neutral'

# Apply the hybrid categorization
df = df.assign(
    HybridSentiment=df.apply(hybrid_sentiment_category, axis=1)  # axis=1: one call per row
)

# Preview the first ten labelled reviews
print(
    "\nDataFrame with Hybrid Sentiment Categories:",
    df[['ReviewID', 'Rating', 'Sentiment', 'HybridSentiment']].head(10),
    sep="\n",
)

# Tally how many reviews received each label
print(
    "\nCount of Hybrid Sentiment Categories:",
    df['HybridSentiment'].value_counts(),
    sep="\n",
)


DataFrame with Hybrid Sentiment Categories:
   ReviewID  Rating  Sentiment HybridSentiment
0         1       3   0.103571        Positive
1         2       5   1.000000        Positive
2         3       4   0.333333        Positive
3         4       3   0.700000        Positive
4         5       3   0.103571        Positive
5         6       4   0.200000        Positive
6         7       3   0.103571        Positive
7         8       5   1.000000        Positive
8         9       4   0.625000        Positive
9        10       5   0.600000        Positive

Count of Hybrid Sentiment Categories:
HybridSentiment
Positive           1015
Negative            200
Negative (Weak)      51
Positive (Weak)      49
Neutral (Check)      48
Name: count, dtype: int64


In [4]:
# Select the reviews whose label contains 'Check' so they can be inspected by hand
conflict_mask = df['HybridSentiment'].str.contains('Check')
conflicting_reviews = df.loc[
    conflict_mask,
    ['ReviewID', 'ReviewText', 'Rating', 'Sentiment', 'HybridSentiment'],
]

print("\nConflicting Reviews (Manual Check Recommended):", conflicting_reviews, sep="\n")


Conflicting Reviews (Manual Check Recommended):
      ReviewID                                         ReviewText  Rating  \
57          58  The product  is okay, but the instructions wer...       2   
61          62               Average experience, nothing special.       2   
98          99               Average experience, nothing special.       2   
105        106               Average experience, nothing special.       2   
113        114               Average experience, nothing special.       2   
120        121                Good quality, but could be cheaper.       2   
127        128               Average experience, nothing special.       2   
146        147               Average experience, nothing special.       2   
147        148               Average experience, nothing special.       1   
227        228  The product is okay, but the instructions were...       2   
289        290               Average experience, nothing special.       2   
353        354             

In [5]:
import pandas as pd
from textblob import TextBlob

# Load the raw review export.
# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
REVIEWS_CSV = r'C:\Users\drewr\OneDrive\Escritorio\Google Cloud Training\Sentiment Analisis Python.csv'
column_names = ['ReviewID', 'CustomerID', 'ProductID', 'ReviewDate', 'Rating', 'ReviewText']
df = pd.read_csv(
    REVIEWS_CSV,
    sep=';',
    quotechar='"',
    escapechar='\\',
    header=None,  # no header row is taken from the file; column names come from `names`
    names=column_names,
    engine='python',
)

# Report how many values are missing per column, before any rows are removed
print("Missing values in each column:", df.isna().sum(), sep="\n")

# Keep only rows that have review text, then parse ReviewDate into datetimes
df = (
    df.dropna(subset=['ReviewText'])
      .assign(ReviewDate=lambda reviews: pd.to_datetime(reviews['ReviewDate']))
)

# Function to get sentiment polarity
def get_sentiment(text):
    """Return the polarity component of TextBlob's sentiment for ``text``.

    TextBlob documents polarity as a float in [-1.0, 1.0], where negative
    values indicate negative wording and positive values positive wording.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every review text and store the polarity in a new 'Sentiment' column
df = df.assign(Sentiment=df['ReviewText'].apply(get_sentiment))

# Hybrid sentiment categorization function
def hybrid_sentiment_category(row, threshold=0.1):
    """Label one review by combining its star rating with its text polarity.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'Rating'`` and ``'Sentiment'``;
        ``df.apply(hybrid_sentiment_category, axis=1)`` passes each
        DataFrame row.
    threshold : float, optional
        Magnitude a polarity score must reach to count as clearly positive
        or clearly negative. Defaults to 0.1. Expected to be non-negative.

    Returns
    -------
    str
        One of 'Positive', 'Positive (Weak)', 'Negative', 'Negative (Weak)',
        'Neutral (Check)' or 'Neutral'.

    Notes
    -----
    Ratings >= 4 and ratings <= 2 compare the score inclusively (>=, <=).
    Every other rating takes the middle branch, which compares strictly, so
    a score exactly equal to +/-threshold is 'Neutral' there.
    NOTE(review): confirm whether this boundary asymmetry is intended.
    """
    rating = row['Rating']
    sentiment = row['Sentiment']

    if rating >= 4:
        if sentiment >= threshold:
            # High rating and positive wording agree
            return 'Positive'
        if sentiment <= -threshold:
            # High rating but negative wording: flag for manual review
            return 'Neutral (Check)'
        # Wording is not decisive, so the rating decides
        return 'Positive (Weak)'

    if rating <= 2:
        if sentiment <= -threshold:
            # Low rating and negative wording agree
            return 'Negative'
        if sentiment >= threshold:
            # Low rating but positive wording: flag for manual review
            return 'Neutral (Check)'
        # Wording is not decisive, so the rating decides
        return 'Negative (Weak)'

    # Middle ratings (e.g. 3): the wording alone decides, using strict comparisons
    if sentiment > threshold:
        return 'Positive'
    if sentiment < -threshold:
        return 'Negative'
    return 'Neutral'

# Label every review; axis=1 hands each row to the function in turn
df = df.assign(HybridSentiment=df.apply(hybrid_sentiment_category, axis=1))

# Function to create sentiment buckets
def categorize_sentiment_bucket(hybrid_sentiment):
    """Translate a hybrid sentiment label into a reporting bucket.

    The checks are substring tests, applied in this order:

    * contains 'Check'  -> 'Conflict - Requires Review'
    * contains 'Weak'   -> 'Rating-Driven Positive' or 'Rating-Driven Negative',
      depending on which polarity word the label contains ('Positive' is
      tested first)
    * anything else     -> the label prefixed with 'Aligned '

    Returns ``None`` for a label that contains 'Weak' but neither
    'Positive' nor 'Negative'.
    """
    if 'Check' in hybrid_sentiment:
        return 'Conflict - Requires Review'

    if 'Weak' not in hybrid_sentiment:
        return f'Aligned {hybrid_sentiment}'

    # Dict order fixes the precedence: 'Positive' is looked for before 'Negative'
    rating_driven_buckets = {
        'Positive': 'Rating-Driven Positive',
        'Negative': 'Rating-Driven Negative',
    }
    for polarity_word, bucket in rating_driven_buckets.items():
        if polarity_word in hybrid_sentiment:
            return bucket

    # A 'Weak' label without a polarity word has no bucket
    return None

# Map each hybrid label to its reporting bucket in a new 'SentimentBucket' column
df = df.assign(SentimentBucket=df['HybridSentiment'].apply(categorize_sentiment_bucket))

# Preview up to 30 rows of the enriched frame (rendered as the cell output)
print("\nUpdated DataFrame with Sentiment Buckets:")
df.head(30)

Missing values in each column:
ReviewID      0
CustomerID    0
ProductID     0
ReviewDate    0
Rating        0
ReviewText    0
dtype: int64

Updated DataFrame with Sentiment Buckets:


Unnamed: 0,ReviewID,CustomerID,ProductID,ReviewDate,Rating,ReviewText,Sentiment,HybridSentiment,SentimentBucket
0,1,77,18,2023-12-23,3,"Average experience, nothing special.",0.103571,Positive,Aligned Positive
1,2,80,19,2024-12-25,5,The quality is top-notch.,1.0,Positive,Aligned Positive
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.,0.333333,Positive,Aligned Positive
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper.",0.7,Positive,Aligned Positive
4,5,64,2,2023-07-16,3,"Average experience, nothing special.",0.103571,Positive,Aligned Positive
5,6,81,1,2025-12-21,4,Customer support was very helpful.,0.2,Positive,Aligned Positive
6,7,16,1,2024-01-29,3,"Average experience, nothing special.",0.103571,Positive,Aligned Positive
7,8,55,8,2024-08-15,5,The quality is top-notch.,1.0,Positive,Aligned Positive
8,9,3,13,2023-09-01,4,"I love this product, will buy again!",0.625,Positive,Aligned Positive
9,10,78,6,2024-06-17,5,"Excellent product, highly recommend!",0.6,Positive,Aligned Positive


In [6]:
# Write the enriched reviews to disk and report where they went.
# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
output_path = r'C:\Users\drewr\OneDrive\Escritorio\Google Cloud Training\Sentiment_Analysis_Results_With_Insights1.csv'
df.to_csv(
    path_or_buf=output_path,
    index=False,  # leave the row index out of the file
)
print(f"\nUpdated DataFrame saved to: {output_path}")


Updated DataFrame saved to: C:\Users\drewr\OneDrive\Escritorio\Google Cloud Training\Sentiment_Analysis_Results_With_Insights1.csv
