In [45]:
# Importing Libraries
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

# Sentiment Analysis
Sentiment is measured with the score produced by the LiYuan/amazon-review-sentiment-analysis model.

In [53]:
# Folder that holds both raw review exports (kept in one place so the
# location only has to be changed once)
RAW_DATA_DIR = "/Users/paulahofmann/Documents/Coding/Online-Review/2 FeaturePreperation/Data_with_Features/Final Data/Old Data"

# Reading data hedonic
data_hedonic = pd.read_csv(f"{RAW_DATA_DIR}/hedonic_raw.csv")

# Add the 'Prod type' column with the corresponding values
data_hedonic['Prod type'] = 'Hedonic'

# Reading data utilitarian
data_utilitarian = pd.read_csv(f"{RAW_DATA_DIR}/utilitarian_raw.csv")

# Add the 'Prod type' column with the corresponding values
data_utilitarian['Prod type'] = 'Utilitarian'

# Merge data to get one dataset for easier application of sentiment analysis.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each half keeps its own index, so the same row labels would occur twice.
data = pd.concat([data_utilitarian, data_hedonic], ignore_index=True)

# 1. Data Cleaning
First, the review text is cleaned before the sentiment analysis is applied. The following code performs the cleaning:

In [63]:
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Seed langdetect so that its results are consistent between runs
DetectorFactory.seed = 0

# Discard rows whose 'text' is NaN
data = data.dropna(subset=['text'])

# Discard rows whose 'text' is empty once surrounding whitespace is stripped
has_content = data['text'].str.strip().ne('')
data = data[has_content]

# Strip every character that is not an ASCII letter, a digit, or whitespace
def remove_special_characters(text):
    """Return `text` with all characters outside A-Z, a-z, 0-9 and whitespace deleted.

    Whitespace (spaces, tabs, newlines) is left untouched; punctuation,
    symbols and non-ASCII letters are dropped without a replacement.
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

# Drop "[[VIDEOID:...]]" markers while their brackets and colon are still
# present. The special character filter below deletes '[', ']' and ':', which
# would turn a marker into a plain "VIDEOID<id>" token that a bracket-based
# pattern can no longer find.
data['text'] = data['text'].str.replace(r'\[\[VIDEOID:[^\]]*\]\]', '', regex=True)

# Apply the special character filter
data['text'] = data['text'].apply(remove_special_characters)

# Language check used to filter out Spanish reviews
def is_not_spanish(text):
    """Return True unless langdetect identifies `text` as Spanish ('es').

    When detection raises LangDetectException the review is kept, i.e. True
    is returned.
    """
    try:
        detected_language = detect(text)
    except LangDetectException:
        # Detection failed: keep the review rather than discard it
        return True
    return detected_language != 'es'

# Keep only the reviews that are not detected as Spanish
non_spanish_mask = data['text'].apply(is_not_spanish)
data = data[non_spanish_mask]

# Function to filter out reviews with fewer than `min_words` words
def has_minimum_words(text, min_words=3):
    """Return True when `text` contains at least `min_words` words.

    Words are the whitespace-separated pieces produced by `str.split()`, so
    leading, trailing and repeated whitespace do not count as words.

    Parameters
    ----------
    text : str
        The review text to check.
    min_words : int, default 3
        Smallest number of words a review must have to be kept.
    """
    return len(text.split()) >= min_words

# Drop the reviews that fail the minimum word count
long_enough_mask = data['text'].apply(has_minimum_words)
data = data[long_enough_mask]

# Show the first rows of the cleaned data
print(data.head())



ModuleNotFoundError: No module named 'langdetect'

In [55]:
# Deleting the string "[[VIDEOID:...]]" from the review text
import re

# Matches one complete [[VIDEOID:...]] marker; the capture group is the part
# between the colon and the closing "]]"
video_id_and_number_pattern = r'\[\[VIDEOID:([^\]]*)\]\]'

# Collect every marker found in the 'text' column, rebuilt in its literal form
video_id_and_number_list = [
    '[[VIDEOID:' + match + ']]'
    for text in data['text']
    for match in re.findall(video_id_and_number_pattern, text)
]

# Remove all markers in one pass over the column. The pattern is applied
# directly with regex=True instead of replacing each collected marker in turn:
# a literal marker such as "[[VIDEOID:abc]]" is itself a valid regular
# expression (a character class followed by "]"), so wherever str.replace
# treats its argument as a regex it would delete the wrong characters.
data['text'] = data['text'].str.replace(video_id_and_number_pattern, '', regex=True)



In [56]:
import re

# Pattern for the opening "[[VIDEOID:" followed by a run of digits.
# NOTE(review): the closing "]]" is not part of this pattern, so it stays in
# the text after the replacement - confirm whether that is intended.
video_id_and_number_pattern = r'\[\[VIDEOID:([0-9]+)'

# Remove every match from the review text. regex=True is passed explicitly:
# when str.replace treats the pattern as a literal string it looks for the
# backslashes and brackets verbatim and therefore changes nothing.
data['text'] = data['text'].str.replace(video_id_and_number_pattern, '', regex=True)


In [58]:
import re

# Pattern for "[[VIDEOID:" together with every following non-whitespace
# character (this includes the closing "]]" when it directly follows the ID)
video_id_and_number_pattern = r'\[\[VIDEOID:([^\s]+)'

# Remove every match and store the result in 'text_cleaned'; 'text' itself is
# left unchanged. regex=True is passed explicitly: when str.replace treats the
# pattern as a literal string it looks for the backslashes and brackets
# verbatim and therefore changes nothing.
data['text_cleaned'] = data['text'].str.replace(video_id_and_number_pattern, '', regex=True)


# 2. Text Preprocessing



In [59]:
import spacy

# Load the English language model
# (the resulting `nlp` object is called on each review text by the
# preprocessing functions of this section)
nlp = spacy.load("en_core_web_sm")

# Lemmatize and lowercase a text with spaCy

def preprocess_text(text):
    """Return `text` as a space-separated string of lowercased spaCy lemmas.

    Every token of the parsed text is kept, whatever kind of token it is.
    """
    lowered_lemmas = []
    for token in nlp(text):
        lowered_lemmas.append(token.lemma_.lower())
    return ' '.join(lowered_lemmas)


In [60]:
# Adding another column, that filters out stop words and punctuation/symbols
import string

# Define a set of stop words
# NOTE(review): nothing in the visible notebook reads `stop_words`; the
# stop-word filtering in this section relies on spaCy's `token.is_stop`.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Lemmatize and lowercase a text, keeping only alphabetic non-stop-word tokens
def preprocess_text_stops(text):
    """Return the lowercased lemmas of the content words in `text`, space-separated.

    A token is kept only if spaCy flags it as alphabetic and flags it neither
    as a stop word nor as punctuation.
    """
    kept_lemmas = []
    for token in nlp(text):
        if not token.is_alpha:
            continue
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

In [61]:
# Parse every review once with spaCy and derive both cleaned columns from the
# same parse. nlp.pipe processes the texts in batches, and the parser and
# named-entity components are switched off because only lemmas and the lexical
# flags (is_alpha / is_stop / is_punct) are read here. Calling nlp() once per
# review and per column parses each text twice, one text at a time.
all_lemmas = []
content_lemmas = []
for doc in nlp.pipe(data['text'], batch_size=256, disable=['parser', 'ner']):
    # Same output as preprocess_text: every token, lemmatized and lowercased
    all_lemmas.append(' '.join(token.lemma_.lower() for token in doc))
    # Same output as preprocess_text_stops: alphabetic, non-stop-word tokens only
    content_lemmas.append(' '.join(
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop and not token.is_punct
    ))

# Lemmatized, lowercased review text
data['text_cleaned'] = all_lemmas

# Lemmatized, lowercased review text without stop words, punctuation and symbols
data['text_cleaned1'] = content_lemmas


KeyboardInterrupt: 

# 3. Using Transformer Model for Sentiment Analysis

In the following, the amazon-review-sentiment-analysis model from Hugging Face is employed.

In [5]:
# Download the pre-trained BERT model and tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [20]:
# Load the pre-trained BERT model and tokenizer; both come from the same checkpoint
SENTIMENT_CHECKPOINT = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)


In [21]:
# Build Function to analyze the sentiment of a text

def analyze_sentiment(text):
    """Return the index of the sentiment class the model scores highest for `text`.

    Parameters
    ----------
    text : str
        A single review text. Texts longer than the tokenizer's limit are
        truncated (truncation=True).

    Returns
    -------
    int
        Position of the largest logit in the model output.
    """
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no_grad stops torch from recording the forward pass for
    # backpropagation, which is never needed here
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    # Pick the best class along the label axis (one row of logits per input text)
    predicted_label = torch.argmax(outputs.logits, dim=-1)
    return predicted_label.item()  # Return the predicted label as an integer


In [22]:
# Score every review text and store the predicted label in 'Sentiment'
predicted_labels = data['text'].map(analyze_sentiment)
data['Sentiment'] = predicted_labels

In [4]:
# Shift the sentiment labels 0-4 onto the rating scale 1-5 (0 -> 1, ..., 4 -> 5)
sentiment_mapping = {label: label + 1 for label in range(5)}

# Apply the mapping to the 'Sentiment' column.
# NOTE: running this cell a second time shifts the values 1-4 once more,
# because they are keys of the mapping as well.
shifted_sentiment = data['Sentiment'].replace(sentiment_mapping)
data['Sentiment'] = shifted_sentiment

# Verify the transformation
print(data['Sentiment'])

0        5
1        5
2        5
3        3
4        5
        ..
22780    5
22781    1
22782    1
22783    5
22784    5
Name: sentiment, Length: 22785, dtype: int64


# 4. Subjectivity Analysis 

The last step applies subjectivity analysis using the cffl/bert-base-styleclassification-subjective-neutral model.

In [62]:
# Building pipeline for text classification with BERT Subjective-Neutral model
from transformers import pipeline

# Text classification pipeline for the subjective/neutral style model
classify = pipeline(
    "text-classification",
    model="cffl/bert-base-styleclassification-subjective-neutral",
    # Equivalent to return_all_scores=True
    top_k=None,
)

# Classify one review text and return the scores for all labels
def classify_text(text):
    """Run the `classify` pipeline on `text` and return the first element of its result."""
    return classify(text)[0]

# Function to process longer texts by splitting them into smaller chunks, model can only process texts up to 512 tokens
def process_long_text(text, max_chunk_length=512, classifier=None):
    """Classify `text` chunk by chunk and return one averaged score per label.

    The text is cut into consecutive slices of at most `max_chunk_length`
    characters. Each slice is classified separately and the scores of each
    label are averaged over all slices, so every part of a long review
    contributes to the result. Simply concatenating the per-chunk results
    would let a consumer that takes the first entry per label see only the
    first chunk.

    Parameters
    ----------
    text : str
        The review text to classify.
    max_chunk_length : int, default 512
        Maximum number of characters per chunk.
    classifier : callable, optional
        Function that maps a chunk to a list of {'label': ..., 'score': ...}
        dicts. Defaults to `classify_text`.

    Returns
    -------
    list of dict
        One {'label': ..., 'score': ...} entry per label, in the order the
        labels were first seen. Empty when `text` is empty. For a text that
        fits into a single chunk the scores equal that chunk's scores.
    """
    if classifier is None:
        classifier = classify_text

    score_totals = {}  # label -> sum of that label's scores over all chunks
    chunk_count = 0
    for start in range(0, len(text), max_chunk_length):
        chunk = text[start:start + max_chunk_length]
        for entry in classifier(chunk):
            label = entry['label']
            score_totals[label] = score_totals.get(label, 0.0) + entry['score']
        chunk_count += 1

    # With no chunks score_totals is empty, so no division takes place
    return [
        {'label': label, 'score': total / chunk_count}
        for label, total in score_totals.items()
    ]

# Apply text classification to each review text in the DataFrame and extract scores for all labels
data['classification_scores'] = data['text_cleaned'].apply(process_long_text)


def _first_score(label_scores, label):
    """Return the score of the first entry tagged `label`, or None if there is none."""
    return next((entry['score'] for entry in label_scores if entry['label'] == label), None)


# Extract subjective and neutral scores separately.
# The scores are read from `data`, the frame that holds 'classification_scores';
# `data_hedonic` has neither that column nor the same number of rows.
subjective_scores = [_first_score(scores, 'SUBJECTIVE') for scores in data['classification_scores']]
neutral_scores = [_first_score(scores, 'NEUTRAL') for scores in data['classification_scores']]

# Only the subjective score is kept in the DataFrame, under the name 'Subjective'
data['Subjective'] = subjective_scores

# Drop the 'classification_scores' column as it's no longer needed
data.drop(columns=['classification_scores'], inplace=True)

# Preview the result instead of printing the whole DataFrame
print(data.head())

KeyboardInterrupt: 

In [None]:
# Name of the column that holds the product type. The loading cell of this
# notebook stores it as 'Prod type'; 'Product Type' is used when the data
# already carries a column of that name.
product_type_column = 'Product Type' if 'Product Type' in data.columns else 'Prod type'

# Filter the DataFrame for Product Type 'Hedonic'
hedonic_df = data[data[product_type_column] == 'Hedonic']

# Filter the DataFrame for Product Type 'Utilitarian'
utilitarian_df = data[data[product_type_column] == 'Utilitarian']

# Save the filtered DataFrames to separate CSV files
#hedonic_df.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/2 FeaturePreperation/Data_with_Features/Final Data/Old Data/Total_Features_Hedonic_Subj.csv', index=False)
#utilitarian_df.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/2 FeaturePreperation/Data_with_Features/Final Data/Old Data/Total_Features_Utilitarian_Subj.csv', index=False)

