In [1]:
# Packages einlesen
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Daten einlesen
data = pd.read_csv("/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/UtiliRazor.csv")



# 1. Data Cleaning

In [4]:
# Deleting special characters and review with less than 3 words
data.dropna(subset=['text'], inplace=True)

# Deleting empty rows
data = data [data['text']!= ' ']

In [6]:

# Count the number of words in each row of the 'text' column
word_counts = data['text'].str.split().str.len()

# Filter rows where the word count is greater than 2
data = data[word_counts > 3]

# Assuming df is your DataFrame
data = data[~data['text'].str.contains('é|ó', case=False, na=False)]

In [12]:
# Filter rows where 'text' column is exactly "Es"
es_pattern = r'\bmuchas \b'
data = data[~data['text'].str.contains(es_pattern)]

data = data[~data['text'].str.contains('este', case=False)]


In [13]:
# Deleting the string "[[Video ID:...]]" from the review text
import re
# Initialize an empty list to store VideoID and numbers
video_id_and_number_list = []

# Define the regular expression pattern to match the [[VIDEOID:...]] text and extract the VideoID and numbers
video_id_and_number_pattern = r'\[\[VIDEOID:([^\]]*)\]\]'

# Extract VideoID and numbers from each row in the 'text' column and save them into the list
for text in data['text']:
    matches = re.findall(video_id_and_number_pattern, text)
    for match in matches:
        video_id_and_number_list.append('[[VIDEOID:' + match +"]]")

# Use list to iterate through the 'text' column and replace the VideoID and numbers with an empty string
for video_id_and_number in video_id_and_number_list:
    data['text'] = data['text'].str.replace(video_id_and_number, '')



In [14]:
import re

# Define the regular expression pattern to match the entire [[VIDEOID:...]] text and any following numbers
video_id_and_number_pattern = r'\[\[VIDEOID:([0-9]+)'

# Use regular expressions to remove the string "[[VIDEOID:...]]" and any following numbers from the review text
data['text'] = data['text'].str.replace(video_id_and_number_pattern, '')


  data['text'] = data['text'].str.replace(video_id_and_number_pattern, '')


In [25]:
import re

# Define the regular expression pattern to match the entire [[VIDEOID:...]] text and any following alphanumeric characters
video_id_and_number_pattern = r'\[\[VIDEOID:([^\s]+)'

# Use regular expressions to remove the string "[[VIDEOID:...]]" and any following alphanumeric characters from the review text
data['text_cleaned'] = data['text_cleaned'].str.replace(video_id_and_number_pattern, '')


  data['text_cleaned'] = data['text_cleaned'].str.replace(video_id_and_number_pattern, '')


In [26]:
data.to_csv("/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/Final Data/Utilitarian_Final.csv", index=False)

## 2. Text Preprocessing



In [15]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text using spaCy with lemmatization and lowercasing

def preprocess_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Lemmatization and lowercasing
    processed_text = ' '.join([token.lemma_.lower() for token in doc])
    
    return processed_text


In [16]:
# Adding another column, that filters out stop words and punctuation/symbols
import string

# Define a set of stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Function to preprocess text using spaCy
def preprocess_text_stops(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Lemmatization, lowercasing, and removal of symbols, punctuation, and stop words
    processed_text = ' '.join([token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop and not token.is_punct])
    
    return processed_text

In [17]:
# Apply the preprocessing function to the "text" column
data['text_cleaned'] = data['text'].apply(preprocess_text)

# Apply the preprocessing function with stopword removal to the "text" column
data['text_cleaned1'] = data['text'].apply(preprocess_text_stops)


In [18]:
data.to_csv("/Users/paulahofmann/Documents/Coding/Online-Review/FeaturePreperation/Data_with_Features/UtiliRazor.csv", index=False)

## Using Sentiment Analysis for Review Text

In [19]:
# Download the pre-trained BERT model and tokenizer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [20]:
# Load the pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis")


In [21]:
# Build Function to analyze the sentiment of a text

def analyze_sentiment(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Perform sentiment analysis
    outputs = sentiment_model(**inputs)
    # Get the predicted label
    predicted_label = torch.argmax(outputs.logits)
    return predicted_label.item()  # Return the predicted label as an integer


In [22]:
# Apply the function to the utilitarian product
data['sentiment'] = data['text'].apply(analyze_sentiment)


In [4]:

# Define the mapping dictionary for transforming sentiment score to match rating score 1-5, so that 0-4 gets 1-5
sentiment_mapping = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}

# Apply the mapping to the 'sentiment' column
data['sentiment'] = data['sentiment'].replace(sentiment_mapping)

# Verify the transformation
print(data['sentiment'])


0        5
1        5
2        5
3        3
4        5
        ..
22780    5
22781    1
22782    1
22783    5
22784    5
Name: sentiment, Length: 22785, dtype: int64
