In [1]:
# Import packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [51]:
# Load the raw review data for the utilitarian product (mouse).
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
mouse_reviews_csv = "/Users/paulahofmann/Documents/Coding/Online-Review/DataPreperation/Meta_Mouse.csv"
data_utilitarian_mouse = pd.read_csv(mouse_reviews_csv)

## 1. Data Cleaning

In [11]:
# Drop reviews that contain the thumbs-up emoji or the ":)" emoticon,
# then drop reviews with fewer than 3 words.

# Regular expression matching the thumbs-up emoji or a literal ":)"
special_characters_pattern = r'👍|:\)'

# Keep only the rows whose 'text' does NOT match the pattern.
# na=False makes a missing review text count as "no match", so the mask is
# strictly boolean; without it a NaN in 'text' puts NaN into the mask and both
# the `~` negation and the boolean indexing fail.
contains_special_characters = data_utilitarian_mouse['text'].str.contains(
    special_characters_pattern, regex=True, na=False
)
data_utilitarian_mouse = data_utilitarian_mouse[~contains_special_characters]

# Number of whitespace-separated words in each review (NaN where 'text' is missing)
word_counts = data_utilitarian_mouse['text'].str.split().str.len()

# Keep only reviews with more than two words. A NaN word count compares as
# False, so rows with missing review text are removed by this step.
data_utilitarian_mouse = data_utilitarian_mouse[word_counts > 2]

# data_utilitarian_mouse now holds only reviews without the special characters
# and with at least three words.


In [45]:
# Drop reviews whose text contains the word "todo" followed by a space and a
# further word character.
# NOTE(review): presumably a heuristic for removing Spanish-language reviews -- confirm.
es_pattern = r'\btodo \b'
# na=False treats missing review text as "no match" so the mask stays boolean
# and the `~` negation / boolean indexing cannot fail on NaN.
data_utilitarian_mouse = data_utilitarian_mouse[
    ~data_utilitarian_mouse['text'].str.contains(es_pattern, na=False)
]


In [21]:
# Remove every "[[VIDEOID:...]]" marker from the review text

# Regular expression matching a whole [[VIDEOID:...]] marker; the capture group
# holds the id between "VIDEOID:" and the closing "]]".
video_id_and_number_pattern = r'\[\[VIDEOID:([^\]]*)\]\]'

# Collect every marker found in the 'text' column (kept for inspection).
# Series.str.findall returns, per row, the list of captured ids; rows with
# missing text are skipped via dropna().
video_id_and_number_list = [
    '[[VIDEOID:' + video_id + ']]'
    for ids_in_review in data_utilitarian_mouse['text'].dropna().str.findall(video_id_and_number_pattern)
    for video_id in ids_in_review
]

# Strip all markers in a single vectorised pass. regex=True is explicit because
# the default of Series.str.replace differs between pandas versions, and a
# literal such as "[[VIDEOID:x]]" would be misread as a character class if it
# were interpreted as a regular expression.
# assign() builds a new frame, which avoids writing into a filtered slice.
data_utilitarian_mouse = data_utilitarian_mouse.assign(
    text=data_utilitarian_mouse['text'].str.replace(video_id_and_number_pattern, '', regex=True)
)

## 2. Text Preprocessing



In [52]:
import spacy

# Load spaCy's small English pipeline once; the preprocessing functions reuse it
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

# Function to preprocess text using spaCy with lemmatization and lowercasing

def preprocess_text(text):
    """Lemmatize and lowercase a review text with spaCy.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. the float NaN pandas uses
        for a missing review, or None) is treated as an empty review.

    Returns
    -------
    str
        The lowercased lemma of every token in ``text`` (punctuation and stop
        words included), joined by single spaces. An empty string for
        non-string input.
    """
    # spaCy only accepts str; a missing review would otherwise crash .apply()
    if not isinstance(text, str):
        return ''

    # Run the spaCy pipeline loaded in the module-level ``nlp``
    doc = nlp(text)

    # Lemmatization and lowercasing
    return ' '.join(token.lemma_.lower() for token in doc)




In [53]:
# Second preprocessing variant: additionally filters out stop words and punctuation/symbols
import string

# spaCy's built-in English stop-word collection.
# NOTE(review): not referenced by the code visible here -- preprocess_text_stops
# relies on token.is_stop instead; confirm whether this name is still needed.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Function to preprocess text using spaCy, dropping stop words and non-alphabetic tokens
def preprocess_text_stops(text):
    """Lemmatize and lowercase a review text, keeping only content words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. the float NaN pandas uses
        for a missing review, or None) is treated as an empty review.

    Returns
    -------
    str
        The lowercased lemmas of the tokens that are alphabetic, are not stop
        words and are not punctuation, joined by single spaces. An empty
        string for non-string input.
    """
    # spaCy only accepts str; a missing review would otherwise crash .apply()
    if not isinstance(text, str):
        return ''

    # Run the spaCy pipeline loaded in the module-level ``nlp``
    doc = nlp(text)

    # Lemmatization, lowercasing, and removal of symbols, punctuation, and stop words
    kept_lemmas = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop and not token.is_punct
    ]
    return ' '.join(kept_lemmas)

In [55]:
# Derive both cleaned-text columns from the raw "text" column:
#   text_cleaned  -> lemmatized + lowercased
#   text_cleaned1 -> lemmatized + lowercased, stop words / non-alphabetic tokens removed
for cleaned_column, preprocessor in (
    ('text_cleaned', preprocess_text),
    ('text_cleaned1', preprocess_text_stops),
):
    data_utilitarian_mouse[cleaned_column] = data_utilitarian_mouse['text'].apply(preprocessor)



## 3. Sentiment Analysis of the Review Text

In [56]:
# Import the Hugging Face Transformers classes and PyTorch used for sentiment analysis
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [57]:
# Load the tokenizer and the sequence-classification model from the same checkpoint
sentiment_checkpoint = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)


In [60]:
# Build Function to analyze the sentiment of a text

def analyze_sentiment(text):
    """Predict the sentiment class of a review text.

    Uses the module-level ``tokenizer`` and ``sentiment_model``.

    Parameters
    ----------
    text : str
        Review text to classify.

    Returns
    -------
    int
        Index of the highest-scoring logit.
        NOTE(review): the meaning of each index (presumably a star rating) is
        defined by the checkpoint's label mapping -- confirm against the model card.
    """
    # Tokenize the text into PyTorch tensors; truncation=True shortens over-long inputs
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no_grad() skips autograd bookkeeping, which saves memory
    # and time when this is applied to every review
    with torch.no_grad():
        outputs = sentiment_model(**inputs)
    # Get the predicted label
    predicted_label = torch.argmax(outputs.logits)
    return predicted_label.item()  # Return the predicted label as an integer


In [61]:
# Score every review of the utilitarian product and store the predicted label
review_texts = data_utilitarian_mouse['text']
data_utilitarian_mouse['sentiment'] = review_texts.apply(analyze_sentiment)


In [62]:
# Persist the frame, including the sentiment column, without the index
sentiment_csv_name = 'data_utilitarian_mouse_Senti.csv'
data_utilitarian_mouse.to_csv(sentiment_csv_name, index=False)
