In [1]:
# Packages einlesen
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Daten einlesen
data = pd.read_csv("/Users/paulahofmann/Documents/Coding/Online-Review/DataPreperation/Perfume .csv")

# 1. Data Cleaning

In [4]:
# Deleting special characters and review with less than 3 words
data.dropna(subset=['text'], inplace=True)

# Deleting empty rows
data = data [data['text']!= ' ']

In [5]:

# Count the number of words in each row of the 'text' column
word_counts = data['text'].str.split().str.len()

# Filter rows where the word count is greater than 2
data = data[word_counts > 2]

# Assuming df is your DataFrame
data = data[~data['text'].str.contains('é|ó', case=False, na=False)]

In [12]:
# Filter rows where 'text' column is exactly "Es"
es_pattern = r'\bgracias \b'
data = data[~data['text'].str.contains(es_pattern)]

data = data[~data['text'].str.contains('biene', case=False)]


In [13]:
# Deleting the string "[[Video ID:...]]" from the review text
import re
# Initialize an empty list to store VideoID and numbers
video_id_and_number_list = []

# Define the regular expression pattern to match the [[VIDEOID:...]] text and extract the VideoID and numbers
video_id_and_number_pattern = r'\[\[VIDEOID:([^\]]*)\]\]'

# Extract VideoID and numbers from each row in the 'text' column and save them into the list
for text in data['text']:
    matches = re.findall(video_id_and_number_pattern, text)
    for match in matches:
        video_id_and_number_list.append('[[VIDEOID:' + match +"]]")

# Use list to iterate through the 'text' column and replace the VideoID and numbers with an empty string
for video_id_and_number in video_id_and_number_list:
    data['text'] = data['text'].str.replace(video_id_and_number, '')



  data['text'] = data['text'].str.replace(video_id_and_number, '')
  pat = re.compile(pat, flags=flags)


In [14]:
import pandas as pd

# Assuming you have your data loaded into a DataFrame called df
# df = pd.read_csv('your_data.csv')

# Drop rows where parent_asin equals "B00WXP607C"
data = data[data['parent_asin'] != 'B00WXP607C']

# Now df_filtered contains the rows where parent_asin is not "B00WXP607C"


## 2. Text Preprocessing



In [15]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text using spaCy with lemmatization and lowercasing

def preprocess_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Lemmatization and lowercasing
    processed_text = ' '.join([token.lemma_.lower() for token in doc])
    
    return processed_text


In [16]:
# Adding another column, that filters out stop words and punctuation/symbols
import string

# Define a set of stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Function to preprocess text using spaCy
def preprocess_text_stops(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Lemmatization, lowercasing, and removal of symbols, punctuation, and stop words
    processed_text = ' '.join([token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop and not token.is_punct])
    
    return processed_text

In [17]:
# Group the data by Parent ASIN and calculate the total number of helpful votes and the total number of reviews
summary = data.groupby('parent_asin').agg(
    Total_Helpful_Votes=('helpful_vote', 'sum'),  # Total number of helpful votes
    Total_Reviews=('parent_asin', 'size')  # Total number of reviews
)

print (summary)

             Total_Helpful_Votes  Total_Reviews
parent_asin                                    
B07G7FF2WC                   317            128
B07Z8FSKG4                  3555           2761
B0BJMV1QTR                  6399           8217


In [18]:
# Apply the preprocessing function to the "text" column
data['text_cleaned'] = data['text'].apply(preprocess_text)

# Apply the preprocessing function with stopword removal to the "text" column
data['text_cleaned1'] = data['text'].apply(preprocess_text_stops)



## Using Sentiment Analysis for Review Text

In [19]:
# Download the pre-trained BERT model and tokenizer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [20]:
# Load the pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("LiYuan/amazon-review-sentiment-analysis")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis")


In [21]:
# Build Function to analyze the sentiment of a text

def analyze_sentiment(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Perform sentiment analysis
    outputs = sentiment_model(**inputs)
    # Get the predicted label
    predicted_label = torch.argmax(outputs.logits)
    return predicted_label.item()  # Return the predicted label as an integer


In [22]:
# Apply the function to the utilitarian product
data['sentiment'] = data['text'].apply(analyze_sentiment)


In [25]:
data.to_csv('/Users/paulahofmann/Documents/Coding/Online-Review/SelectingData/PerfumeNewSenti.csv', index=False)
