In [None]:
# Write your imports here

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification
import torch


# Working with Text Lab
## Information retrieval, preprocessing, and feature extraction

In this lab, you'll be looking at and exploring European restaurant reviews. The dataset is rather tiny, but that's just because it has to run on any machine. In real life, just like with images, texts can be several terabytes long.

The dataset is located [here](https://www.kaggle.com/datasets/gorororororo23/european-restaurant-reviews) and as always, it's been provided to you in the `data/` folder.

### Problem 1. Read the dataset (1 point)
Read the dataset, get acquainted with it. Ensure the data is valid before you proceed.

How many observations are there? Which country is the most represented? What time range does the dataset represent?

Is the sample balanced in terms of restaurants, i.e., do you have an equal number of reviews for each one? Most importantly, is the dataset balanced in terms of **sentiment**?

In [None]:
# Forward slashes resolve on every OS. The original 'data\E...' relied on an
# unrecognised escape sequence and only worked with Windows path separators.
reviews = pd.read_csv('data/European Restaurant Reviews.csv')

# Only the last expression of a cell is rendered, so print the shape
# explicitly and leave the column index as the cell output.
print(f"Shape (rows, columns): {reviews.shape}")

reviews.columns

In [None]:
# "Most represented" means the country with the most reviews. `.max()` on a
# string column would return the alphabetically last country instead.
country_counts = reviews['Country'].value_counts()
print(f"Most represented country: {country_counts.idxmax()} ({country_counts.max()} reviews)")

country_counts

Now we are going to clean up the date column in order to prepare it for datetime conversion and then we are going to extract the range.

In [None]:
# Keep only the "<letters> <4 digits>" part of each raw date string;
# rows without such a part become NaN.
month_year = reviews['Review Date'].str.extract(r'([A-Za-z]+\s+\d{4})', expand=False)
reviews['Review Date'] = month_year


In [None]:
# Rewrite the "Sept" spelling as the three-letter "Sep" abbreviation.
reviews['Review Date'] = reviews['Review Date'].str.replace(pat='Sept', repl='Sep')

In [None]:
# Parse "<abbreviated month> <year>" strings into timestamps.
parsed_dates = pd.to_datetime(reviews['Review Date'], format='%b %Y')
reviews['Review Date'] = parsed_dates


In [None]:
# The parsed strings only carry month and year, so store monthly periods.
reviews['Review Date'] = reviews['Review Date'].dt.to_period(freq='M')

In [None]:
# Earliest and latest review month in the dataset.
review_months = reviews['Review Date']
min_date, max_date = review_months.min(), review_months.max()

print(f"Date range: {min_date} to {max_date}")


Next, we look at how the reviews are distributed across restaurants and across sentiments.

In [None]:
# Number of reviews per restaurant, largest first.
restaurant_counts = reviews['Restaurant Name'].value_counts(sort=True, ascending=False)

print(restaurant_counts)


In [None]:
# Number of reviews per sentiment label, largest first.
sentiment_counts = reviews['Sentiment'].value_counts(sort=True, ascending=False)
print("\nReviews per sentiment:\n", sentiment_counts)

The dataset is unbalanced in terms of both restaurant reviews and sentiment.

### Problem 2. Getting acquainted with reviews (1 point)
Are positive comments typically shorter or longer? Try to define a good, robust metric for "length" of a text; it's not necessary just the character count. Can you explain your findings?

We are going to use word count, sentence count and average word length. Word count will show us tendencies in review length for positive and for negative reviews. Sentence count will represent similar insight, while word length will show us the complexity of the words used.

In [None]:
def _count_words(review):
    """Return the number of whitespace-separated tokens in ``review``."""
    return len(str(review).split())


reviews['Word Count'] = reviews['Review'].apply(_count_words)
print(reviews[['Review', 'Word Count']].head())


In [None]:
# Mean review length (in words) for each sentiment label.
word_counts_by_sentiment = reviews.groupby('Sentiment')['Word Count']
avg_word_count_per_sentiment = word_counts_by_sentiment.mean()
print(avg_word_count_per_sentiment)


In [None]:
# Box plot of word counts, one box per sentiment label.
length_ax = sns.boxplot(data=reviews, x='Sentiment', y='Word Count')
length_ax.set_title('Distribution of Review Length by Sentiment')
plt.show()

The average word count being greater for negative reviews suggests that users tend to elaborate when giving a bad review, while they don't feel the need to delve into specifics when giving good feedback.

In [None]:
# sent_tokenize (and word_tokenize, used later) need the Punkt tokenizer data,
# which does not ship with NLTK. Newer NLTK releases look it up as 'punkt_tab',
# older ones as 'punkt', so fetch both; already-present packages are skipped.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# str(x) mirrors the word-count cell so a non-string value cannot crash the tokenizer.
reviews['Sentence Count'] = reviews['Review'].apply(lambda x: len(nltk.sent_tokenize(str(x))))

In [None]:
def _mean_word_length(review):
    """Return the mean length of the regex word matches found in ``review``."""
    word_lengths = [len(word) for word in re.findall(r'\w+', review)]
    return np.mean(word_lengths)


reviews['Average Word Length'] = reviews['Review'].apply(_mean_word_length)

In [None]:
# Mean number of sentences per review for each sentiment label.
sentence_counts_by_sentiment = reviews.groupby('Sentiment')['Sentence Count']
avg_sentence_count_per_sentiment = sentence_counts_by_sentiment.mean()
print(avg_sentence_count_per_sentiment)

In [None]:
# Mean of the per-review average word length for each sentiment label.
word_lengths_by_sentiment = reviews.groupby('Sentiment')['Average Word Length']
avg_word_length_per_sentiment = word_lengths_by_sentiment.mean()
print(avg_word_length_per_sentiment)

### Problem 3. Preprocess the review content (2 points)
You'll likely need to do this while working on the problems below, but try to synthesize (and document!) your preprocessing here. Your tasks will revolve around words and their connection to sentiment. While preprocessing, keep in mind the domain (restaurant reviews) and the task (sentiment analysis).

Step 1: Normalization (lower case, remove punctuation, remove special characters)

In [None]:
def preprocess_text(text):
    """Normalise a review string.

    The text is lowercased, then every character that is neither a word
    character nor whitespace is removed.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Normalised copy of every review, kept next to the raw text.
reviews['Processed Review'] = reviews['Review'].map(preprocess_text)

Step 2: Tokenization (break down the text into individual words)

In [None]:
def tokenize_text(text):
    """Split ``text`` into a list of word tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


reviews['Tokens'] = reviews['Processed Review'].map(tokenize_text)

Step 3: Removing stop words

In [None]:
nltk.download('stopwords')

# English stop-word list as a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))


def remove_stop_words(tokens):
    """Return ``tokens`` without the entries found in ``stop_words`` (order kept)."""
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept


reviews['Tokens Without Stop Words'] = reviews['Tokens'].map(remove_stop_words)

Step 4: Lemmatization (for the purpose of our sentiment analysis we are going to perform lemmatization because it provides more accuracy than stemming)

In [None]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in the original order."""
    return list(map(lemmatizer.lemmatize, tokens))


reviews['Lemmatized Tokens'] = reviews['Tokens Without Stop Words'].map(lemmatize_tokens)

Step 5: Handling rare words (remove tokens that appear less frequently)

In [None]:
# Pool the lemmatized tokens of every review so frequencies are corpus-wide.
all_tokens = []
for review_tokens in reviews['Lemmatized Tokens']:
    all_tokens.extend(review_tokens)

token_freq = Counter(all_tokens)

# Tokens seen fewer than `threshold` times in the whole corpus count as rare.
threshold = 5
rare_words = {word for word, count in token_freq.items() if count < threshold}


def filter_rare_words(tokens):
    """Return ``tokens`` without the entries found in ``rare_words`` (order kept)."""
    return list(filter(lambda word: word not in rare_words, tokens))


reviews['Filtered Tokens'] = reviews['Lemmatized Tokens'].map(filter_rare_words)

### Problem 3. Top words (1 point)
Use a simple word tokenization and count the top 10 words in positive reviews; then the top 10 words in negative reviews*. Once again, try to define what "top" words means. Describe and document your process. Explain your results.

\* Okay, you may want to see top N words (with $N \ge 10$).

First, we filter reviews by sentiment.

In [None]:
# Boolean masks for the two sentiment labels used in the dataset.
is_positive = reviews['Sentiment'] == 'Positive'
is_negative = reviews['Sentiment'] == 'Negative'

positive_reviews = reviews[is_positive]
negative_reviews = reviews[is_negative]

Second, we flatten the list of tokens and count their frequencies.

In [None]:
# Flatten the per-review token lists into one list per sentiment.
positive_tokens = []
for review_tokens in positive_reviews['Filtered Tokens']:
    positive_tokens.extend(review_tokens)

negative_tokens = []
for review_tokens in negative_reviews['Filtered Tokens']:
    negative_tokens.extend(review_tokens)

# Raw occurrence counts of every token within each sentiment.
positive_token_freq = Counter(positive_tokens)
negative_token_freq = Counter(negative_tokens)

Then we extract the top 10 words for both positive and negative reviews and display the results.

In [None]:
# The ten most frequent tokens per sentiment as (word, count) pairs.
top_10_positive_words = positive_token_freq.most_common(10)
top_10_negative_words = negative_token_freq.most_common(10)

word_frequency_columns = ['Word', 'Frequency']
top_10_positive_df = pd.DataFrame(top_10_positive_words, columns=word_frequency_columns)
top_10_negative_df = pd.DataFrame(top_10_negative_words, columns=word_frequency_columns)

for heading, table in (
    ("Top 10 Words in Positive Reviews:", top_10_positive_df),
    ("\nTop 10 Words in Negative Reviews:", top_10_negative_df),
):
    print(heading)
    print(table)

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

# One horizontal bar chart per sentiment: (axes, data, title, bar colour).
panels = [
    (ax[0], top_10_positive_df, 'Top 10 Words in Positive Reviews', 'skyblue'),
    (ax[1], top_10_negative_df, 'Top 10 Words in Negative Reviews', 'salmon'),
]

for panel, top_words_df, panel_title, bar_color in panels:
    panel.barh(top_words_df['Word'], top_words_df['Frequency'], color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel('Frequency')
    # Write each bar's frequency at the end of the bar.
    for index, value in enumerate(top_words_df['Frequency']):
        panel.text(value, index, f'{value}', va='center')

plt.tight_layout()
plt.show()

The high frequency of the word "would" in the negative reviews speaks of people perhaps finding themselves more willing to visit other places after trying a given 'bad' restaurant. The appearance of "good" as a common word in bad reviews hints that it could have been used with negation, such as "not very good" or something similar.

Food appearing high in frequency for both negative and positive results would mean that the people writing feedback bestow high importance on it. Similar insight can be drawn about wine and service.

In positive reviews, the word "staff" appears, unlike in negative ones, which could hint that people tend to praise professional staff but not criticize them directly when giving bad reviews.

### Problem 4. Review titles (2 point)
How do the top words you found in the last problem correlate to the review titles? Do the top 10 words (for each sentiment) appear in the titles at all? Do reviews which contain one or more of the top words have the same words in their titles?

Does the title of a comment present a good summary of its content? That is, are the titles descriptive, or are they simply meant to catch the attention of the reader?

In [None]:
# Only the words matter here, not their counts.
top_positive_words = {word for word, _ in top_10_positive_words}
top_negative_words = {word for word, _ in top_10_negative_words}


def contains_top_words(text, top_words):
    """Return True if any lowercased word token of ``text`` is in ``top_words``."""
    for word in word_tokenize(text.lower()):
        if word in top_words:
            return True
    return False


review_titles = reviews['Review Title']
reviews['Title Contains Top Positive Words'] = review_titles.apply(contains_top_words, args=(top_positive_words,))
reviews['Title Contains Top Negative Words'] = review_titles.apply(contains_top_words, args=(top_negative_words,))

# Summing a boolean column counts its True values.
positive_in_titles = reviews['Title Contains Top Positive Words'].sum()
negative_in_titles = reviews['Title Contains Top Negative Words'].sum()

print(f"Number of titles containing top positive words: {positive_in_titles}")
print(f"Number of titles containing top negative words: {negative_in_titles}")

In [None]:
def title_contains_review_words(review_tokens, title_tokens):
    """Return True if at least one of ``review_tokens`` also occurs in ``title_tokens``.

    Args:
        review_tokens: Iterable of tokens taken from the review body.
        title_tokens: Container of tokens taken from the review title.

    Returns:
        ``True`` on the first shared token, ``False`` if there is none.
    """
    for token in review_tokens:
        if token in title_tokens:
            return True
    return False

def _title_matches_review(row):
    """Return True if the row's title shares a token with its filtered review tokens."""
    title_tokens = word_tokenize(row['Review Title'].lower())
    return title_contains_review_words(row['Filtered Tokens'], title_tokens)


reviews['Title Matches Review'] = reviews.apply(_title_matches_review, axis=1)

matching_titles_count = reviews['Title Matches Review'].sum()
print(f"Number of reviews where the title contains words from the review content: {matching_titles_count}")

In [None]:
matching_reviews = reviews[reviews['Title Matches Review']]

# A fixed seed shows the same examples on every run. Capping n keeps the cell
# working if fewer than 10 reviews match.
sample_reviews = matching_reviews.sample(n=min(10, len(matching_reviews)), random_state=42)
print(sample_reviews[['Review Title', 'Review']])

In [None]:
# Sentiment counts restricted to reviews whose title shares a word with the body.
has_matching_title = reviews['Title Matches Review']
sentiment_distribution = reviews.loc[has_matching_title, 'Sentiment'].value_counts()
print(sentiment_distribution)


In [None]:
# Bar chart of the sentiment counts for reviews with a matching title.
matching_ax = sns.countplot(x='Sentiment', data=reviews[reviews['Title Matches Review']])
matching_ax.set_title('Sentiment Distribution for Reviews with Matching Titles')
plt.show()

According to this section of our analysis, the titles of the reviews tend to be somewhat descriptive based on the high number of matches suggesting an alignment between the most frequent words, review content and the title. The sentiment distribution plot shows a significant difference between negative and positive reviews with matching titles, which may also be due to the initial imbalance of their amounts.

### Problem 5. Bag of words (1 point)
Based on your findings so far, come up with a good set of settings (hyperparameters) for a bag-of-words model for review titles and contents. It's easiest to treat them separately (so, create two models); but you may also think about a unified representation. I find the simplest way of concatenating the title and content too simplistic to be useful, as it doesn't allow you to treat the title differently (e.g., by giving it more weight).

The documentation for `CountVectorizer` is [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html). Familiarize yourself with all settings; try out different combinations and come up with a final model; or rather - two models :).

We will need a separate set of hyperparameters for the titles and for the review content. First, we set up and tune a model for the titles: we filter out rare words and focus on concise but informative phrases. The content needs a more detailed analysis, so there we remove both rare and overly common words, as they may not be as informative.

In [None]:
# Bag-of-words settings for the titles: unigrams and bigrams, English stop
# words removed, terms must occur in at least 5 titles, vocabulary capped at 1000.
title_vectorizer_params = {
    'ngram_range': (1, 2),
    'stop_words': 'english',
    'min_df': 5,
    'max_features': 1000,
}
title_vectorizer = CountVectorizer(**title_vectorizer_params)

title_features = title_vectorizer.fit_transform(reviews['Review Title'])

print(f"Number of features (titles): {title_features.shape[1]}")

In [None]:
# Bag-of-words settings for the review bodies: uni- to trigrams, English stop
# words removed, terms must occur in at least 5 reviews and in at most 80% of
# them, vocabulary capped at 5000.
content_vectorizer_params = {
    'ngram_range': (1, 3),
    'stop_words': 'english',
    'min_df': 5,
    'max_df': 0.8,
    'max_features': 5000,
}
content_vectorizer = CountVectorizer(**content_vectorizer_params)

content_features = content_vectorizer.fit_transform(reviews['Review'])

print(f"Number of features (contents): {content_features.shape[1]}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Split both feature matrices and the labels in ONE call so the rows are
# guaranteed to stay aligned (the original relied on two calls sharing a seed).
# Stratify because the sentiment classes are imbalanced.
# NOTE(review): both vectorizers were fitted on all rows before this split, so
# the vocabulary has seen the test reviews; fit on the training rows only if
# these scores are reported as generalisation performance.
(X_train_titles, X_test_titles,
 X_train_contents, X_test_contents,
 y_train, y_test) = train_test_split(
    title_features, content_features, reviews['Sentiment'],
    test_size=0.3, random_state=42, stratify=reviews['Sentiment']
)


def fit_and_report_nb(X_train, X_test, y_train, y_test, heading):
    """Fit a multinomial Naive Bayes model and print its test-set report.

    Args:
        X_train, X_test: Feature matrices for the training and test rows.
        y_train, y_test: Sentiment labels aligned with the matrices.
        heading: Line printed above the classification report.

    Returns:
        Tuple ``(fitted_model, test_predictions)``.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print(heading)
    print(classification_report(y_test, predictions))
    return classifier, predictions


# Train and evaluate a model for titles
model_titles, predictions_titles = fit_and_report_nb(
    X_train_titles, X_test_titles, y_train, y_test, "Title Model Performance:"
)

# Train and evaluate a model for contents
model_contents, predictions_contents = fit_and_report_nb(
    X_train_contents, X_test_contents, y_train, y_test, "Content Model Performance:"
)

### Problem 6. Deep sentiment analysis models (1 point)
Find a suitable model for sentiment analysis in English. Without modifying, training, or fine-tuning the model, make it predict all contents (or better, combinations of titles and contents, if you can). Measure the accuracy of the model compared to the `sentiment` column in the dataset.

In [None]:
# Pre-trained checkpoint used as-is (no fine-tuning) for sentiment prediction.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

# Model weights and the matching tokenizer come from the same checkpoint.
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

sentiment_pipeline = pipeline(task='sentiment-analysis', model=model, tokenizer=tokenizer)


In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, pipeline
from sklearn.metrics import accuracy_score, classification_report

# `model_name`, `tokenizer`, `model` and `sentiment_pipeline` are created in
# the previous cell from this same checkpoint; they are reused here instead of
# loading identical weights a second time.

def chunk_text(text, max_length=512):
    """Tokenize ``text`` and split it into chunks the model can take in one pass.

    Every returned chunk is a complete ``[CLS] ... [SEP]`` sequence of at most
    ``max_length`` token ids. The original sliced the already-wrapped sequence,
    so only the first chunk started with ``[CLS]`` and only the last ended with
    ``[SEP]``.

    Args:
        text: The string to tokenize with the module-level ``tokenizer``.
        max_length: Maximum number of ids per chunk, special tokens included.

    Returns:
        A non-empty list of lists of token ids.

    Raises:
        ValueError: If ``max_length`` leaves no room for a real token.
    """
    if max_length < 3:
        raise ValueError("max_length must be at least 3 ([CLS], one token, [SEP])")

    # Encode WITHOUT special tokens so they are added per chunk, not sliced.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    body_length = max_length - 2  # two slots are reserved for [CLS] and [SEP]
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id

    bodies = [token_ids[start:start + body_length]
              for start in range(0, len(token_ids), body_length)]
    if not bodies:
        # Empty text still yields one [CLS][SEP] chunk, as the original did.
        bodies = [[]]

    return [[cls_id, *body, sep_id] for body in bodies]

def _classify_chunk(chunk):
    """Run the model on one chunk of token ids and return ``(label, confidence)``."""
    input_ids = torch.tensor([chunk])
    attention_mask = torch.ones_like(input_ids)  # no padding: attend to every position

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    probabilities = torch.softmax(logits, dim=-1)
    confidence, class_index = probabilities.max(dim=-1)
    label = 'POSITIVE' if class_index.item() == 1 else 'NEGATIVE'
    return label, confidence.item()


def _aggregate_chunks(chunk_results):
    """Combine per-chunk ``(label, confidence)`` pairs into one ``(label, score)``.

    The label is decided by majority vote. A tied vote goes to the label with
    the larger summed confidence, then to the alphabetically first label. The
    original ``max(set(labels), key=labels.count)`` resolved ties by set
    iteration order, which can change between kernel sessions.
    """
    tally = {}
    for label, confidence in chunk_results:
        votes, total_confidence = tally.get(label, (0, 0.0))
        tally[label] = (votes + 1, total_confidence + confidence)

    final_label = max(sorted(tally), key=tally.get)
    final_score = np.mean([confidence for _, confidence in chunk_results])
    return final_label, final_score


def predict_sentiment(text):
    """Predict the sentiment of a single text.

    Args:
        text: The string to classify; it is split with ``chunk_text``.

    Returns:
        Tuple ``(label, score)``: ``label`` is ``'POSITIVE'`` or ``'NEGATIVE'``
        and ``score`` is the mean per-chunk confidence.
    """
    return _aggregate_chunks([_classify_chunk(chunk) for chunk in chunk_text(text)])


def preprocess_reviews(reviews_column):
    """Return the ``chunk_text`` result for every text in ``reviews_column``."""
    return [chunk_text(text) for text in reviews_column]


def predict_sentiment_for_chunks(chunked_texts):
    """Predict one sentiment per text from its pre-computed chunks.

    Args:
        chunked_texts: Iterable in which each item is the list of token-id
            chunks of one text.

    Returns:
        Tuple ``(sentiments, scores)`` of two lists aligned with the input.
    """
    sentiments = []
    scores = []

    for chunks in chunked_texts:
        final_label, final_score = _aggregate_chunks(
            [_classify_chunk(chunk) for chunk in chunks]
        )
        sentiments.append(final_label)
        scores.append(final_score)

    return sentiments, scores

# Tokenize and chunk the review bodies and the titles separately.
chunked_content = preprocess_reviews(reviews['Processed Review'])
chunked_titles = preprocess_reviews(reviews['Review Title'])

# One (label, score) pair per review body ...
content_labels, content_scores = predict_sentiment_for_chunks(chunked_content)
reviews['Content Sentiment'] = content_labels
reviews['Content Sentiment Score'] = content_scores

# ... and one per title.
title_labels, title_scores = predict_sentiment_for_chunks(chunked_titles)
reviews['Title Sentiment'] = title_labels
reviews['Title Sentiment Score'] = title_scores


def combine_sentiments(title_sentiment, content_sentiment, title_score=None, content_score=None):
    """Merge the title and content predictions of one review into a single label.

    The original returned the content label in every case, so the title never
    influenced the result. With both confidence scores supplied, a disagreement
    is now settled in favour of the more confident prediction.

    Args:
        title_sentiment: Label predicted for the title.
        content_sentiment: Label predicted for the review body.
        title_score: Optional confidence of the title prediction.
        content_score: Optional confidence of the content prediction.

    Returns:
        The shared label when both agree. Otherwise the title label if both
        scores are given and the title is strictly more confident, else the
        content label (which is also the behaviour when scores are omitted).
    """
    if title_sentiment == content_sentiment:
        return content_sentiment

    scores_known = title_score is not None and content_score is not None
    if scores_known and title_score > content_score:
        return title_sentiment

    return content_sentiment


reviews['Combined Sentiment'] = reviews.apply(
    lambda row: combine_sentiments(
        row['Title Sentiment'],
        row['Content Sentiment'],
        row['Title Sentiment Score'],
        row['Content Sentiment Score'],
    ),
    axis=1
)

# Translate the upper-case model labels to the title-case labels of the
# 'Sentiment' column so the two can be compared directly.
label_mapping = dict(POSITIVE='Positive', NEGATIVE='Negative')
reviews['Predicted Sentiment'] = reviews['Combined Sentiment'].map(label_mapping)

print(reviews[['Sentiment', 'Predicted Sentiment']])

# assert 'Sentiment' in reviews.columns, "Column 'Sentiment' not found in DataFrame"

# Compare the dataset labels with the model's predictions.
true_labels = reviews['Sentiment']
predicted_labels = reviews['Predicted Sentiment']

accuracy = accuracy_score(true_labels, predicted_labels)
report = classification_report(true_labels, predicted_labels)

print(f"Accuracy of the pre-trained sentiment analysis model: {accuracy:.2f}")
print("Classification Report:", report, sep="\n")


### Problem 7. Deep features (embeddings) (1 point)
Use the same model to perform feature extraction on the review contents (or contents + titles) instead of direct predictions. You should already be familiar how to do that from your work on images.

Use the cosine similarity between texts to try to cluster them. Are there "similar" reviews (you'll need to find a way to measure similarity) across different restaurants? Are customers generally in agreement for the same restaurant?

In [None]:
def extract_embeddings(texts):
    """Return one embedding vector per text, taken at the first token position.

    The module-level ``model`` is a sequence-classification model whose default
    output only exposes ``logits``; the original access to
    ``outputs.last_hidden_state`` raised ``AttributeError``. The hidden states
    are therefore requested explicitly and the last layer is used.

    Args:
        texts: Iterable of strings; each is truncated to the tokenizer's limit.

    Returns:
        ``numpy.ndarray`` of shape ``(n_texts, hidden_size)``.
    """
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # hidden_states[-1] is the output of the last layer; position 0 is the
        # first token of the sequence. squeeze(0) drops only the batch axis.
        cls_embedding = outputs.hidden_states[-1][:, 0, :].squeeze(0).numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)

def cluster_reviews(embeddings, n_clusters=5):
    """Cluster embeddings with k-means on their direction (cosine geometry).

    Rows are scaled to unit length first. For unit vectors the squared
    Euclidean distance equals ``2 - 2 * cosine_similarity``, so k-means then
    groups texts by cosine similarity rather than by vector magnitude.

    Args:
        embeddings: 2-D array-like, one row per text.
        n_clusters: Number of clusters to form.

    Returns:
        Array with one integer cluster label per row.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep all-zero rows as they are instead of dividing by zero
    unit_embeddings = embeddings / norms

    # n_init is pinned because its default differs between scikit-learn
    # versions, which would otherwise change the labels.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(unit_embeddings)
    return kmeans.labels_

def analyze_clusters(reviews, cluster_labels):
    """Return a copy of ``reviews`` with the labels stored in a 'Cluster' column.

    Args:
        reviews: The DataFrame to annotate; it is not modified.
        cluster_labels: One label per row of ``reviews``.

    Returns:
        A new DataFrame with the extra ``'Cluster'`` column.
    """
    return reviews.assign(Cluster=cluster_labels)

# One embedding per (preprocessed) review body.
embeddings = extract_embeddings(reviews['Processed Review'])

# Group the reviews into a small number of clusters and attach the labels.
n_clusters = 3
cluster_labels = cluster_reviews(embeddings, n_clusters=n_clusters)
clustered_reviews = analyze_clusters(reviews, cluster_labels)

# Pairwise cosine similarity between all review embeddings.
similarities = cosine_similarity(embeddings)

print("Cosine Similarity Matrix:", similarities, sep="\n")


In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# The matrix has one row and one column per review. Annotating every cell
# (annot=True) is unreadable at this size and makes rendering extremely slow,
# so only the colours are drawn.
sns.heatmap(similarities, annot=False, cmap='coolwarm', vmin=0, vmax=1, ax=ax)
ax.set_title('Cosine Similarity Matrix')
ax.set_xlabel('Text Index')
ax.set_ylabel('Text Index')
plt.show()


### \* Problem 8. Explore and model at will
In this lab, we focused on preprocessing and feature extraction and we didn't really have a chance to train (or compare) models. The dataset is maybe too small to be conclusive, but feel free to play around with ready-made models, and train your own.