# Sentiment Analysis — Beats (Speaker) Reviews

Notebook by Daniel

**How to use:** place a CSV file named `speaker_reviews.csv` in the same folder. Expected columns: `reviewText` (text) and `overall` or `rating` (rating).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Apply seaborn's 'whitegrid' style so the plots drawn in later cells share it.
sns.set(style='whitegrid')
# Reached only if every import above succeeded.
print('Libraries loaded')

In [None]:
# Load dataset
# Read the reviews CSV from the working directory. A missing file is reported
# instead of raised; the later cells check that `df` exists before using it.
try:
    df = pd.read_csv('speaker_reviews.csv')
except FileNotFoundError:
    print('speaker_reviews.csv not found. Please download a suitable Amazon speaker/headphone reviews CSV and name it speaker_reviews.csv')
else:
    print('Loaded speaker_reviews.csv')

# Show basic info: the first rows and the column names found in the file
if 'df' in globals():
    display(df.head())
    print('\nColumns:', df.columns.tolist())

In [None]:
# Basic cleaning
# Names that different review exports use for the same field, in priority order.
TEXT_COLUMN_CANDIDATES = ('reviewText', 'review_text', 'text', 'review', 'reviewBody')
RATING_COLUMN_CANDIDATES = ('overall', 'rating', 'stars')


def first_matching_column(frame, candidates):
    """Return the first name in ``candidates`` that is a column of ``frame``, else None."""
    return next((name for name in candidates if name in frame.columns), None)


def clean_reviews(frame):
    """Return a cleaned copy of ``frame`` with standardized column names.

    The review text column is renamed to ``reviewText`` and, when one is
    present, the rating column is renamed to ``rating``. Rows whose review
    text is missing or blank are dropped, the text is stripped of surrounding
    whitespace, and duplicate review texts are removed (first occurrence kept).
    ``frame`` itself is not modified.

    Args:
        frame: DataFrame holding the raw reviews.

    Returns:
        A new DataFrame containing only usable, de-duplicated reviews.

    Raises:
        ValueError: if none of ``TEXT_COLUMN_CANDIDATES`` is a column of ``frame``.
    """
    text_col = first_matching_column(frame, TEXT_COLUMN_CANDIDATES)
    if text_col is None:
        raise ValueError('No review text column found. Please ensure the CSV has a reviewText or similar column.')
    cleaned = frame.rename(columns={text_col: 'reviewText'})

    # Keep rating if present. Skip the rename when a 'rating' column already
    # exists, otherwise the frame would end up with two columns named 'rating'.
    rating_col = first_matching_column(cleaned, RATING_COLUMN_CANDIDATES)
    if rating_col is not None and 'rating' not in cleaned.columns:
        cleaned = cleaned.rename(columns={rating_col: 'rating'})

    # Drop missing reviews BEFORE casting to str: astype(str) would turn NaN
    # into the literal text 'nan', which would then be scored as a real review.
    cleaned = cleaned.dropna(subset=['reviewText'])
    cleaned = cleaned.assign(reviewText=cleaned['reviewText'].astype(str).str.strip())
    cleaned = cleaned[cleaned['reviewText'] != '']
    cleaned = cleaned.drop_duplicates(subset=['reviewText'])
    # Hand back an independent frame so later cells can add columns to it freely.
    return cleaned.copy()


if 'df' in globals():
    df = clean_reviews(df)
    print('Cleaned dataset - rows:', len(df))
    display(df.head())

In [None]:
# EDA: rating distribution and review length
if 'df' in globals():
    # Bar chart of how many reviews carry each rating value; skipped when the
    # dataset has no 'rating' column.
    if 'rating' in df.columns:
        plt.figure(figsize=(6, 4))
        sns.countplot(data=df, x='rating')
        plt.title('Rating distribution')
        plt.show()

    # Store each review's character count, then plot its distribution.
    df['review_len'] = df['reviewText'].str.len()
    plt.figure(figsize=(8, 4))
    sns.histplot(data=df, x='review_len', bins=40)
    plt.title('Review text length distribution')
    plt.show()

In [None]:
# Sentiment scoring with VADER
def _sentiment_label(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'."""
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


if 'df' in globals():
    analyzer = SentimentIntensityAnalyzer()

    def _compound_score(text):
        """Return the 'compound' entry of VADER's polarity scores for ``text``."""
        return analyzer.polarity_scores(text)['compound']

    df['vader_compound'] = df['reviewText'].map(_compound_score)
    # Classify each review from its compound score
    df['sentiment'] = df['vader_compound'].map(_sentiment_label)
    display(df[['reviewText', 'vader_compound', 'sentiment']].head())
    print(df['sentiment'].value_counts())

In [None]:
# Simple classification: predict the VADER sentiment label (positive vs negative)
# from the review text using TF-IDF + Logistic Regression.
# NOTE: the labels come from VADER, not from the star ratings, so the reported
# accuracy measures agreement with VADER rather than with human judgement.

# Fewest rows each class must have for the stratified 80/20 split below to be
# able to place that class on both the train and the test side.
MIN_SAMPLES_PER_CLASS = 5


def label_polar_reviews(frame):
    """Return a copy of the non-neutral rows of ``frame`` with a binary ``label``.

    Rows whose ``sentiment`` is 'neutral' are dropped; ``label`` is 1 for
    'positive' rows and 0 for the remaining ('negative') rows. ``frame`` is
    not modified.
    """
    polar = frame[frame['sentiment'] != 'neutral'].copy()
    polar['label'] = (polar['sentiment'] == 'positive').astype(int)
    return polar


def has_enough_samples(labels, min_per_class=MIN_SAMPLES_PER_CLASS):
    """Return True when ``labels`` holds at least two classes of ``min_per_class`` rows each."""
    counts = labels.value_counts()
    return bool(len(counts) >= 2 and counts.min() >= min_per_class)


if 'df' in globals():
    df_model = label_polar_reviews(df)
    X = df_model['reviewText']
    y = df_model['label']
    if not has_enough_samples(y):
        # Splitting or fitting would raise on empty or single-class data, so
        # report the problem instead of failing with an opaque traceback.
        print('Not enough positive and negative reviews to train a classifier; skipping')
    else:
        # stratify=y keeps the positive/negative ratio the same in both splits,
        # so a rare class cannot end up missing from the training or test set.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        vect = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        # Fit the vocabulary on the training split only; the test split is
        # transformed with that same fitted vocabulary.
        X_train_t = vect.fit_transform(X_train)
        X_test_t = vect.transform(X_test)
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train_t, y_train)
        preds = model.predict(X_test_t)
        print('Accuracy:', accuracy_score(y_test, preds))
        print('\nClassification report:\n', classification_report(y_test, preds))

In [None]:
# Visualization: sentiment over time (if time column exists)

# Columns this notebook adds itself. They are excluded from the time-column
# search because "sentiment" contains the substring "time" and would otherwise
# be mistaken for a timestamp column.
DERIVED_COLUMNS = frozenset({'review_len', 'vader_compound', 'sentiment'})


def find_time_column(frame):
    """Return the first column of ``frame`` whose name contains 'time' or 'date'.

    Columns listed in ``DERIVED_COLUMNS`` are ignored. Returns None when no
    column qualifies.
    """
    for name in frame.columns:
        if name in DERIVED_COLUMNS:
            continue
        lowered = str(name).lower()
        if 'time' in lowered or 'date' in lowered:
            return name
    return None


def monthly_mean_sentiment(frame, time_col, value_col='vader_compound', epoch_unit='s'):
    """Return the month-by-month mean of ``value_col``, indexed by month end.

    ``frame`` is left untouched: the timestamps are parsed into a separate
    series instead of being written back or moved into the index, so the time
    column is still present for later cells (including the CSV export).

    Args:
        frame: DataFrame holding ``time_col`` and ``value_col``.
        time_col: name of the column with the review timestamps.
        value_col: name of the numeric column to average.
        epoch_unit: unit used when ``time_col`` is numeric. Defaults to
            seconds — assumes Unix-epoch seconds as in Amazon's
            ``unixReviewTime``; pass 'ms' or 'ns' for other exports.

    Returns:
        A Series with one entry per calendar month between the first and last
        review; months without reviews are NaN.

    Raises:
        ValueError / TypeError / OverflowError: propagated from
        ``pd.to_datetime`` when the column cannot be parsed.
    """
    raw = frame[time_col]
    if pd.api.types.is_numeric_dtype(raw):
        # Without an explicit unit pandas reads integers as nanoseconds,
        # which places every review in January 1970.
        timestamps = pd.to_datetime(raw, unit=epoch_unit)
    else:
        timestamps = pd.to_datetime(raw)
    valid = timestamps.notna()
    series = pd.Series(
        frame.loc[valid, value_col].to_numpy(),
        index=pd.DatetimeIndex(timestamps[valid]),
    ).sort_index()
    # An offset object instead of the 'M' string alias, which newer pandas
    # releases deprecate in favour of 'ME'; the offset works on both.
    return series.resample(pd.offsets.MonthEnd()).mean()


if 'df' in globals():
    col = find_time_column(df)
    if col is None:
        print('No time/date column found; skipping time-based visualization')
    elif 'vader_compound' not in df.columns:
        print('No vader_compound column found; run the VADER scoring cell first')
    else:
        try:
            monthly = monthly_mean_sentiment(df, col)
        except (ValueError, TypeError, OverflowError) as e:
            print('Could not parse time column:', e)
        else:
            if monthly.dropna().empty:
                print('No dated reviews available; skipping time-based visualization')
            else:
                plt.figure(figsize=(10, 4))
                monthly.plot()
                plt.title('Average monthly sentiment (VADER compound)')
                plt.show()

In [None]:
# Export results
def frame_for_export(frame):
    """Return ``frame`` with a named index moved back into a regular column.

    The CSV is written with ``index=False``. If an earlier step has moved a
    column into the index (e.g. via ``set_index``), that column would be
    silently left out of the file, so it is restored here first. A frame with
    an unnamed index is returned unchanged.
    """
    index_name = frame.index.name
    if index_name is not None and index_name not in frame.columns:
        return frame.reset_index()
    return frame


if 'df' in globals():
    frame_for_export(df).to_csv('speaker_reviews_with_sentiment.csv', index=False)
    print('Exported speaker_reviews_with_sentiment.csv')

## Next steps / Deployment

- Use `speaker_reviews_with_sentiment.csv` to create a dashboard in Power BI or Looker Studio.
- For GitHub Pages, create a README.md summarizing key insights and link to the notebook and exported CSV.
- Consider training more advanced models (BERT-based) for higher accuracy on short, nuanced reviews.

---

**Notes:**
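- To run this notebook, including the VADER sentiment scoring, install its dependencies: `pip install pandas numpy matplotlib seaborn scikit-learn vaderSentiment nltk`. (`nltk` is optional here: the notebook imports VADER from the standalone `vaderSentiment` package.)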
- If using Kaggle datasets, download the CSV locally and rename to `speaker_reviews.csv` before running this notebook.