# Exploring Headphones Dataset

## Loading and Pre-Processing Data

In [1]:
import numpy as np
import pandas as pd

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RaviB\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RaviB\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('../data/AllProductReviews.csv')
df.head()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,Honest review of an edm music lover\r\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\r\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\r\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\r\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...😉...,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255


In [3]:
convert_rating = {1: 'terrible', 2: 'bad', 3: 'mediocre', 4: 'good', 5: 'great'}

df['ReviewStar'] = df['ReviewStar'].apply(lambda x: convert_rating[x])

In [6]:
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                           u"\U0001F700-\U0001F77F"  # Alchemical Symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                           u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                           u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                           u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                           u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                           u"\U0001FB00-\U0001FBFF"  # Symbols for Legacy Computing"
                           u"\U0001FC00-\U0001FCFF"  # St. George's Flag
                           u"\U0001F004-\U0001F0CF"  # CJK Compatibility Ideographs
                           u"\U0001F170-\U0001F251"  # Enclosed Ideographic Supplement
                           "]+", flags=re.UNICODE)

In [7]:
stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

In [8]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    #remove emoji's
    text = emoji_pattern.sub(r'', text)
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and apply stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a string
    preprocessed_text = ' '.join(words)
    return preprocessed_text

In [9]:
df['ReviewTitle'] = df['ReviewTitle'].apply(lambda x: preprocess_text(x))
df['ReviewBody'] = df['ReviewBody'].apply(lambda x: preprocess_text(x))

In [10]:
df.head()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,honest review edm music lover,doubt great bass great extent nois cancel dece...,mediocre,boAt Rockerz 255
1,unreli earphon high cost,earphon unreli bought day meanwhil right side ...,terrible,boAt Rockerz 255
2,realli good durabl,bought itfor purchas second time gift first on...,good,boAt Rockerz 255
3,stop work day,sound qualiti ador overal good week stop work ...,terrible,boAt Rockerz 255
4,awesom wireless headphon,awesom good sound qualiti hr batteri life waww...,great,boAt Rockerz 255


In [11]:
df['ReviewTexts'] = df[['ReviewTitle', 'ReviewBody', 'ReviewStar']].apply(' '.join, axis=1)
df.head()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product,ReviewTexts
0,honest review edm music lover,doubt great bass great extent nois cancel dece...,mediocre,boAt Rockerz 255,honest review edm music lover doubt great bass...
1,unreli earphon high cost,earphon unreli bought day meanwhil right side ...,terrible,boAt Rockerz 255,unreli earphon high cost earphon unreli bought...
2,realli good durabl,bought itfor purchas second time gift first on...,good,boAt Rockerz 255,realli good durabl bought itfor purchas second...
3,stop work day,sound qualiti ador overal good week stop work ...,terrible,boAt Rockerz 255,stop work day sound qualiti ador overal good w...
4,awesom wireless headphon,awesom good sound qualiti hr batteri life waww...,great,boAt Rockerz 255,awesom wireless headphon awesom good sound qua...


## Converting Text to Vectors

In [12]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import gensim.downloader as api


In [13]:
glove_model = api.load("glove-wiki-gigaword-300")

def get_document_embedding(text, model):
    # Tokenize the text and calculate the average word embedding
    tokens = text.lower().split()
    word_embeddings = [model[word] for word in tokens if word in model]
    if not word_embeddings:
        # If no valid words are found in the model, return a zero vector
        return np.zeros(model.vector_size)
    return np.mean(word_embeddings, axis=0)


In [14]:
df['ReviewEmbedding'] = df['ReviewTexts'].apply(lambda x: get_document_embedding(x, glove_model))

In [15]:
product_reviews_grouped = df.groupby('Product')['ReviewEmbedding'].apply(list).reset_index()
product_reviews_grouped['ProductEmbedding'] = product_reviews_grouped['ReviewEmbedding'].apply(lambda x: np.mean(x, axis=0))

In [16]:
user_input = 'I want a product with bluetooth and long battery'
user_input_embedding = get_document_embedding(user_input, glove_model)

In [17]:
product_reviews_grouped['cosine_similarity'] = product_reviews_grouped['ProductEmbedding'].apply(
    lambda x: cosine_similarity([user_input_embedding], [x])[0][0]
)

# Get product recommendations based on similarity
top_recommendations = product_reviews_grouped.nlargest(5, 'cosine_similarity')

print("Recommended products based on user input:")
print(top_recommendations[['Product', 'cosine_similarity']])

Recommended products based on user input:
                 Product  cosine_similarity
8  Skullcandy S2PGHW-174           0.857657
9       boAt Rockerz 255           0.857115
6  Samsung EO-BG950CBEIN           0.848781
5          PTron Intunes           0.842486
0            Flybot Beat           0.841458


# Getting YouTube Reivew Texts

Finished this in another notebook

In [16]:
import pytube
from youtube_transcript_api import YouTubeTranscriptApi

In [21]:
video_id = "Hxz_r0vAPoo"

try:
    # Retrieve the transcript for the video
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Print the transcript
    for entry in transcript:
        print(entry['text'])

except Exception as e:
    print(f"An error occurred: {str(e)}")


[Music]
hey guys so is the Sony wfos xm4 still
worth buying in 2023. it used to be that
I wholeheartedly recommend this product
but now I've got mixed feelings about it
and in this video I'm gonna break it
down into three reasons why you
shouldn't buy it and three reasons why
you should buy it right now since I've
been using it for almost two years and
I've covered pretty much all of its
firmware Updates this is kind of like a
long-term review of this product so I
hope that this video will bring you
value for your time I've split this
video into chapters to make it easier to
navigate and if you want to see more
earbuds and Tech related videos from
this channel get subscribed and tap the
Bell button to stay notified okay first
let's talk about why you shouldn't buy
the Sony WF thousand xm4 reason number
one call Quality in less noisy
conditions it actually is quite decent
in terms of voice pickup like when I was
testing it in a public place in this
video link in the description but in
m