In [51]:
import pandas as pd
import time
import requests
import json
import string 
import nltk 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

# Show every DataFrame column when a frame is displayed.
pd.set_option('display.max_columns', None)

# English stopwords held in a set so membership tests in remove_stopwords are O(1).
# (The former bare `", ".join(stopwords.words('english'))` expression was dropped:
# its result was discarded, so it had no effect.)
STOPWORDS = set(stopwords.words('english'))

def clean_up_text(text_to_clean):
    """Run a review text through the whole cleaning pipeline and return the result.

    The text is lower-cased first, then passed through each cleaning step in
    the fixed order listed below; the output of one step feeds the next.

    NOTE(review): remove_punctuation runs before remove_emoticons, so emoticons
    built from ASCII punctuation are already gone when the emoticon step runs.
    Chat-word expansions are inserted after lower-casing and punctuation
    removal, so they keep the capitalisation/punctuation of the lookup table.
    """
    cleaning_steps = (
        remove_html,
        remove_urls,
        remove_punctuation,
        chat_words_conversion,
        remove_emoticons,
        remove_stopwords,
        lemmatize_words,
    )
    cleaned = text_to_clean.lower()
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

def remove_html(text):
    """custom function to strip HTML markup and keep only the text content"""
    # NOTE(review): the notebook output shows a warning pointing at this call for
    # many inputs - presumably bs4 complaining about markup-less strings; confirm.
    soup = BeautifulSoup(text, "lxml")
    return soup.text
def remove_urls(text):
    """custom function to remove http(s):// and www. style URLs"""
    # A URL is taken to run up to the next whitespace character.
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
def remove_punctuation(text):
    """custom function to delete every character listed in string.punctuation"""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
def remove_emoticons(text):
    """custom function to remove every emoticon listed in EMOTICONS"""
    # The EMOTICONS keys are regex fragments; OR them together inside one group.
    alternation = u'(' + u'|'.join(EMOTICONS) + u')'
    return re.sub(alternation, r'', text)
def remove_stopwords(text):
    """custom function to drop the words found in STOPWORDS"""
    # Splitting on whitespace and re-joining also collapses runs of whitespace.
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)
def chat_words_conversion(text):
    """custom function to expand chat abbreviations (e.g. BRB) to their full phrase"""

    def expand(token):
        # Lookup is case-insensitive: the abbreviation table is keyed in upper case.
        key = token.upper()
        return chat_words_map_dict[key] if key in chat_words_list else token

    return " ".join(expand(token) for token in text.split())


# Chat abbreviation table, one "ABBREVIATION=expansion" entry per line.
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The Friend
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

# Abbreviation -> expansion lookup built from the table above; a repeated
# abbreviation simply overwrites its earlier entry.
chat_words_map_dict = {}
for entry in chat_words_str.split("\n"):
    if entry == "":
        continue  # the literal starts and ends with a newline
    fields = entry.split("=")
    chat_words_map_dict[fields[0]] = fields[1]

# Set of known abbreviations, used for the membership test in chat_words_conversion.
chat_words_list = set(chat_words_map_dict)

def remove_emoji(string):
    """Return *string* with every run of emoji / pictographic characters removed.

    The original version compiled the pattern but never applied it, so the
    function always returned None; it now returns the cleaned text.

    Note: the parameter is named ``string`` (kept for backward compatibility),
    which shadows the ``string`` module inside this function.
    """
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           # NOTE(review): this last range runs from U+24C2 to U+1F251 and
                           # therefore also covers CJK, Hangul and many other scripts; text
                           # in those scripts is stripped as well. Narrow it if such input
                           # must be preserved.
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
# Emoticon table: each key is a regular-expression fragment matching one
# emoticon, each value a human-readable description. remove_emoticons ORs the
# keys together into a single pattern, so the keys must stay valid regex.
#
# The keys are raw strings so that regex escapes such as \) or \^ are not
# treated as (invalid) Python string escapes; the string values are unchanged.
EMOTICONS = {
    r":‑\)": "Happy face or smiley",
    r":\)": "Happy face or smiley",
    r":-\]": "Happy face or smiley",
    r":\]": "Happy face or smiley",
    r":-3": "Happy face smiley",
    r":3": "Happy face smiley",
    r":->": "Happy face smiley",
    r":>": "Happy face smiley",
    r"8-\)": "Happy face smiley",
    r":o\)": "Happy face smiley",
    r":-\}": "Happy face smiley",
    r":\}": "Happy face smiley",
    r":-\)": "Happy face smiley",
    r":c\)": "Happy face smiley",
    r":\^\)": "Happy face smiley",
    r"=\]": "Happy face smiley",
    r"=\)": "Happy face smiley",
    r":‑D": "Laughing, big grin or laugh with glasses",
    r":D": "Laughing, big grin or laugh with glasses",
    r"8‑D": "Laughing, big grin or laugh with glasses",
    r"8D": "Laughing, big grin or laugh with glasses",
    r"X‑D": "Laughing, big grin or laugh with glasses",
    r"XD": "Laughing, big grin or laugh with glasses",
    r"=D": "Laughing, big grin or laugh with glasses",
    r"=3": "Laughing, big grin or laugh with glasses",
    r"B\^D": "Laughing, big grin or laugh with glasses",
    r":-\)\)": "Very happy",
    r":‑\(": "Frown, sad, andry or pouting",
    r":-\(": "Frown, sad, andry or pouting",
    r":\(": "Frown, sad, andry or pouting",
    r":‑c": "Frown, sad, andry or pouting",
    r":c": "Frown, sad, andry or pouting",
    r":‑<": "Frown, sad, andry or pouting",
    r":<": "Frown, sad, andry or pouting",
    r":‑\[": "Frown, sad, andry or pouting",
    r":\[": "Frown, sad, andry or pouting",
    r":-\|\|": "Frown, sad, andry or pouting",
    r">:\[": "Frown, sad, andry or pouting",
    r":\{": "Frown, sad, andry or pouting",
    r":@": "Frown, sad, andry or pouting",
    r">:\(": "Frown, sad, andry or pouting",
    r":'‑\(": "Crying",
    r":'\(": "Crying",
    r":'‑\)": "Tears of happiness",
    r":'\)": "Tears of happiness",
    r"D‑':": "Horror",
    r"D:<": "Disgust",
    r"D:": "Sadness",
    r"D8": "Great dismay",
    r"D;": "Great dismay",
    r"D=": "Great dismay",
    r"DX": "Great dismay",
    r":‑O": "Surprise",
    r":O": "Surprise",
    r":‑o": "Surprise",
    r":o": "Surprise",
    r":-0": "Shock",
    r"8‑0": "Yawn",
    r">:O": "Yawn",
    r":-\*": "Kiss",
    r":\*": "Kiss",
    r":X": "Kiss",
    r";‑\)": "Wink or smirk",
    r";\)": "Wink or smirk",
    r"\*-\)": "Wink or smirk",
    r"\*\)": "Wink or smirk",
    r";‑\]": "Wink or smirk",
    r";\]": "Wink or smirk",
    r";\^\)": "Wink or smirk",
    r":‑,": "Wink or smirk",
    r";D": "Wink or smirk",
    r":‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"X‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"XP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":‑Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"d:": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"=p": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r">:P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r":‑/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":-[.]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r">:[(\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r">:/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":[(\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=[(\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r"=L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":S": "Skeptical, annoyed, undecided, uneasy or hesitant",
    r":‑\|": "Straight face",
    r":\|": "Straight face",
    r":$": "Embarrassed or blushing",
    r":‑x": "Sealed lips or wearing braces or tongue-tied",
    r":x": "Sealed lips or wearing braces or tongue-tied",
    r":‑#": "Sealed lips or wearing braces or tongue-tied",
    r":#": "Sealed lips or wearing braces or tongue-tied",
    r":‑&": "Sealed lips or wearing braces or tongue-tied",
    r":&": "Sealed lips or wearing braces or tongue-tied",
    r"O:‑\)": "Angel, saint or innocent",
    r"O:\)": "Angel, saint or innocent",
    r"0:‑3": "Angel, saint or innocent",
    r"0:3": "Angel, saint or innocent",
    r"0:‑\)": "Angel, saint or innocent",
    r"0:\)": "Angel, saint or innocent",
    r":‑b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    r"0;\^\)": "Angel, saint or innocent",
    r">:‑\)": "Evil or devilish",
    r">:\)": "Evil or devilish",
    r"\}:‑\)": "Evil or devilish",
    r"\}:\)": "Evil or devilish",
    r"3:‑\)": "Evil or devilish",
    r"3:\)": "Evil or devilish",
    r">;\)": "Evil or devilish",
    r"\|;‑\)": "Cool",
    r"\|‑O": "Bored",
    r":‑J": "Tongue-in-cheek",
    r"#‑\)": "Party all night",
    r"%‑\)": "Drunk or confused",
    r"%\)": "Drunk or confused",
    r":-###..": "Being sick",
    r":###..": "Being sick",
    r"<:‑\|": "Dump",
    r"\(>_<\)": "Troubled",
    r"\(>_<\)>": "Troubled",
    r"\(';'\)": "Baby",
    r"\(\^\^>``": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(\^_\^;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(-_-;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(~_~;\) \(・\.・;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    r"\(-_-\)zzz": "Sleeping",
    r"\(\^_-\)": "Wink",
    r"\(\(\+_\+\)\)": "Confused",
    r"\(\+o\+\)": "Confused",
    r"\(o\|o\)": "Ultraman",
    r"\^_\^": "Joyful",
    r"\(\^_\^\)/": "Joyful",
    r"\(\^O\^\)／": "Joyful",
    r"\(\^o\^\)／": "Joyful",
    r"\(__\)": "Kowtow as a sign of respect, or dogeza for apology",
    r"_\(\._\.\)_": "Kowtow as a sign of respect, or dogeza for apology",
    r"<\(_ _\)>": "Kowtow as a sign of respect, or dogeza for apology",
    r"<m\(__\)m>": "Kowtow as a sign of respect, or dogeza for apology",
    r"m\(__\)m": "Kowtow as a sign of respect, or dogeza for apology",
    r"m\(_ _\)m": "Kowtow as a sign of respect, or dogeza for apology",
    r"\('_'\)": "Sad or Crying",
    r"\(/_;\)": "Sad or Crying",
    r"\(T_T\) \(;_;\)": "Sad or Crying",
    r"\(;_;": "Sad of Crying",
    r"\(;_:\)": "Sad or Crying",
    r"\(;O;\)": "Sad or Crying",
    r"\(:_;\)": "Sad or Crying",
    r"\(ToT\)": "Sad or Crying",
    r";_;": "Sad or Crying",
    r";-;": "Sad or Crying",
    r";n;": "Sad or Crying",
    r";;": "Sad or Crying",
    r"Q\.Q": "Sad or Crying",
    r"T\.T": "Sad or Crying",
    r"QQ": "Sad or Crying",
    r"Q_Q": "Sad or Crying",
    r"\(-\.-\)": "Shame",
    r"\(-_-\)": "Shame",
    r"\(一一\)": "Shame",
    r"\(；一_一\)": "Shame",
    r"\(=_=\)": "Tired",
    r"\(=\^\·\^=\)": "cat",
    r"\(=\^\·\·\^=\)": "cat",
    # This key ends with a literal tab character in the original table; it is
    # spelled with explicit escapes here so the whitespace cannot get lost.
    "=_\\^=\t": "cat",
    r"\(\.\.\)": "Looking down",
    r"\(\._\.\)": "Looking down",
    r"\^m\^": "Giggling with hand covering mouth",
    r"\(\・\・?": "Confusion",
    r"\(?_?\)": "Confusion",
    r">\^_\^<": "Normal Laugh",
    r"<\^!\^>": "Normal Laugh",
    r"\^/\^": "Normal Laugh",
    r"\（\*\^_\^\*）": "Normal Laugh",
    r"\(\^<\^\) \(\^\.\^\)": "Normal Laugh",
    r"\(^\^\)": "Normal Laugh",
    r"\(\^\.\^\)": "Normal Laugh",
    r"\(\^_\^\.\)": "Normal Laugh",
    r"\(\^_\^\)": "Normal Laugh",
    r"\(\^\^\)": "Normal Laugh",
    r"\(\^J\^\)": "Normal Laugh",
    r"\(\*\^\.\^\*\)": "Normal Laugh",
    r"\(\^—\^\）": "Normal Laugh",
    r"\(#\^\.\^#\)": "Normal Laugh",
    r"\（\^—\^\）": "Waving",
    r"\(;_;\)/~~~": "Waving",
    r"\(\^\.\^\)/~~~": "Waving",
    r"\(-_-\)/~~~ \($\·\·\)/~~~": "Waving",
    r"\(T_T\)/~~~": "Waving",
    r"\(ToT\)/~~~": "Waving",
    r"\(\*\^0\^\*\)": "Excited",
    r"\(\*_\*\)": "Amazed",
    r"\(\*_\*;": "Amazed",
    r"\(\+_\+\) \(@_@\)": "Amazed",
    r"\(\*\^\^\)v": "Laughing,Cheerful",
    r"\(\^_\^\)v": "Laughing,Cheerful",
    r"\(\(d[-_-]b\)\)": "Headphones,Listening to music",
    r'\(-"-\)': "Worried",
    r"\(ーー;\)": "Worried",
    r"\(\^0_0\^\)": "Eyeglasses",
    r"\(\＾ｖ\＾\)": "Happy",
    r"\(\＾ｕ\＾\)": "Happy",
    r"\(\^\)o\(\^\)": "Happy",
    r"\(\^O\^\)": "Happy",
    r"\(\^o\^\)": "Happy",
    r"\)\^o\^\(": "Happy",
    r":O o_O": "Surprised",
    r"o_0": "Surprised",
    r"o\.O": "Surpised",
    r"\(o\.o\)": "Surprised",
    r"oO": "Surprised",
    r"\(\*￣m￣\)": "Dissatisfied",
    r"\(‘A`\)": "Snubbed or Deflated",
}

# Shared lemmatizer instance, created once and reused for every call.
lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def lemmatize_words(text):
    """custom function to lemmatize every whitespace-separated word of the text

    Each word is POS-tagged first; the first letter of its tag selects the
    WordNet part of speech, falling back to noun for any other tag.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

# Smoke test for the lemmatizer (the recorded run printed "amaze").
print(lemmatize_words("amazing"))

# --- Scraper configuration -------------------------------------------------
SEARCH_QUERY = "Samsung s23"
MAX_PRODUCTS = 4        # how many of the top search hits to scrape
PAGE_WAIT_SECONDS = 10  # implicit wait applied to every element lookup
REQUEST_TIMEOUT = 30    # seconds before a comments-API request is abandoned
COMMENTS_API = (
    "https://www.gsmarena.com/comments-json.php3"
    "?idType=1&idItem={item_id}&nSort=1&sSearch=&iPage={page}"
)

driver = webdriver.Chrome()
try:
    # Poll for elements instead of failing immediately while a page is still loading.
    driver.implicitly_wait(PAGE_WAIT_SECONDS)
    driver.get("https://www.gsmarena.com/")
    print("open chrome")

    # The ad banner is not guaranteed to be present; close it only if it is there.
    ad_banner_close = driver.find_elements(By.ID, "clever_73756_topscroll_close")
    if ad_banner_close:
        ad_banner_close[0].click()

    search_input = driver.find_element(By.ID, "topsearch-text")
    search_input.send_keys(SEARCH_QUERY)
    search_input.send_keys(Keys.ENTER)

    # Collect (numeric product id, product page URL) for the first search hits.
    productId = []
    print("get the products")
    product_links = driver.find_elements(
        By.XPATH,
        f'//*[@id="review-body"]/div/ul/li[position() <= {MAX_PRODUCTS}]/a[1]',
    )
    for product_link in product_links[:MAX_PRODUCTS]:
        product_url = product_link.get_attribute("href")
        # The id sits between the LAST "-" and the file extension, e.g.
        # ".../samsung_galaxy_s23_ultra-12024.php" -> "12024"; searching from
        # the right keeps this correct when the slug itself contains a "-".
        product_id = product_url[product_url.rfind("-") + 1:product_url.rfind(".")]
        print(product_id)
        productId.append((int(product_id), product_url))

    print(productId)

    for item_id, product_url in productId:
        driver.get(product_url)
        print(item_id)
        product_review_link = driver.find_element(By.XPATH, '//*[@id="user-comments"]/h2/a')
        driver.get(product_review_link.get_attribute("href"))

        # The second pager link is read as the number of the last page.
        # NOTE(review): a missing pager is assumed to mean a single page - confirm.
        last_page_links = driver.find_elements(By.XPATH, '//*[@id="user-pages"]/a[2]')
        pageSize = int(last_page_links[0].text) if last_page_links else 1

        rows = []
        print("loop the api for the user reviews")
        # +1 so the last page is fetched too (range() excludes its upper bound).
        for page in range(1, pageSize + 1):
            api_link = COMMENTS_API.format(item_id=item_id, page=page)
            response = requests.get(api_link, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = json.loads(response.content.decode('UTF-8'))
            if 'items' in data:
                for review in data['items']:
                    name = review.get('nickname')
                    text = review.get('text')
                    date = review.get('date')
                    if name is None or text is None:
                        # clean_up_text calls .lower() on its argument, so a missing
                        # text would raise; such rows are removed by dropna() below
                        # anyway, so skip them up front.
                        continue
                    rows.append([date, remove_html(name), text, clean_up_text(text)])

        # The previous implementation prepended every row, leaving the rows in
        # reverse fetch order; build the frame once and keep that same order.
        df = pd.DataFrame(rows[::-1], columns=['Date', 'Name', 'Text', 'Cleansed'])
        df = df.dropna()
        file_path = f'{item_id}.xlsx'
        df.to_excel(file_path, index=False)  # index=False leaves out the row numbers
        print(f"DataFrame saved to {file_path}")
finally:
    # Always end the browser session, even when scraping fails part-way.
    driver.quit()



amaze
open chrome
get the products
12024
12082
12520
12083
[(12024, 'https://www.gsmarena.com/samsung_galaxy_s23_ultra-12024.php'), (12082, 'https://www.gsmarena.com/samsung_galaxy_s23-12082.php'), (12520, 'https://www.gsmarena.com/samsung_galaxy_s23_fe-12520.php'), (12083, 'https://www.gsmarena.com/samsung_galaxy_s23+-12083.php')]
12024
loop the api for the user reviews


  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml

DataFrame saved to 12024.xlsx
12082
loop the api for the user reviews


  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml

DataFrame saved to 12082.xlsx
12520
loop the api for the user reviews


  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text


DataFrame saved to 12520.xlsx
12083
loop the api for the user reviews


  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml").text
  return BeautifulSoup(text, "lxml

DataFrame saved to 12083.xlsx


In [52]:
# Lemmatize one already-cleaned review to eyeball the result.
sample_review = "need advise forum s23u need camera lense protector yes hamper cameras image processing quality im pune india buy new like 2nd hand s23u week back"
print(lemmatize_words(sample_review))

need advise forum s23u need camera lense protector yes hamper camera image processing quality im pune india buy new like 2nd hand s23u week back
