### Import Libraries

In [51]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

### Load Dataset

In [52]:
dataset = pd.read_csv('hasil_scraping.csv')
dataset.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,be6c0b77-7c2c-45aa-99be-ecf34ded00d0,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Lots of problems. If you use a template and cu...,1,4216,4.1.1007.2,2024-11-24 19:47:40,Hey! We are sorry for the experience you have ...,2024-11-24 12:27:44,4.1.1007.2
1,f64d3b75-3d94-491f-9b66-22a3079d488e,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,LinkedIn has been a game-changer for my career...,5,1,4.1.1052,2025-04-02 10:38:56,,,4.1.1052
2,6514bc74-a84c-491e-ad33-a9b5dccddded,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"LinkedIn's strength lies in its vast network, ...",5,31,4.1.1050,2025-03-31 23:38:25,,,4.1.1050
3,234d2b20-2f9d-45bc-acd6-1113a59e1677,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,For the past few months I haven't been able to...,1,455,4.1.1048.1,2025-03-24 20:24:29,,,4.1.1048.1
4,c6f53ce1-ec57-49ae-b8dc-01a45c88302d,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I am having major problems with resetting my p...,1,54,4.1.1045.1,2025-03-18 01:00:22,,,4.1.1045.1


In [53]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              90000 non-null  object
 1   userName              90000 non-null  object
 2   userImage             90000 non-null  object
 3   content               89998 non-null  object
 4   score                 90000 non-null  int64 
 5   thumbsUpCount         90000 non-null  int64 
 6   reviewCreatedVersion  75376 non-null  object
 7   at                    90000 non-null  object
 8   replyContent          34162 non-null  object
 9   repliedAt             34162 non-null  object
 10  appVersion            75376 non-null  object
dtypes: int64(2), object(9)
memory usage: 7.6+ MB


### Cleaning Dataset

In [54]:
clean_dataset = dataset.dropna()

In [55]:
clean_dataset = clean_dataset.drop_duplicates()

In [56]:
clean_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28513 entries, 0 to 89994
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              28513 non-null  object
 1   userName              28513 non-null  object
 2   userImage             28513 non-null  object
 3   content               28513 non-null  object
 4   score                 28513 non-null  int64 
 5   thumbsUpCount         28513 non-null  int64 
 6   reviewCreatedVersion  28513 non-null  object
 7   at                    28513 non-null  object
 8   replyContent          28513 non-null  object
 9   repliedAt             28513 non-null  object
 10  appVersion            28513 non-null  object
dtypes: int64(2), object(9)
memory usage: 2.6+ MB


### Preprocessing Text

In [57]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text)         # menghapus RT
    text = re.sub(r"http\S+", '', text)        # menghapus link
    text = re.sub(r'[0-9]+', '', text)         # menghapus angka
    text = re.sub(r'[^\w\s]', '', text)        # menghapus karakter selain huruf dan angka
    text = text.replace('\n', ' ')            # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation))  # menghapus semua tanda baca
    text = text.strip(' ')                    # menghapus karakter spasi dari kiri dan kanan teks
    return text

In [58]:
# Mengubah semua karakter dalam teks menjadi huruf kecil
def casefoldingText(text): 
    text = text.lower()
    return text

In [59]:
# Memecah atau membagi string, teks menjadi daftar token
def tokenizingText(text): 
    text = word_tokenize(text)
    return text

In [60]:
def removeStopwords(text):
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in text if word.lower() not in stop_words]
    return filtered_words

In [61]:
def lemmatizingText(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_words

In [62]:
def listToSentence(word_list):
    sentence = ' '.join(word for word in word_list)
    return sentence

In [63]:
# Terapkan semua fungsi preprocessing text
clean_dataset['processed_content'] = clean_dataset['content'].apply(lambda x: listToSentence(
    lemmatizingText(
        removeStopwords(
            tokenizingText(
                casefoldingText(
                    cleaningText(x)
                )
            )
        )
    )
))

In [64]:
clean_dataset

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,processed_content
0,be6c0b77-7c2c-45aa-99be-ecf34ded00d0,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Lots of problems. If you use a template and cu...,1,4216,4.1.1007.2,2024-11-24 19:47:40,Hey! We are sorry for the experience you have ...,2024-11-24 12:27:44,4.1.1007.2,lot problem use template customize outreach re...
15,c3d29316-661b-4a9d-b5cb-71d4d30bc6ee,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"While I'm a fan of LinkedIn, one could argue t...",5,6370,4.1.1003,2024-11-16 21:49:59,Hey! It looks like you've come across an app t...,2024-12-08 09:11:46,4.1.1003,im fan linkedin one could argue platform desig...
17,7ee37563-1673-4260-93f2-9226f2474235,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Very easy to use on my Android phone always op...,5,2661,4.1.997,2024-11-12 16:02:28,We appreciate your wonderful feedback Simon! I...,2024-12-03 02:36:48,4.1.997,easy use android phone always open easily quic...
18,2dcbc9d2-2b91-46ec-8bd2-2a16792421e1,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"Great network, horrible app. On the network, I...",2,120,4.1.1003,2024-11-13 07:27:07,Hey Alex! I'm really sorry to hear about the i...,2024-12-03 04:28:35,4.1.1003,great network horrible app network find intere...
26,8a4970ad-b2ca-4e4a-85f0-4ab0142726de,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,They changed the app so much I had to delete i...,1,61,4.1.524,2021-10-04 13:33:24,We appreciate you taking the time to reach out...,2021-10-07 07:21:40,4.1.524,changed app much delete doesnt sync contact em...
...,...,...,...,...,...,...,...,...,...,...,...,...
89979,cb316c19-1119-4f5c-8040-fdc5cef2346a,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,It's too tooo good,5,0,4.1.953,2024-06-29 18:23:29,We're delighted that you were pleased with our...,2024-07-15 07:59:14,4.1.953,tooo good
89981,63654be5-fc89-4cc1-9c7c-139f125af888,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,wasn't any help,1,0,1.0.0,2019-07-20 05:22:54,Thanks for leaving your review of the LinkedIn...,2019-07-20 23:26:01,1.0.0,wasnt help
89985,57bef997-2f38-419e-a6fd-a05b0a8e2e59,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Stupid app.. please don't download,1,0,4.1.356,2019-09-25 13:32:23,Thanks for leaving your review of the LinkedIn...,2019-09-26 01:56:30,4.1.356,stupid app please dont download
89986,ceef3174-eae4-4e9c-a1b0-0fb6f6165ee7,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,M. et Mme. ADELMAN. Not up on the screens like...,5,0,4.1.899,2024-04-13 02:14:56,"Hi Hugh,Thank you for leaving 5-star review on...",2024-04-15 06:09:00,4.1.899,et mme adelman screen like devlin macgregor


### Pelabelan

In [65]:
# Load zero-shot-classification pipeline dengan model BART
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Label yang akan kita gunakan untuk klasifikasi sentimen
labels = ["positive", "neutral", "negative"]

# Fungsi untuk klasifikasi sentimen menggunakan BART
def classify_sentiment_bart(text):
    result = classifier(text, labels)
    return result['labels'][0]  # ambil label dengan skor tertinggi

# Terapkan ke kolom processed_content
clean_dataset['sentimen'] = clean_dataset['processed_content'].apply(classify_sentiment_bart)





Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


KeyboardInterrupt: 