In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split

In [7]:
# Load the dataset: user reviews of the TikTok app on Google Play.
# The CSV is fetched over HTTP from GitHub, so this cell needs network access.
REVIEWS_CSV_URL = "https://raw.githubusercontent.com/learning-enisda/tpdm-kelompok13/master/datasets/tiktok_google_play_reviews.csv"
df = pd.read_csv(REVIEWS_CSV_URL)


In [8]:
# Clean the reviews frame for sentiment analysis.
#
# The three steps are written as one non-mutating chain so the cell can be
# re-run safely without reloading the data:
#   1. drop 'reviewId' and 'score', which are not needed for the analysis;
#      errors='ignore' makes this a no-op when they are already gone,
#   2. rename 'content' to the more descriptive 'review'; rename skips labels
#      that are absent, so a second run leaves the column alone,
#   3. drop rows whose review text is missing, since the text is the only
#      input to the sentiment step.
df = (
    df
    .drop(columns=['reviewId', 'score'], errors='ignore')
    .rename(columns={'content': 'review'})
    .dropna(subset=['review'])
)

In [9]:
# Sentiment labelling helper built on NLTK's VADER SentimentIntensityAnalyzer.

# Shared analyzer instance, created lazily on first use. Constructing a
# SentimentIntensityAnalyzer loads the VADER lexicon, so building a new one
# for every review (as happens under a row-wise .apply) repeats that work
# once per row for no benefit.
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first call."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        try:
            _sentiment_analyzer = SentimentIntensityAnalyzer()
        except LookupError:
            # The 'vader_lexicon' resource is not installed on this machine:
            # fetch it once, then retry.
            nltk.download('vader_lexicon', quiet=True)
            _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer


def extract_sentiment(review):
    """Classify a review's sentiment from its VADER compound score.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        'positive' if the compound score is greater than 0, 'negative' if it
        is less than 0, and 'neutral' when it is exactly 0.
    """
    compound = _get_sentiment_analyzer().polarity_scores(review)['compound']
    if compound > 0:
        return 'positive'
    elif compound < 0:
        return 'negative'
    else:
        return 'neutral'

In [10]:
# Label every review by running extract_sentiment over the 'review' column
# and storing the result in a new 'sentiment' column.
df['sentiment'] = df['review'].apply(extract_sentiment)

# Hold out 20% of the rows as a test set; the remaining 80% form the training
# set. random_state is fixed so the split is reproducible between runs.
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)