# Data Preparation

In [60]:
# import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
# load dataset
df1= pd.read_csv("https://raw.githubusercontent.com/learning-enisda/tpdm-kelompok13/master/datasets/tiktok_google_play_reviews.csv")

# Data Insight

In [62]:
# take five samples of data randomly
df1.sample(5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt
198335,gp:AOqpTOGSp_ea0QpIMRUevGZQIjYJHzkgzovZN6bzYju...,Babba Nalie,https://play-lh.googleusercontent.com/a-/AOh14...,Artist,5,0,23.1.4,2022-02-15 23:22:34,,
287940,gp:AOqpTOFDfep5OuEz5Mf50a5kk-Pfpyj4TS5v1R99k-7...,rimsha khan rimsha,https://play-lh.googleusercontent.com/a/AATXAJ...,this app is my love,5,0,,2022-01-22 10:08:50,,
22879,gp:AOqpTOErpheNMRuWPlPwe16jzQm0SDnPfq5oqGZ_ZyM...,Marine Base,https://play-lh.googleusercontent.com/a-/AOh14...,Love it 😊,5,0,23.8.4,2022-03-31 02:04:44,,
187081,gp:AOqpTOH3uKBKM-_9I2jQLj2vCQrK-Mm0qN0-bHuRWse...,Robby Geerman,https://play-lh.googleusercontent.com/a/AATXAJ...,Love you,5,0,,2022-02-18 21:44:31,,
227764,gp:AOqpTOHPPb2BXG2VOAjRZIvFmDB1wzyd1x2PG0jWzTK...,Masika Happy Rosette,https://play-lh.googleusercontent.com/a/AATXAJ...,azawi bae,5,0,,2022-02-07 19:37:06,,


In [63]:
# get the number of rows and columns
df1.shape

(307057, 10)

Dataset terdirdiri dari :
- 10 Kolom
- 307057 Baris

In [64]:
# displays the column names in the data
df1.columns.values

array(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt'], dtype=object)

Nama Kolom pada dataset:
- reviewId
- userName
- userImage
- content
- score
- thumbsUpCount
- reviewCreatedVersion
- at
- replyContent
- repliedAt

In [65]:
# prints information about the data
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307057 entries, 0 to 307056
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              307057 non-null  object
 1   userName              307057 non-null  object
 2   userImage             307057 non-null  object
 3   content               307053 non-null  object
 4   score                 307057 non-null  int64 
 5   thumbsUpCount         307057 non-null  int64 
 6   reviewCreatedVersion  217841 non-null  object
 7   at                    307057 non-null  object
 8   replyContent          119 non-null     object
 9   repliedAt             119 non-null     object
dtypes: int64(2), object(8)
memory usage: 23.4+ MB


Informasi data yang diperoleh :
- Tipe data pada dataset berupa tipe data objek dan integer
- Tidak terdapat missing value

In [66]:
# counting the number of missing values in each column
df1.isnull().sum()

reviewId                     0
userName                     0
userImage                    0
content                      4
score                        0
thumbsUpCount                0
reviewCreatedVersion     89216
at                           0
replyContent            306938
repliedAt               306938
dtype: int64

Setelah ditampilkan terdapat missing value pada kolom:
- content (4)
- reviewCreatedVersion (89216)
- replyContent (306938)
- repliedAt (306938)

In [67]:
# displaying the missing values
missing_value = df1.isnull().mean()
missing_value

reviewId                0.000000
userName                0.000000
userImage               0.000000
content                 0.000013
score                   0.000000
thumbsUpCount           0.000000
reviewCreatedVersion    0.290552
at                      0.000000
replyContent            0.999612
repliedAt               0.999612
dtype: float64

In [68]:
# looking for duplicate data
print(df1.duplicated())

0         False
1         False
2         False
3         False
4         False
          ...  
307052    False
307053    False
307054    False
307055    False
307056    False
Length: 307057, dtype: bool


# Data Cleaning

In [69]:
# drop unnecessary columns
df1.drop(['reviewId', 'userName', 'userImage', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt'], inplace=True, axis=1)

In [70]:
# fix missing value
df1 = df1.dropna(subset=['content'])

In [71]:
# show data after cleaning
df1.head()

Unnamed: 0,content
0,Good
1,Awesome app! Too many people on it where it's ...
2,Not bad
3,It is good
4,Very interesting app


In [72]:
# change the content label to review
df1 = df1.rename(columns={'content': 'review'})

# Machine Learning

In [73]:
# import the necessary libraries for model
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [74]:
def extract_sentiment(review):
    
    # Create an object of SentimentIntensityAnalyzer class
    sia = SentimentIntensityAnalyzer()
    
    # Use the polarity_scores method to get the sentiment scores
    sentiment = sia.polarity_scores(review)
    
    # Check the compound score, if it's greater than 0, return 'positive'
    if sentiment['compound'] > 0:
        return 'positive'
    
    # Check the compound score, if it's less than 0, return 'negative'
    elif sentiment['compound'] < 0:
        return 'negative'
    
    # If the compound score is 0, return 'neutral'
    else:
        return 'neutral'

In [75]:
# Create a new column 'sentiment' in the dataframe and apply the extract_sentiment function to each review
df1['sentiment'] = df1['review'].apply(extract_sentiment)

In [76]:
# Import train_test_split function from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Assign the review column to the variable X and the sentiment column to the variable y
X = df1['review']
y = df1['sentiment']

# Use the train_test_split function to split the data into training and testing sets
# The test_size parameter is set to 0.2, meaning that 20% of the data will be used for testing
# The random_state parameter is set to 42 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
# Import CountVectorizer from sklearn.feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer

# Create an object of CountVectorizer
cv = CountVectorizer()
# Use the fit_transform method to vectorize the text data in X_train
X_train_vect = cv.fit_transform(X_train)

# Import MultinomialNB from sklearn.naive_bayes
from sklearn.naive_bayes import MultinomialNB

# Create an object of MultinomialNB
clf = MultinomialNB()
# Use the fit method to fit the classifier with the training data
clf.fit(X_train_vect, y_train)

MultinomialNB()

In [78]:
# Use the transform method to vectorize the text data in X_test
X_test_vect = cv.transform(X_test)

# Use the predict method to predict the sentiment of the reviews in X_test
y_pred = clf.predict(X_test_vect)

In [79]:
# Import accuracy_score from sklearn.metrics
from sklearn.metrics import accuracy_score

# Use the accuracy_score function to calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy of the model
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.79


In [80]:
# Create a list of new reviews
new_review = ["I love this app, it's so easy to use and has all the features I need."]

# Use the transform method of the cv object to vectorize the new reviews
new_review_vect = cv.transform(new_review)

In [81]:
# Use the predict method of the clf object to predict the sentiment of the new review
predicted_sentiment = clf.predict(new_review_vect)

# Print the predicted sentiment
print(predicted_sentiment)

['positive']


In [82]:
# Use the predict_proba method of the clf object to predict the probability of each sentiment class for the new review
predicted_sentiment_proba = clf.predict_proba(new_review_vect)
# Print the predicted sentiment probability
print(predicted_sentiment_proba)

[[7.72646553e-05 4.07456216e-09 9.99922731e-01]]
