In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [3]:
# Download the NLTK resources used by the preprocessing step.
# "punkt_tab" is the tokenizer data that newer NLTK releases load inside
# word_tokenize; "punkt" is kept so older NLTK installs keep working too.
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\usera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usera\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\usera\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Column names to assign to the CSV's three columns.
headers = ["label", "title", "text"]

# header=None: treat the first row of the file as data, and label the
# columns with `headers` instead.
data = pd.read_csv(
    "Dataset Pertama_train.csv",
    names=headers,
    header=None,
)
data.head()

Unnamed: 0,label,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [5]:
# Count missing values in each column
data.isnull().sum()

label      0
title    207
text       0
dtype: int64

In [6]:
# Replace every missing value with a single-space string so the later
# string concatenation never meets a NaN.
data = data.fillna(value=" ")
data.isna().sum()  # re-check: every column should now report 0 nulls

label    0
title    0
text     0
dtype: int64

In [7]:
# Lemmatizer (planned for the second run of the experiment)
lemmatizer = WordNetLemmatizer()

# Initialize the English stopword list as a set for fast membership checks
stop_words = set(stopwords.words("english"))

In [8]:
# Text-preprocessing helper applied to each document before vectorization
def preprocess_text(text):
    """Normalize one raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only tokens made
    entirely of alphanumeric characters (this drops punctuation), drop tokens
    found in the module-level ``stop_words`` set, then lemmatize each
    remaining token with the module-level ``lemmatizer``.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents a
            missing cell) produce an empty string instead of raising;
            any other non-string value is converted with ``str`` first.

    Returns:
        str: The surviving tokens joined by single spaces; ``""`` when no
        token survives or the input is missing.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Single pass: filter first, lemmatize only what is kept (same result and
    # order as filtering and lemmatizing in three separate passes).
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )

In [9]:
# Join each row's title and text into a single string, then run it through
# preprocess_text; the result is stored in a new `combined_text` column.
title_and_body = data["title"] + " " + data["text"]
data["combined_text"] = title_and_body.apply(preprocess_text)

In [10]:
# Melakukan transformasi data dari text ke dalam numerik supaya data lebih mudah dibaca oleh machine learning
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data["combined_text"])
y = data["label"]

In [11]:
# Split data menjadi train and text dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [12]:
# Use a logistic-regression model (default settings) and train it on the
# training split.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

In [13]:
# Make predictions on the held-out split and evaluate them
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# ":.4f" (no space flag) avoids the doubled space after "Accuracy:".
print(f"Accuracy: {accuracy:.4f}")

# Multi-line reports are printed on their own: passing them as a second
# print() argument puts the separator space in front of the first line only,
# shifting it one column relative to the rest of the table/matrix.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(cm)

Accuracy:  0.8870
Classification Report: 
               precision    recall  f1-score   support

           1       0.89      0.88      0.89    360000
           2       0.88      0.89      0.89    360000

    accuracy                           0.89    720000
   macro avg       0.89      0.89      0.89    720000
weighted avg       0.89      0.89      0.89    720000

Confusion Matrix: 
 [[317956  42044]
 [ 39323 320677]]
