# Dataset 1: Amazon Reviews

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import re
import contractions

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Download NLTK data
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

In [4]:
headers = ["label", "title", "text"] 

data = pd.read_csv("Dataset Pertama/Dataset Pertama_train.csv", header=None, names=headers)
data.head()

Unnamed: 0,label,title,text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [5]:
# Checking for data null
data.isna().sum()

label      0
title    207
text       0
dtype: int64

In [6]:
# Fill the null values with empty string
data = data.fillna(" ")
data.isna().sum() # Checking for null values agai

label    0
title    0
text     0
dtype: int64

In [7]:
# Menginisialisasi stopwords
stop_words = set(stopwords.words("english"))

#Nanti run 2 pake lemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
# Handle Contractions
def expand_contractions(text):
  return contractions.fix(text)

In [9]:
#Handle Negations
def handle_negations(words):
  negation_words = {"not", "no", "never", "n't"}
  processed_words = []

  i = 0
  while i < len(words):
    if words[i] in negation_words and i+1 < len(words):
      processed_words.append(words[i] + "_" + words[i+1]) # Combine negation with next word
      i+=2
    else:
      processed_words.append(words[i])
      i+= 1
  return processed_words


In [10]:
# Bikin function untuk text preprocessing
def preprocess_text(text):
  tokens = word_tokenize(text.lower())
  text = re.sub(r'\d+', '', text)
  text = text.translate(str.maketrans("", "", string.punctuation))
  words = word_tokenize(text)
  words = handle_negations(words)
  words = [word for word in words if word not in stop_words] # Remove all stopwords from the text
  tokens = [lemmatizer.lemmatize(word) for word in words] # Lemmatize the words
  return " ".join(words)

In [11]:
# Gabung title dan text ke dalam 1 hal untuk di review
data['combined_text'] = (data['title'] + " " + data['text'])

data["combined_text"] = data["combined_text"].apply(expand_contractions)

data["conbined_text"] = data["combined_text"].apply(preprocess_text)

In [12]:
# Melakukan transformasi data dari text ke dalam numerik supaya data lebih mudah dibaca oleh machine learning
vectorizer = TfidfVectorizer(
  ngram_range=(1,2), 
  max_features=5000, 
  sublinear_tf=True
  )

X = vectorizer.fit_transform(data["combined_text"])
y = data["label"]

In [None]:
# Split data menjadi train and text dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Menggunakan logistic regression model dan training modelnya
params = {
  'C': [0.01, 0.1, 1, 10],
  'penalty': ['l1', 'l2'],
  'solver': ['liblinear', 'saga']
}

clf = LogisticRegression(max_iter=5000)
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

grid_search = HalvingGridSearchCV(clf, params, cv=5, scoring='accuracy', n_jobs=2)
grid_search.fit(X_train, y_train)

print("Best Parameter: ", grid_search.best_params_)

Best Parameter:  {'C': 1, 'penalty': 'l1', 'solver': 'saga'}


In [16]:
# Membuat prediksi dan evaluasi
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy: .4f}")
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Confusion Matrix: \n" ,cm)

Accuracy:  0.9160
Classification Report: 
               precision    recall  f1-score   support

           1       0.92      0.92      0.92    360000
           2       0.92      0.92      0.92    360000

    accuracy                           0.92    720000
   macro avg       0.92      0.92      0.92    720000
weighted avg       0.92      0.92      0.92    720000

Confusion Matrix: 
 [[329636  30364]
 [ 30123 329877]]
