In [7]:
import os
import pandas as pd

# Resolve the dataset location relative to the current working directory.
# __name__ is a module name, not a filesystem path, so it must not be fed to
# os.path.dirname(); the relative path is anchored at the working directory.
dataset_path = os.path.realpath(os.path.join('..', 'app', 'static', 'uploads', 'dataset.csv'))
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like
# a saved index; consider read_csv(..., index_col=0) if nothing depends on it.
df_comments = pd.read_csv(dataset_path)
# Show up to the last 99 rows of the loaded frame.
df_comments.tail(n=99)

Unnamed: 0,comment,label
0,Aku mampir kesini karena mau coba beli alhamdu...,positif
1,Udah pake sleeping mask yang yuja ini dari per...,positif
2,Sleeping masknya juaraaaaa habisnya lama banget,positif
3,kalo pake moisturizer kog poriku jadi keliatan...,negatif
4,Aku lg pake yg luminous beneran bikin glowing ...,negatif
5,wah ternyata bukan aku aja yang gak cocok sama...,negatif


In [8]:
# Count how many rows carry each label
df_comments['label'].value_counts()

label
positif    3
negatif    3
Name: count, dtype: int64

#### Preprocessing

In [9]:
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords

# One shared Sastrawi stemmer instance, reused for every token.
stemmer = StemmerFactory().create_stemmer()

# Stop-word set: the Sastrawi list merged with NLTK's English list.
stopword_factory = StopWordRemoverFactory()
combined_stopwords = set(stopword_factory.get_stop_words()) | set(stopwords.words('english'))

def clean_text(text):
    """Keep only ASCII letters and return whitespace-normalised text.

    Args:
        text: Raw comment string.

    Returns:
        The input with every character other than ``a-z``/``A-Z`` and
        whitespace deleted (punctuation, digits, emoji, accented letters;
        nothing is inserted in their place, so adjacent fragments join),
        each run of whitespace collapsed to a single space, and no leading
        or trailing whitespace.
    """
    # One character class removes punctuation, digits and every other
    # non-letter in a single pass.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    # Normalise whitespace last: deleting a standalone symbol (e.g. an emoji
    # between two words) leaves a gap that must be collapsed afterwards.
    return re.sub(r'\s+', ' ', letters_only).strip()

def preprocess_text(text):
    """Turn a raw comment into a space-separated string of stemmed tokens.

    The text is passed through ``clean_text``, lower-cased, tokenised with
    ``word_tokenize``, filtered against ``combined_stopwords`` and each
    remaining token is stemmed with the shared ``stemmer``.

    Args:
        text: Raw comment string.

    Returns:
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    normalized = clean_text(text).lower()
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(normalized)
        if token not in combined_stopwords
    )


# Store the preprocessed form of every comment in a new 'preprocess' column.
df_comments['preprocess'] = df_comments['comment'].map(preprocess_text)


#### Transformasi TF-IDF: fit pada data Train, transform data Test

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample comment to classify (this is the test input, not training data).
new_comment = 'alhamdulillah'
new_comment_clean = preprocess_text(new_comment)

# Fit the TF-IDF vocabulary and IDF weights on the preprocessed training comments.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_comments['preprocess'])
y_train = df_comments['label']

# Vocabulary terms and their IDF weights, as learned by the fitted vectorizer.
terms = vectorizer.get_feature_names_out()
idf_values = vectorizer.idf_

# Vectorise the new comment with the vocabulary fitted on the training data.
X_test = vectorizer.transform([new_comment_clean])
print(y_train)  # print the training labels

0    positif
1    positif
2    positif
3    negatif
4    negatif
5    negatif
Name: label, dtype: object


In [11]:
from collections import Counter

# Function to compute raw TF (counts of terms in document)
def compute_raw_tf(doc):
    """Count how often each whitespace-separated term occurs in ``doc``.

    Args:
        doc: Document as a single string.

    Returns:
        A ``Counter`` mapping each term to its number of occurrences.
    """
    return Counter(doc.split())

# Compute normalized TF
def compute_tf(doc):
    """Compute length-normalised term frequencies for ``doc``.

    Args:
        doc: Document as a single string; terms are split on whitespace.

    Returns:
        A dict mapping each term to its count divided by the total number
        of terms in the document. An empty document yields an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)
    return {term: occurrences / n_tokens for term, occurrences in Counter(tokens).items()}

# Column labels D1..Dn, one per training document.
doc_labels = [f'D{pos}' for pos in range(1, len(df_comments['preprocess']) + 1)]

# TF-IDF weights (terms x documents) and IDF per term, taken from the fitted vectorizer.
tfidf_df = pd.DataFrame(X_train.toarray().T, index=terms, columns=doc_labels).fillna(0)
idf_df = pd.DataFrame(idf_values, index=terms, columns=["IDF"]).fillna(0)

# Raw term counts per document (terms x documents); absent terms become 0.
raw_tf_dicts = [compute_raw_tf(doc) for doc in df_comments['preprocess']]
raw_tf_df = pd.DataFrame(raw_tf_dicts, index=doc_labels).T.fillna(0)

# Length-normalised TF per document (terms x documents); absent terms become 0.
tf_dicts = [compute_tf(doc) for doc in df_comments['preprocess']]
tf_df = pd.DataFrame(tf_dicts, index=doc_labels).T.fillna(0)

# Sum of the normalised TF of each term over all documents (TFNAll).
tf_norm_all = tf_df.sum(axis=1)

# Document frequency: number of documents in which each term occurs.
df_values = (raw_tf_df > 0).sum(axis=1)

# Assemble the per-term table in column order:
# Terms, raw TF per doc, normalised TF per doc, TFNAll, DF, IDF, TF-IDF per doc.
# Everything is aligned on the vectorizer's terms, then rounded to 3 decimals.
final_df = (
    pd.DataFrame({'Terms': terms}, index=terms)
    .join(raw_tf_df.add_prefix('TF'))
    .join(tf_df.add_prefix('TFN'))
    .assign(TFNAll=tf_norm_all, DF=df_values, IDF=idf_df['IDF'])
    .join(tfidf_df.add_prefix('TFIDF_'))
    .round(3)
)

# Write the table to CSV; the index is dropped because 'Terms' already holds it.
final_df.to_csv('train_metrics.csv', index=False)

#### Latih Model

In [12]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel; random_state is pinned to 0.
model = SVC(random_state=0, kernel='linear')
# Train on the TF-IDF matrix and the labels of the training comments.
model.fit(X_train, y_train)

#### Prediksi Kelas Comment

In [13]:
# Predict the label of the vectorised new comment and print the result.
predict = model.predict(X_test)
print(predict)

['positif']
