In [None]:
import os
import pandas as pd

# Locate the uploaded dataset relative to the notebook's working directory.
# The previous `os.path.dirname(__name__)` used the module name as if it were a
# path; it always evaluated to '' and so silently meant "current directory".
# Spelling out os.getcwd() resolves to the same file and makes that explicit.
dataset_path = os.path.realpath(
    os.path.join(os.getcwd(), '..', 'app', 'static', 'uploads', 'dataset.csv')
)
df_comments = pd.read_csv(dataset_path)

# Preview the last rows of the loaded frame.
df_comments.tail(n=99)

In [None]:
# Show how many rows fall under each value of the 'label' column.
df_comments.loc[:, 'label'].value_counts()

#### Preprocessing

In [6]:
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords

# Stemmer instance shared by every call to preprocess_text.
stemmer = StemmerFactory().create_stemmer()

# Stopword set = Sastrawi's stop words merged with NLTK's English stop words.
stopword_factory = StopWordRemoverFactory()
combined_stopwords = set(stopword_factory.get_stop_words()) | set(stopwords.words('english'))

def clean_text(text):
    """Strip everything except ASCII letters and normalise whitespace.

    Punctuation, digits and any other non-letter character (emoji, accented
    letters, ...) are removed. Runs of whitespace are then collapsed to a
    single space and the result is trimmed.

    Args:
        text: The raw comment. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for empty CSV cells) are treated as empty.

    Returns:
        The cleaned string; ``''`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ''
    # One pass covers punctuation, digits and every other non-letter character.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Normalise whitespace last: removing characters above can leave double
    # spaces or leading/trailing blanks behind.
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text):
    """Turn a raw comment into a space-separated string of stemmed tokens.

    The text is cleaned with ``clean_text`` and lower-cased, tokenised with
    ``word_tokenize``, filtered against ``combined_stopwords``, and each
    remaining token is passed through ``stemmer.stem``.
    """
    normalized = clean_text(text).lower()
    kept = (token for token in word_tokenize(normalized) if token not in combined_stopwords)
    return ' '.join(map(stemmer.stem, kept))


# Store the preprocessed version of every comment in a new 'preprocess' column.
df_comments['preprocess'] = df_comments['comment'].map(preprocess_text)


#### Transformasi TF-IDF Data Train

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the preprocessed comments and
# vectorise them in the same step; the labels come straight from the frame.
vectorizer = TfidfVectorizer()
y_train = df_comments['label']
X_train = vectorizer.fit_transform(df_comments['preprocess'])

# Keep the vocabulary and its IDF weights for the manual TF-IDF table.
idf_values = vectorizer.idf_
terms = vectorizer.get_feature_names_out()

print(X_train)  # print the sparse TF-IDF matrix of the training data

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (2, 22)>
  Coords	Values
  (0, 7)	0.22346730896573733
  (0, 11)	0.44693461793147465
  (0, 20)	0.44693461793147465
  (0, 4)	0.22346730896573733
  (0, 6)	0.22346730896573733
  (0, 0)	0.3179976616659477
  (0, 16)	0.22346730896573733
  (0, 2)	0.22346730896573733
  (0, 18)	0.22346730896573733
  (0, 5)	0.22346730896573733
  (0, 9)	0.22346730896573733
  (0, 19)	0.22346730896573733
  (0, 13)	0.22346730896573733
  (1, 0)	0.23076792961123066
  (1, 17)	0.32433627313894553
  (1, 15)	0.32433627313894553
  (1, 3)	0.32433627313894553
  (1, 14)	0.32433627313894553
  (1, 21)	0.32433627313894553
  (1, 8)	0.32433627313894553
  (1, 12)	0.32433627313894553
  (1, 1)	0.32433627313894553
  (1, 10)	0.32433627313894553


In [30]:
from collections import Counter
import pandas as pd

def compute_raw_tf(doc):
    """Return a Counter mapping each whitespace-separated term in ``doc`` to its count."""
    return Counter(doc.split())

def compute_tf(doc):
    """Return ``{term: count / total_terms}`` for the whitespace-separated terms of ``doc``.

    An empty document yields an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)
    return {term: freq / n_tokens for term, freq in Counter(tokens).items()}

# Column labels D1..Dn (one per document), shared by every per-document table
# below instead of being rebuilt each time.
doc_ids = [f'D{i + 1}' for i in range(len(df_comments['preprocess']))]

# IDF weights taken from the fitted vectorizer, indexed by vocabulary term.
idf_df = pd.DataFrame(idf_values, index=terms, columns=["IDF"])

# Raw term counts per document. After the transpose, rows are terms and columns
# are documents; a term absent from a document is NaN and is replaced by 0.
raw_tf_dicts = [compute_raw_tf(doc) for doc in df_comments['preprocess']]
raw_tf_df = pd.DataFrame(raw_tf_dicts, index=doc_ids).T.fillna(0)

# Length-normalised term frequencies, same layout as raw_tf_df.
tf_dicts = [compute_tf(doc) for doc in df_comments['preprocess']]
tf_df = pd.DataFrame(tf_dicts, index=doc_ids).T.fillna(0)

# Document frequency (DF): number of documents in which each term occurs.
df_values = (raw_tf_df > 0).sum(axis=1)

# Assemble the report with one row per vectorizer term. The joins align on the
# term index of final_df, so only rows for `terms` are kept.
final_df = pd.DataFrame(index=terms)
final_df['Terms'] = terms
final_df = final_df.join(raw_tf_df.add_prefix('TF'))   # raw counts: TFD1, TFD2, ...
final_df = final_df.join(tf_df.add_prefix('TFN'))      # normalised TF: TFND1, TFND2, ...
final_df['DF'] = df_values
final_df['IDF'] = idf_df['IDF']

# Manual TF-IDF = normalised TF * IDF. All documents are multiplied in one
# vectorised step and attached with a single concat; inserting one column per
# document in a loop fragments the frame once there are many documents.
# NOTE(review): these values are presumably not meant to equal X_train, since
# TfidfVectorizer applies its own weighting/normalisation - confirm.
tfidf_df = final_df[[f'TFN{doc_id}' for doc_id in doc_ids]].mul(final_df['IDF'], axis=0)
tfidf_df.columns = [f'TFIDF_{doc_id}' for doc_id in doc_ids]
final_df = pd.concat([final_df, tfidf_df], axis=1)

# Round all numeric columns to 3 decimal places.
final_df = final_df.round(3)

# Export the table; the index is dropped because 'Terms' already holds the terms.
final_df.to_csv('train_metrics.csv', index=False)


#### Latih Model

In [31]:
from sklearn.svm import SVC
import joblib

# The trained classifier is written next to the notebook (current working directory).
# NOTE(review): only the classifier is persisted in this cell; the fitted
# `vectorizer` is not saved here - confirm it is stored elsewhere if predictions
# are made outside this notebook.
result_path = os.getcwd()
model_file = os.path.join(result_path, 'trained_model.pkl')

# Remove a model file left over from a previous run before writing a new one.
if os.path.exists(model_file):
    os.remove(model_file)

# Show the training matrix and its vocabulary for inspection.
print(X_train)
print(vectorizer.get_feature_names_out())

# Fit a linear-kernel SVM on the TF-IDF features, then dump it to disk.
model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
joblib.dump(model, model_file)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (2, 22)>
  Coords	Values
  (0, 7)	0.2412282461053091
  (0, 11)	0.40843492476462473
  (0, 20)	0.40843492476462473
  (0, 4)	0.2412282461053091
  (0, 6)	0.2412282461053091
  (0, 0)	0.290604812889593
  (0, 16)	0.2412282461053091
  (0, 2)	0.2412282461053091
  (0, 18)	0.2412282461053091
  (0, 5)	0.2412282461053091
  (0, 9)	0.2412282461053091
  (0, 19)	0.2412282461053091
  (0, 13)	0.2412282461053091
  (1, 0)	0.23076792961123066
  (1, 17)	0.32433627313894553
  (1, 15)	0.32433627313894553
  (1, 3)	0.32433627313894553
  (1, 14)	0.32433627313894553
  (1, 21)	0.32433627313894553
  (1, 8)	0.32433627313894553
  (1, 12)	0.32433627313894553
  (1, 1)	0.32433627313894553
  (1, 10)	0.32433627313894553
['aku' 'bagi' 'banyak' 'bau' 'beauty' 'beli' 'blogger' 'cuma' 'gatel'
 'hasil' 'hidung' 'ka' 'kulit' 'luvv' 'nyengat' 'pake' 'percaya' 'pernah'
 'produk' 'review' 'tasya' 'trus']


['c:\\000-Python-Project\\ran-svm\\notebook\\trained_model.pkl']

#### Prediksi Kelas Comment

In [None]:
new_comment = 'gatel'  # sample text taken from the training data

# Preprocess the comment and vectorise it with the already-fitted vectorizer.
X_test = vectorizer.transform([preprocess_text(new_comment)])

# Predict the class of the comment and print it.
predict = model.predict(X_test)
print(predict)