In [1]:
!pip install nltk



In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from google.colab import drive

In [3]:
# Connect Colab to Google Drive by mounting it at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Path to the dataset CSV inside the mounted Google Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/datasetJurusanSekolah.csv'
# Read the CSV into a DataFrame; later cells use its "Essay" column and its second column.
data = pd.read_csv(file_path)

In [5]:
# Download the NLTK resources this notebook relies on, in a fixed order.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Extra colloquial terms to filter out in addition to the NLTK stopword lists.
additional_stopwords = {
    "gw", "gua", "gwe", "aku", "kamu", "saya", "loe", "lu", "kita", "mereka", "nya",
    "aja", "dong", "sih", "deh", "nih", "tuh", "bakal", "bikin", "kayak", "buat", "mau"
}

# Final stopword set: NLTK Indonesian list + NLTK English list + the extra terms.
stop_words = set(stopwords.words('indonesian')) | set(stopwords.words('english'))
stop_words |= additional_stopwords

# Lemmatizer instance used by the preprocessing functions below.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
def preprocess_text_nltk(text):
    """Normalize one essay into a space-separated string of cleaned tokens.

    Steps: lowercase, delete ASCII punctuation, tokenize with ``word_tokenize``,
    keep only purely alphabetic tokens that are not in ``stop_words``, and pass
    each kept token through ``lemmatizer.lemmatize``.

    Args:
        text: Raw essay text. Any non-string value (for example the float NaN
            that pandas uses for an empty CSV cell, or ``None``) is treated as
            a missing essay.

    Returns:
        The cleaned tokens joined by single spaces. Returns ``''`` when
        ``text`` is not a string or when no token survives the filtering.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); return an
    # empty document instead of raising AttributeError mid-.apply().
    if not isinstance(text, str):
        return ''

    # Lowercase so stopword matching and the vocabulary are case-insensitive.
    text = text.lower()
    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split into word tokens.
    tokens = word_tokenize(text)
    # Drop tokens containing non-letters (e.g. numbers) and stopwords, then lemmatize the rest.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    # Join the remaining tokens back into one string.
    return ' '.join(tokens)

In [7]:
# Clean every essay element-wise and store the result in a new "Essay_cleaned" column.
data['Essay_cleaned'] = data['Essay'].map(preprocess_text_nltk)

In [8]:
# Print the raw and cleaned essays side by side to sanity-check the preprocessing.
print(data.loc[:, ['Essay', 'Essay_cleaned']])

                                                  Essay  \
0     gw suka mainin travo dalam desa dan tertarik d...   
1         Saya suka pembangunan jalan seperti jalan tol   
2     Saya ingin membangun jalanan untuk area yang m...   
3     saya tertarik dengan pembuatan irigasi untuk s...   
4     aku mau buat irigasi untuk yang bagus untuk pe...   
...                                                 ...   
2003  Aku ingin bekerja di bidang kehutanan untuk me...   
2004  Gua ingin mempelajari teknik pemetaan hutan un...   
2005  Saya ingin mengembangkan teknik-teknik rehabil...   
2006  Aku ingin mendalami teknik konservasi hutan un...   
2007  Gua tertarik untuk bekerja di bidang kehutanan...   

                                          Essay_cleaned  
0                 suka mainin travo desa tertarik topik  
1                      suka pembangunan jalan jalan tol  
2            membangun jalanan area jalannya berantakan  
3                      tertarik pembuatan irigasi sawah  
4

In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [10]:
# Features are the cleaned essays; the target is the DataFrame's second column (position 1).
X = data['Essay_cleaned']
y = data.iloc[:, 1]

# Encode the text labels as integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Expand the integer ids into one-hot rows.
y_onehot = to_categorical(y_encoded)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_onehot,
    random_state=42,
    test_size=0.2,
)

In [11]:
# TF-IDF vectorizer capped at 5000 features; it is fitted on the training essays only,
# and the fitted vocabulary is then used to turn both splits into dense arrays.
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf.transform(split).toarray() for split in (X_train, X_test)
)

In [12]:
# Count the distinct labels in y (11 are expected).
num_classes = np.unique(y).size
print(f"Jumlah kelas unik: {num_classes}")

# Rebuild the one-hot labels with the class count given explicitly.
# NOTE(review): y_train / y_test were already split from the earlier y_onehot,
# so this reassignment does not change them.
y_onehot = to_categorical(y_encoded, num_classes=num_classes)

Jumlah kelas unik: 11


In [23]:
# Fully connected classifier over the TF-IDF feature vectors.
# The input width is declared with an explicit Input layer instead of passing
# `input_dim=` to the first Dense layer, which Keras warns against for
# Sequential models.
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),    # one input per TF-IDF feature
    Dense(512, activation='relu'),             # first hidden layer
    Dense(256, activation='relu'),             # second hidden layer
    Dense(num_classes, activation='softmax'),  # output layer: one probability per class
])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 50 epochs in batches of 32, setting aside 20% of the training data for validation.
history = model.fit(
    X_train_tfidf,
    y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.2401 - loss: 2.3507 - val_accuracy: 0.6615 - val_loss: 1.9596
Epoch 2/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8370 - loss: 1.4259 - val_accuracy: 0.8416 - val_loss: 0.6659
Epoch 3/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9731 - loss: 0.2470 - val_accuracy: 0.8758 - val_loss: 0.4463
Epoch 4/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9882 - loss: 0.0760 - val_accuracy: 0.8944 - val_loss: 0.4032
Epoch 5/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9972 - loss: 0.0251 - val_accuracy: 0.8820 - val_loss: 0.3921
Epoch 6/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9995 - loss: 0.0145 - val_accuracy: 0.8975 - val_loss: 0.3872
Epoch 7/50
[1m41/41[0m [32m━━━━

In [25]:
# Overall loss and accuracy on the held-out test split.
loss, accuracy = model.evaluate(X_test_tfidf, y_test, verbose=0)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Detailed evaluation: collapse predicted probabilities and one-hot targets to class ids.
y_pred = model.predict(X_test_tfidf)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Pass the full list of class ids explicitly so the report rows always line up
# with label_encoder.classes_. Without `labels`, classification_report raises a
# ValueError when a class is absent from both the test targets and predictions,
# because the number of observed classes no longer matches target_names.
class_ids = np.arange(len(label_encoder.classes_))

print("\nClassification Report:\n")
print(classification_report(
    y_test_classes,
    y_pred_classes,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

Test Accuracy: 89.55%
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

Classification Report:

                                        precision    recall  f1-score   support

                               SMA IPA       0.70      0.89      0.78        44
                               SMA IPS       0.94      0.84      0.89        38
      SMK Agribisnis dan Agroteknologi       0.97      0.97      0.97        35
              SMK Bisnis dan Manajemen       1.00      0.97      0.98        33
           SMK Energi dan Pertambangan       0.93      0.91      0.92        43
                      SMK Kemaritiman        1.00      1.00      1.00        32
    SMK Kesehatan dan Pekerjaan Sosial       0.89      0.89      0.89        28
                        SMK Pariwisata       1.00      0.97      0.99        38
         SMK Seni dan Industri Kreatif       0.87      0.93      0.90        42
SMK Teknologi Informasi dan Komunikasi       0.92      0.64      0.75        3

In [16]:
def preprocess_input_text(text):
    """Clean a single input sentence before it is vectorized for prediction.

    Delegates to ``preprocess_text_nltk`` so that inference applies exactly the
    same cleaning as the training essays, rather than keeping a second copy of
    the same steps that could drift out of sync.

    Args:
        text: Raw input sentence.

    Returns:
        The cleaned, space-joined token string returned by
        ``preprocess_text_nltk``.
    """
    return preprocess_text_nltk(text)

# Example sentence to classify (replace with any other sentence).
new_text = "saya mau belajar matematika"

# Clean the sentence, then vectorize it with the already-fitted TF-IDF vectorizer.
preprocessed_text = preprocess_input_text(new_text)
new_text_tfidf = tfidf.transform([preprocessed_text]).toarray()

# Predict class probabilities for the single input and decode the most likely class id.
predicted_class = model.predict(new_text_tfidf)
best_class_id = np.argmax(predicted_class)
predicted_label = label_encoder.inverse_transform([best_class_id])

# Print the input text and the predicted label.
print(f"Teks Input: {new_text}")
print(f"Prediksi Jurusan: {predicted_label[0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Teks Input: saya mau belajar matematika
Prediksi Jurusan: SMA IPA
