In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from multiprocessing import cpu_count
# Ask multiprocessing for the CPU count and report it.
n_cores = cpu_count()
print('Number of Logical CPU cores: {}'.format(n_cores))

In [None]:
import psutil

# Total memory (RAM) in bytes, as reported by psutil.virtual_memory()
total_memory = psutil.virtual_memory().total

# Convert bytes to gigabytes (dividing by 1024**3) and report with two decimals
total_memory_gb = total_memory / (1024 ** 3)
print("Total RAM: {:.2f} GB".format(total_memory_gb))

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopwords corpus has been downloaded
nltk.download('stopwords')

In [None]:
!pip install transformers datasets scikit-learn

In [None]:
!pip install accelerate torch

In [None]:
!pip install transformers[torch]

In [None]:
import torch
import transformers
import accelerate

# Print the installed version of each library, one per line, in the same
# order as the imports above.
for _lib in (torch, transformers, accelerate):
    print(_lib.__version__)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Directory of the training data, and the full path of the training file in it
path = "/kaggle/input/training-and-validation"
file_path = os.path.join(path, 'training.json')

# The file is read as newline-delimited JSON (one record per line)
df = pd.read_json(path_or_buf=file_path, lines=True)

# Show the first five rows of the DataFrame
df.head(5)

In [None]:
# Settings: n-gram lengths to count and how many of the most frequent to keep
n_gram_range = (3, 4)
top_k = 10

# Turn the text column into a sparse document-by-n-gram count matrix
vectorizer = CountVectorizer(ngram_range=n_gram_range)
X = vectorizer.fit_transform(df['text'])

# Column sums give the corpus-wide frequency of each n-gram; flatten the
# 1 x n_features result into a plain 1-D array aligned with the feature names
n_gram_features = vectorizer.get_feature_names_out()
n_gram_frequencies = np.asarray(X.sum(axis=0)).ravel()

# Tabulate n-gram -> frequency and keep the top-k rows by frequency
n_gram_df = pd.DataFrame(
    {'n_gram': n_gram_features, 'frequency': n_gram_frequencies}
)
top_k_n_grams = n_gram_df.nlargest(n=top_k, columns='frequency')

# Show the table
print(top_k_n_grams)

# The same top-k n-grams as a plain Python list
top_k_n_gram_list = list(top_k_n_grams['n_gram'])
print("Top-k n-grams:", top_k_n_gram_list)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification, TrainingArguments, Trainer

# 1. Split the dataset: hold out 10% of the rows for evaluation, with a fixed
#    random_state so the split is the same on every run.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert the two columns used for training into Hugging Face Datasets.
# preserve_index=False stops the (shuffled) pandas index from being stored as
# an extra `__index_level_0__` column in the Dataset.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['text', 'label']], preserve_index=False)

# 2. Load tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)  # Adjust num_labels for your task


def tokenize_function(examples):
    """Tokenize a batch of examples taken from the 'text' column.

    Texts are truncated to the tokenizer's maximum length but deliberately
    NOT padded here: padding every example to the maximum length wastes
    memory and compute. The Trainer in this notebook is given the tokenizer,
    so batches are padded to their longest member at collation time instead.

    Args:
        examples: a batch mapping that contains a 'text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True)


# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 3. Fine-tune the model
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',                  # Output directory
    evaluation_strategy="epoch",             # Evaluate at the end of every epoch
    per_device_train_batch_size=16,          # Batch size for training
    per_device_eval_batch_size=16,           # Batch size for evaluation
    num_train_epochs=3,                      # Number of training epochs
    weight_decay=0.01,                       # Weight decay strength
    logging_dir='./logs',                    # Directory for logs
    logging_steps=10,                        # Log every 10 steps
    fp16=torch.cuda.is_available(),          # Mixed precision needs CUDA; fall back to full precision on CPU
    gradient_accumulation_steps=2,           # Accumulate gradients for a larger effective batch size
    learning_rate=2e-5,                      # Learning rate
    lr_scheduler_type='linear',              # Learning-rate scheduler
)

# Check if a GPU is available and place the model on it (CPU otherwise)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: the evaluation result handed over by the Trainer; its
            `predictions` (logits) and `label_ids` attributes are read.

    Returns:
        A dict with a single 'accuracy' entry.
    """
    predicted = eval_pred.predictions.argmax(-1)
    return {'accuracy': float(accuracy_score(eval_pred.label_ids, predicted))}


# Build the Trainer. compute_metrics makes each per-epoch evaluation report
# accuracy in addition to the loss.
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset
    tokenizer=tokenizer,                 # Tokenizer used to prepare the datasets
    compute_metrics=compute_metrics,     # Accuracy on the evaluation dataset
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell
trainer.train()

In [None]:
# 4. Evaluate the model on the held-out split
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids

# The predicted class is the index of the largest logit along the last axis
preds = np.argmax(predictions.predictions, axis=-1)

accuracy = accuracy_score(y_true=labels, y_pred=preds)
print('Accuracy: {}'.format(accuracy))

In [None]:
# Write the fine-tuned model and the tokenizer into the same directory
model_save_path = '/kaggle/working/alta2023/'
trainer.save_model(output_dir=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)

In [None]:
# Directory the fine-tuned model was saved to
model_save_path = '/kaggle/working/alta2023/'

# Reload the tokenizer and the model from that directory
tokenizer_saved = AlbertTokenizer.from_pretrained(model_save_path)
model_saved = AlbertForSequenceClassification.from_pretrained(model_save_path, num_labels=2)  # Adjust num_labels for your task
model_saved.eval()  # make inference mode explicit

# Sample input used to smoke-test the reloaded model
text = "Assistant Attorney General Oberdorfer presented the case on behalf of the United States. He was joined on the briefs by former Solicitor General Rankin, Solicitor General Cox, and Harry Baum."

# Preprocess the input text; truncate the same way the training data was truncated
inputs = tokenizer_saved(text, return_tensors='pt', truncation=True)

# Run the model without tracking gradients (inference only)
with torch.no_grad():
    outputs = model_saved(**inputs)

# Take the logits and pick the highest-scoring class
logits = outputs.logits
predictions = logits.argmax(dim=-1)

# Show the prediction
print(f"Prediksi: {predictions.item()}")

In [None]:
# Directory of the test data, and the full path of the test file in it
test_path = "/kaggle/input/test-alta2023"
test_file = os.path.join(test_path, 'test_data.json')

# The file is read as newline-delimited JSON (one record per line)
df_test = pd.read_json(path_or_buf=test_file, lines=True)

# Show the first five rows of the DataFrame
df_test.head(5)

In [None]:
def get_prediction(text):
    """Return the predicted class index for a single text.

    Uses the module-level `tokenizer_saved` and `model_saved`.

    Args:
        text: the input string to classify.

    Returns:
        The index of the highest-scoring class, as a Python int.
    """
    # Truncate exactly as the training data was truncated; without this an
    # input longer than the model's maximum sequence length fails at inference.
    inputs = tokenizer_saved(text, return_tensors='pt', truncation=True)

    # Keep the inputs on the same device as the model.
    inputs = inputs.to(model_saved.device)

    # Inference only: no gradients needed, so do not build the autograd graph.
    with torch.no_grad():
        outputs = model_saved(**inputs)

    # Highest-scoring class along the last (class) axis.
    return outputs.logits.argmax(dim=-1).item()

# Apply the function to the text column and create a new label column
df_test['label'] = df_test['text'].apply(get_prediction)

# Preview the DataFrame with the new label column (rich display of the first
# rows instead of printing the whole frame as text)
df_test.head()

In [None]:
# Keep just the 'id' and 'label' columns, as an independent copy
df_test_1 = df_test.loc[:, ['id', 'label']].copy()
df_test_1.head(5)

In [None]:
import json

In [None]:
import os

# Convert the DataFrame into a list of per-row dictionaries
json_lines = df_test_1.to_dict(orient='records')

# Output file path
output_file_path = '/kaggle/working/answer_tanpa_stopword_removal.json'

# Serialise every record BEFORE opening the file: a record that cannot be
# serialised then raises here instead of leaving a truncated file behind.
payload = ''.join(json.dumps(item) + '\n' for item in json_lines)

try:
    # Write the data in JSON Lines format (one JSON object per line)
    with open(output_file_path, 'w') as f:
        f.write(payload)
except OSError as e:
    # Report the failure, then re-raise so the notebook does not continue as
    # if the file had been written correctly.
    print(f"Terjadi kesalahan saat menulis file: {e}")
    raise
else:
    print(f"Data telah disimpan ke {output_file_path}")

# Verify that the file now exists
if os.path.exists(output_file_path):
    print(f"File {output_file_path} berhasil dibuat.")
else:
    print(f"File {output_file_path} tidak ditemukan.")

In [None]:
# Directory of the answer file written earlier, and the full path of that file
path1 = "/kaggle/working"
file_path1 = os.path.join(path1, 'answer_tanpa_stopword_removal.json')

# Read it back as newline-delimited JSON (one record per line) and display it
df1 = pd.read_json(path_or_buf=file_path1, lines=True)
df1

In [None]:
# Directory of the sample output, and the full path of the sample file in it
path2 = "/kaggle/input/sample"
file_path2 = os.path.join(path2, 'validation_sample_output.json')

# Read it as newline-delimited JSON (one record per line) and display it
df2 = pd.read_json(path_or_buf=file_path2, lines=True)
df2