## Import Libraries

In [1]:
import os
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

## Load Data

In [2]:
# Directory containing the dataset files.
# NOTE(review): this looks like a mounted Google Drive path, but no mount call
# is visible in this notebook — confirm the drive is mounted before running.
path = "/content/drive/MyDrive/AI/ATLA_2023/dataset/"

In [3]:
# Build the full path to the training file
file_path = os.path.join(path, 'training.json')

# Read the newline-delimited JSON file (one record per line) into a DataFrame
df = pd.read_json(file_path, lines=True)

# Show the first few rows of the DataFrame
df.head()

Unnamed: 0,text,label,id
0,Have you ever heard of the Crusades? A time in...,1,0
1,"The professors, who likely have nearly a decad...",1,1
2,Kemba Walker does a good job of defending Foye...,1,2
3,"Ganias' lawyer, Stanley Twardy, urged the gove...",1,3
4,The Circuit Court of Appeals of New Jersey had...,0,4


## Preprocessing

In [4]:
# Stop-word removal is currently disabled (commented out); the helper is kept
# here for reference only.
# def remove_stop_words(text):
#     stop_words = set(stopwords.words('english'))
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)
#
# df['cleaned_text'] = df['text'].apply(remove_stop_words)

# Count every 3-gram and 4-gram that occurs in the raw 'text' column.
n_gram_range = (3, 4)
vectorizer = CountVectorizer(ngram_range=n_gram_range)
X = vectorizer.fit_transform(df['text'])

# Vocabulary entries and their total counts summed over all documents.
n_gram_features = vectorizer.get_feature_names_out()
n_gram_frequencies = X.sum(axis=0).A1

# One row per n-gram with its corpus-wide frequency.
n_gram_df = pd.DataFrame(
    dict(n_gram=n_gram_features, frequency=n_gram_frequencies)
)

# Keep the top_k most frequent n-grams.
top_k = 10
top_k_n_grams = n_gram_df.nlargest(n=top_k, columns='frequency')
print(top_k_n_grams)

# The same top-k n-grams as a plain Python list.
top_k_n_gram_list = top_k_n_grams['n_gram'].tolist()
print("Top-k n-grams:", top_k_n_gram_list)

                      n_gram  frequency
200017      court of appeals       1157
746515     the united states       1015
705222          the court of        945
500507          of the court        895
705225  the court of appeals        886
708762    the district court        785
504941         of the united        548
504946  of the united states        543
727314        the opinion of        540
522890        opinion of the        534
Top-k n-grams: ['court of appeals', 'the united states', 'the court of', 'of the court', 'the court of appeals', 'the district court', 'of the united', 'of the united states', 'the opinion of', 'opinion of the']


## Fine-tune RoBERTa

In [6]:
!pip install transformers==4.40.0

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.40.0


In [7]:
!pip install accelerate torch

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-

In [5]:
import torch
import transformers
import accelerate

# Report the installed versions of the training stack, one per line.
print(torch.__version__, transformers.__version__, accelerate.__version__, sep="\n")

2.3.0+cu121
4.40.0
0.31.0


In [None]:
!pip install datasets

In [8]:
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

# 1. Split the dataset: hold out 10% of the labelled rows (fixed seed).
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert the DataFrames to Hugging Face Datasets, keeping only text and label.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

# 2. Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)  # Adjust num_labels for your task


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padding to max length and truncating."""
    return tokenizer(examples['text'], padding='max_length', truncation=True)


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Detect the GPU *before* building the training arguments: mixed precision
# (fp16) is only valid on an accelerator, and TrainingArguments rejects
# fp16=True on a CPU-only runtime, which would make the CPU fallback below
# unreachable.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# 3. Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',                  # Output directory
    evaluation_strategy="epoch",             # Evaluate at the end of every epoch
    per_device_train_batch_size=16,          # Training batch size per device
    per_device_eval_batch_size=16,           # Evaluation batch size per device
    num_train_epochs=3,                      # Number of training epochs
    weight_decay=0.01,                       # Weight decay strength
    logging_dir='./logs',                    # Directory for logs
    logging_steps=10,                        # Log every 10 steps
    fp16=use_cuda,                           # Mixed precision only when a CUDA GPU is present
    gradient_accumulation_steps=2,           # Accumulate gradients for a larger effective batch
    learning_rate=2e-5,                      # Learning rate
    lr_scheduler_type='linear',              # Learning-rate scheduler
)

# Place the model on the selected device.
model.to(device)

trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=test_dataset,           # Evaluation dataset (the held-out 10% split)
    tokenizer=tokenizer
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/16200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

In [9]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.0705,0.087829
2,0.0414,0.128761


Epoch,Training Loss,Validation Loss
0,0.0705,0.087829
2,0.0247,0.120163


TrainOutput(global_step=1518, training_loss=0.08600282309544965, metrics={'train_runtime': 1820.96, 'train_samples_per_second': 26.689, 'train_steps_per_second': 0.834, 'total_flos': 1.27766728482816e+16, 'train_loss': 0.08600282309544965, 'epoch': 2.9970384995064165})

In [10]:
# Run the fine-tuned model over the held-out split.
predictions = trainer.predict(test_dataset)

# Ground-truth labels, and the highest-scoring class for each example.
labels = predictions.label_ids
preds = predictions.predictions.argmax(axis=-1)

accuracy = accuracy_score(y_true=labels, y_pred=preds)
print(f'Accuracy: {accuracy}')

Accuracy: 0.98


## Save model

In [11]:
# Save the fine-tuned model and its tokenizer into the same directory so both
# can be reloaded from one path.
model_save_path = '/content/sample_data/roberta-without-remove-stopwords/'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/sample_data/roberta-without-remove-stopwords/tokenizer_config.json',
 '/content/sample_data/roberta-without-remove-stopwords/special_tokens_map.json',
 '/content/sample_data/roberta-without-remove-stopwords/vocab.json',
 '/content/sample_data/roberta-without-remove-stopwords/merges.txt',
 '/content/sample_data/roberta-without-remove-stopwords/added_tokens.json')

In [12]:
# Bundle the saved model directory into a single zip archive.
!zip -r /content/sample_data/roberta.zip /content/sample_data/roberta-without-remove-stopwords/

  adding: content/sample_data/roberta-without-remove-stopwords/ (stored 0%)
  adding: content/sample_data/roberta-without-remove-stopwords/training_args.bin (deflated 51%)
  adding: content/sample_data/roberta-without-remove-stopwords/merges.txt (deflated 53%)
  adding: content/sample_data/roberta-without-remove-stopwords/model.safetensors (deflated 10%)
  adding: content/sample_data/roberta-without-remove-stopwords/special_tokens_map.json (deflated 84%)
  adding: content/sample_data/roberta-without-remove-stopwords/vocab.json (deflated 68%)
  adding: content/sample_data/roberta-without-remove-stopwords/tokenizer_config.json (deflated 76%)
  adding: content/sample_data/roberta-without-remove-stopwords/config.json (deflated 51%)


In [13]:
# Directory of the saved model (alternative location, currently unused)
# model_save_path = '/kaggle/working/alta2023/'

# Reload the tokenizer and model from the directory they were saved to
tokenizer_saved = RobertaTokenizer.from_pretrained(model_save_path)
model_saved = RobertaForSequenceClassification.from_pretrained(model_save_path, num_labels=2)  # Adjust num_labels for your task

# Sample text used to smoke-test the reloaded model and tokenizer
text = "Assistant Attorney General Oberdorfer presented the case on behalf of the United States. He was joined on the briefs by former Solicitor General Rankin, Solicitor General Cox, and Harry Baum."

# Tokenize the input text into PyTorch tensors
inputs = tokenizer_saved(text, return_tensors='pt')

# Feed the tokenized input to the model
outputs = model_saved(**inputs)

# Take the logits and pick the highest-scoring class
logits = outputs.logits
predictions = logits.argmax(dim=-1)

# Show the predicted class id
print(f"Prediksi: {predictions.item()}")

Prediksi: 0


In [14]:
# Directory containing the unlabelled test file
test_path = "/content/drive/MyDrive/AI/ATLA_2023/dataset/"
# Build the full path to the test file
test_file = os.path.join(test_path, 'test_data.json')

# Read the newline-delimited JSON file (one record per line) into a DataFrame
df_test = pd.read_json(test_file, lines=True)

# Show the first few rows of the DataFrame
df_test.head()

Unnamed: 0,id,text
0,0,Investigators are now hamstrung by the inabili...
1,1,"[10] Indeed, the District Court found that pe..."
2,2,"""The second object of this legislation is to p..."
3,3,"It is in vain, in a case of this nature, that ..."
4,4,*4 Mr. Justice WAYNE delivered the opinion of ...


In [15]:
def get_prediction(text):
    """Return the predicted class id for a single text.

    The text is tokenized with ``tokenizer_saved`` and scored by
    ``model_saved``; the index of the largest logit is returned.

    Args:
        text: The input string to classify.

    Returns:
        The predicted class index as a Python int.
    """
    # Truncate over-long inputs, as was done when tokenizing the training
    # data; without this, texts beyond the tokenizer's maximum length are
    # passed to the model unshortened.
    inputs = tokenizer_saved(text, return_tensors='pt', truncation=True)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model_saved(**inputs)

    # Index of the highest-scoring class.
    logits = outputs.logits
    prediction = logits.argmax(dim=-1).item()

    return prediction

# Classify every row's text and store the result in a new 'label' column.
df_test['label'] = df_test['text'].map(get_prediction)

# Print the DataFrame including the new label column.
print(df_test)

        id                                               text  label
0        0  Investigators are now hamstrung by the inabili...      1
1        1  [10]  Indeed, the District Court found that pe...      0
2        2  "The second object of this legislation is to p...      0
3        3  It is in vain, in a case of this nature, that ...      0
4        4  *4 Mr. Justice WAYNE delivered the opinion of ...      0
...    ...                                                ...    ...
1995  1995  Pamela is due to finish cutting the second tun...      1
1996  1996  The opinion in that case, when stated, is, tha...      0
1997  1997  A Justice Department statement went no further...      1
1998  1998  Higher Ground Reach the top of the map! Reward...      1
1999  1999  Declaring variables local to their use is not ...      1

[2000 rows x 3 columns]


In [16]:
# Keep only the id and predicted label columns, as an independent copy.
df_test_1 = df_test.loc[:, ['id', 'label']].copy()
df_test_1.head()

Unnamed: 0,id,label
0,0,1
1,1,0
2,2,0
3,3,0
4,4,0


In [17]:
import json
import os

# Convert the DataFrame into a list of per-row dictionaries.
json_lines = df_test_1.to_dict(orient='records')

# Path of the output file.
output_file_path = '/content/drive/MyDrive/AI/ATLA_2023/model_saved/roberta/answer_1.json'

try:
    # Make sure the target directory exists; open() fails if it is missing.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Write the records in JSON-lines format: one JSON object per line.
    with open(output_file_path, 'w') as f:
        for item in json_lines:
            json.dump(item, f)
            f.write('\n')
except Exception as e:
    # Report the failure and skip verification: a file left over from an
    # earlier run would otherwise be reported as a successful write.
    print(f"Terjadi kesalahan saat menulis file: {e}")
else:
    print(f"Data telah disimpan ke {output_file_path}")

    # Verify that the file is present after a successful write.
    if os.path.exists(output_file_path):
        print(f"File {output_file_path} berhasil dibuat.")
    else:
        print(f"File {output_file_path} tidak ditemukan.")

Data telah disimpan ke /content/drive/MyDrive/AI/ATLA_2023/model_saved/roberta/answer_1.json
File /content/drive/MyDrive/AI/ATLA_2023/model_saved/roberta/answer_1.json berhasil dibuat.
