In [1]:
import re
import nltk
import string
import torch
import pickle
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLP resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize Lemmatizer & Stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load dataset ACLSum
dataset = load_dataset("sobamchan/aclsum", split="train")

# Function untuk membersihkan teks
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\d+', '', text)  # Menghapus angka
    text = text.translate(str.maketrans('', '', string.punctuation))  # Menghapus tanda baca
    words = word_tokenize(text)  # Tokenisasi
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    return " ".join(words)

# Preprocessing seluruh dataset
for entry in dataset:
    entry['document'] = clean_text(entry['document'])
    entry['outcome'] = clean_text(entry['outcome'])

# Inisialisasi tokenizer T5
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function untuk tokenisasi dengan padding dan truncation
def preprocess_data(example):
    inputs = tokenizer(example['document'], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(example['outcome'], max_length=150, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Tokenisasi dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Inisialisasi model T5
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Menggunakan TF-IDF Vectorizer sebagai tambahan fitur
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Mengambil 5000 fitur paling penting
tfidf_matrix = tfidf_vectorizer.fit_transform([entry['document'] for entry in dataset])

# Simpan model, tokenizer, dan TF-IDF vectorizer setelah training
model.save_pretrained("saved_preprocessing_model")
tokenizer.save_pretrained("saved_preprocessing_model")

# Simpan TF-IDF vectorizer menggunakan pickle
with open("saved_preprocessing_model/tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(tfidf_vectorizer, file)

print("✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil disimpan di folder 'saved_preprocessing_model/'")


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Map: 100%|██████████| 100/100 [00:00<00:00, 272.25 examples/s]


✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil disimpan di folder 'saved_preprocessing_model/'


In [2]:
import pickle
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer

# Load model yang telah disimpan
model = T5ForConditionalGeneration.from_pretrained("saved_preprocessing_model")
tokenizer = AutoTokenizer.from_pretrained("saved_preprocessing_model")

# Load TF-IDF Vectorizer
with open("saved_preprocessing_model/tfidf_vectorizer.pkl", "rb") as file:
    tfidf_vectorizer = pickle.load(file)

print("✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil dimuat kembali!")


✅ Model, Tokenizer, dan TF-IDF Vectorizer berhasil dimuat kembali!


In [3]:
# Function untuk membuat ringkasan dengan model yang telah disimpan
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Contoh penggunaan dengan dokumen yang sudah diproses
sample_paper = dataset[0]['document']
print("\nOriginal Paper (Cleaned):\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))

# Menggunakan TF-IDF Vectorizer untuk representasi numerik teks
tfidf_features = tfidf_vectorizer.transform([sample_paper])
print("\nTF-IDF Features for Sample Paper:\n", tfidf_features.toarray()[0])



Original Paper (Cleaned):
 In this paper , we explore correlation of dependency relation paths to rank candidate answers in answer extraction . Using the correlation measure , we compare dependency relations of a candidate answer and mapped question phrases in sentence with the corresponding relations in question . Different from previous studies , we propose an approximate phrase mapping algorithm and incorporate the mapping score into the correlation measure . The correlations are further incorporated into a Maximum Entropy-based ranking model which estimates path weights from training . Experimental results show that our method significantly outperforms state-ofthe-art syntactic relation-based methods by up to 20 % in MRR . Answer Extraction is one of basic modules in open domain Question Answering ( QA ) . It is to further process relevant sentences extracted with Passage / Sentence Retrieval and pinpoint exact answers using more linguistic-motivated analysis . Since QA turns to f

In [7]:
# Function untuk membuat ringkasan dengan model yang telah disimpan
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Contoh penggunaan dengan dokumen yang sudah diproses
sample_paper = dataset[10]['document']
print("\nOriginal Paper (Cleaned):\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))

# Menggunakan TF-IDF Vectorizer untuk representasi numerik teks
tfidf_features = tfidf_vectorizer.transform([sample_paper])
print("\nTF-IDF Features for Sample Paper:\n", tfidf_features.toarray()[0])



Original Paper (Cleaned):
 Automatically extracting social meaning and intention from spoken dialogue is an important task for dialogue systems and social computing . We describe a system for detecting elements of interactional style : whether a speaker is awkward , friendly , or flirtatious . We create and use a new spoken corpus of 991 4-minute speed-dates . Participants rated their interlocutors for these elements of style . Using rich dialogue , lexical , and prosodic features , we are able to detect flirtatious , awkward , and friendly styles in noisy natural conversational data with up to 75 % accuracy , compared to a 50 % baseline . We describe simple ways to extract relatively rich dialogue features , and analyze which features performed similarly for men and women and which were gender-specific . How can we extract social meaning from speech , deciding if a speaker is particularly engaged in the conversation , is uncomfortable or awkward , or is particularly friendly and flir