In [None]:
from google.colab import drive
import pandas as pd
import datetime

drive.mount('drive')

# YouTube Scraping

In [None]:
!pip install youtube_transcript_api scrapetube

In [None]:
target_lang_code = 'th' # or other LRLs such as hi (hindi), id (indo)

In [None]:
import scrapetube
from youtube_transcript_api import YouTubeTranscriptApi
import time

In [None]:
# Channels to scrape: each key is passed as the channel username to
# get_transcript_list_from_channel, and the value is the category label
# attached to every sentence pair collected from that channel.
channel_category = {
    "TED": "Education",
    "kurzgesagt": "Science and Technology",
    #"NickDiGiovanni": "Food",
    "MrBeast": "Entertainment",
    "TEDEd": "Education",
    "mrnigelng": "Comedy",
    "RetiredWorkingForYou": "Vlogging",
    "Bearhugsk": "Vlogging"
}
# Maximum number of videos requested per channel (used as `limit` by the scraping cells).
video_per_channel = 300

In [None]:
def get_transcript_list_from_channel(username, category,id="", sort_by="newest", limit=10):
  """Collect aligned English / target-language sentence pairs from a channel.

  For every video returned by scrapetube, the transcript in the module-level
  `target_lang_code` language and the English transcript are fetched. A video
  is used only when the target-language transcript has exactly one more
  segment than the English one; English segment i is then paired with
  target segment i + 1 (the first target segment is skipped — presumably a
  header line, TODO confirm). Consecutive segments are merged into one entry
  until the English text ends with a full stop.

  Args:
    username: channel username; used only when `id` is empty.
    category: label stored under "category" in every returned entry.
    id: optional channel id; when non-empty it is used instead of `username`.
    sort_by: ordering forwarded to scrapetube.get_channel.
    limit: maximum number of videos forwarded to scrapetube.get_channel.

  Returns:
    A list of dicts with keys "en", "th" (target-language text, whatever
    `target_lang_code` is) and "category".
  """
  print(username)

  if id != "":
    videos = scrapetube.get_channel(id, sort_by=sort_by, limit=limit)
  else:
    videos = scrapetube.get_channel(channel_username=username, sort_by=sort_by, limit=limit)

  def _join(left, right):
    # Merge two fragments with a single space. Plain concatenation (as the
    # code did before) fuses the last word of one segment with the first
    # word of the next.
    return f"{left} {right}" if left and right else left + right

  full_transcript = []

  for video in videos:
    try:
      th = YouTubeTranscriptApi.get_transcript(video['videoId'], languages=[target_lang_code])
      en = YouTubeTranscriptApi.get_transcript(video['videoId'], languages=['en'])
      print(len(th), len(en))
      # Only videos whose transcripts line up segment-for-segment are usable.
      if len(th) != len(en) + 1:
        continue

      data = []
      for i, segment in enumerate(en):
        en_text = segment['text']
        th_text = th[i + 1]['text']
        # endswith() is safe on an empty string, whereas indexing [-1] raised
        # IndexError and made the except below discard the whole video.
        if data and not data[-1]['en'].endswith("."):
          data[-1]['en'] = _join(data[-1]['en'], en_text)
          data[-1]['th'] = _join(data[-1]['th'], th_text)
        else:
          data.append({
              "en": en_text,
              "th": th_text,
              "category": category
          })
      full_transcript += data

    except Exception as e:
      # Best effort: a video without the requested transcripts is skipped.
      print(e)
      continue
  return full_transcript

In [None]:
all_data = []

In [None]:
# Ad-hoc per-channel scrapes, each hitting the network.
# NOTE(review): none of these four results is read by any later cell in the
# visible notebook; the loop over channel_category rebuilds all_data itself.
ted_data = get_transcript_list_from_channel("TED",category="Education",sort_by="popular",limit=video_per_channel)
kurzgesagt_data = get_transcript_list_from_channel("kurzgesagt",sort_by="popular",category="Science and Technology",limit=video_per_channel)
nick_data = get_transcript_list_from_channel("NickDiGiovanni",category="Food",limit=video_per_channel)
# This call addresses the channel by id instead of username and fetches only 2 videos.
bearhug_data = get_transcript_list_from_channel(username="",id="UCOqODGR-AoOTkxB74sc1Xyw",category="Vlogging",limit=2)

In [None]:
# Scrape every configured channel and accumulate the sentence pairs.
all_data = []
for channel, category in channel_category.items():
  all_data += get_transcript_list_from_channel(channel,category=category,limit=video_per_channel)
  # Pause five minutes after each channel (presumably to avoid rate limiting — TODO confirm).
  # This also runs after the last channel.
  time.sleep(300)

df = pd.DataFrame.from_records(all_data)
display(df)

# File name is prefixed with the current day, hour and minute.
title = datetime.datetime.now().strftime("%d%H%M") + "_YUP_channels_raw.csv"

# The default index is written too; a later read_csv sees it as an "Unnamed: 0" column.
df.to_csv(title)
# Copy the CSV into the mounted Drive folder.
!cp "$title" "drive/My Drive/"

In [None]:
# Rebuild and save the raw frame from all_data without re-scraping.
# NOTE(review): this repeats the tail of the previous cell; running both
# writes two CSVs (with different timestamps if the minute has changed).
df = pd.DataFrame.from_records(all_data)
display(df)

title = datetime.datetime.now().strftime("%d%H%M") + "_YUP_channels_raw.csv"

df.to_csv(title)
!cp "$title" "drive/My Drive/"

# Clean DATA

In [None]:
import re
import pandas as pd

your_file = title

In [None]:
# Ordered (regex, replacement) pairs applied one after another by clean_data.
# Order matters: later patterns see the output of earlier ones.
patterns = [
        # Capitalised token immediately followed by a colon (e.g. "TED:", "Tom:", "A:")
        # — presumably speaker labels; TODO confirm.
        (r'[A-Z][A-Z][A-Z]:', ' '),
        (r'[A-Z][a-z]+:', ' '),
        (r'[A-Z][A-Z]:', ' '),
        (r'[A-Z]:', ' '),
        # Same tokens with a space before the colon.
        (r'[A-Z][A-Z][A-Z] :', ' '),
        (r'[A-Z][a-z]+ :', ' '),
        (r'[A-Z][A-Z] :', ' '),
        (r'[A-Z] :', ' '),
        # Same tokens followed by a fullwidth colon.
        (r'[A-Z][A-Z][A-Z]：', ' '),
        (r'[A-Z][A-Z]：', ' '),
        (r'[A-Z]：', ' '),
        # NOTE(review): this removes everything up to a fullwidth ")" even when
        # no opening bracket precedes it — confirm that is intended.
        (r"[^)]*\）", " "),
        # Parenthesised spans, ASCII then fullwidth brackets.
        (r"\([^)]*\)", " "),
        (r"\（[^)]*\）", " "),
        # Every full stop becomes a space (this also splits decimals such as "3.5").
        (r"\.", " "),
        # Remaining punctuation, including ASCII quotes and brackets.
        (r'[-+:!="^\'\[\]\(\)]', " "),
        # Normalise comma spacing.
        (r",", ", "),
        (r" ,", ", "),
        # ASCII quotes were already replaced by the character class above, so
        # these two are no-ops kept for safety; the curly quotes still apply.
        (r"\"", ""),
        (r"\'", ""),
        (r"\”", ""),
        (r"\“", ""),
        # Newlines must become spaces BEFORE runs of spaces are collapsed;
        # in the opposite order "a \nb" ended up as "a  b".
        (r"\n", " "),
        (r"  +", " "),

        # Can add more
]

In [None]:

def clean_data(column_data):
    """Apply every (regex, replacement) pair in `patterns` to a single cell value.

    Null values are returned untouched. Any other non-string value is
    converted with str() first. The result has leading and trailing
    whitespace removed.
    """
    # Missing values pass straight through
    if pd.isnull(column_data):
        return column_data

    # Work on a string representation of the cell
    text = column_data if isinstance(column_data, str) else str(column_data)

    # Substitutions run in list order; each one sees the previous result
    for regex, substitute in patterns:
        text = re.sub(regex, substitute, text)

    return text.strip()

In [None]:
df = pd.read_csv(your_file)

# clean_data
df_cleaned = df.applymap(clean_data)

# Print sample
display(df_cleaned)

# Save

title = datetime.datetime.now().strftime("%d%H%M") + "_YUP_cleaned.csv"

df = pd.DataFrame.from_records(df_cleaned)
df.to_csv(title)
!cp "$title" "drive/My Drive/"

# Dataset Translation

In [None]:
!pip install deep-translator

In [None]:
from deep_translator import GoogleTranslator

In [None]:
df = pd.read_csv(title)
# Drop the index columns left behind by earlier to_csv calls. How many exist
# depends on how often the file was re-saved, so missing ones are ignored
# instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")

In [None]:
# translate

def translate(x):
  """Add an "en-th" entry holding the Thai machine translation of x["en"].

  Args:
    x: a row-like mapping with an "en" entry (a DataFrame row when used via
      DataFrame.apply(axis=1)).

  Returns:
    The same object with x["en-th"] set to the translation, or to None when
    translation fails.
  """
  try:
    x["en-th"] = GoogleTranslator(source='en', target='th').translate(x["en"])
  except Exception as e:
    # Catch Exception only: a bare `except:` also swallowed KeyboardInterrupt,
    # so a long translation run could not be stopped.
    print(f"translation failed: {e}")
    x["en-th"] = None
  return x

In [None]:
# Translate row by row; translate() fills the "en-th" column (None on failure).
df = df.apply(lambda x: translate(x), axis=1)

# File name is prefixed with the current day, hour and minute.
title = datetime.datetime.now().strftime("%d%H%M") + "_YUP_translated.csv"

# The default index is written as well and shows up as "Unnamed: 0" when the file is reloaded.
df.to_csv(title)
!cp "$title" "drive/My Drive/"

In [None]:
!cp "$title" "drive/My Drive/"

# Instruct Mining Experiment

In [None]:
!pip install accelerate transformers -U sentencepiece transformers -qq seqeval -qq datasets -qq evaluate bert_score parascore==1.0.5 pynndescent

In [None]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

In [None]:
# Load model and tokenizer
model_checkpoint = "google/mt5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# The tokenizer must come from the same checkpoint as the model. The GPT-2
# tokenizer used before has a different vocabulary, so its token ids do not
# correspond to mT5's embeddings and generated ids cannot be decoded with it.
# (`return_tensors` is a call-time option, not a from_pretrained one.)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
dataset = load_dataset("pichykh/YUP_Parallel")['train']
dataset

In [None]:
# Sample random subsets

subset_sizes = 1000
n_subsets = 8
# Never ask for more rows than the dataset holds.
subset_len = min(subset_sizes, len(dataset))
subsets = []
for i in range(n_subsets):
  # A different seed per subset: with a constant seed=42 every shuffle
  # produced the same order, so all subsets were identical.
  subset = dataset.shuffle(seed=42 + i).select(range(subset_len))
  subsets.append(subset)

In [None]:
from evaluate import load

bertscore = load("bertscore")

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with BERTScore (lang="th").

    Args:
        eval_preds: a (predictions, labels) pair; predictions may be a tuple,
            in which case only its first element is used.

    Returns:
        A dict with a single key "precision" holding the "precision" entry of
        the BERTScore result.
    """
    predictions, label_ids = eval_preds
    # The model may return more than the prediction ids; keep the first item
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 cannot be decoded, so swap it for the pad token id first
    decodable_labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    # Strip whitespace; each reference is wrapped in its own one-element list
    scores = bertscore.compute(
        predictions=[text.strip() for text in pred_texts],
        references=[[text.strip()] for text in label_texts],
        lang="th",
    )
    return {"precision": scores["precision"]}

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from parascore import ParaScorer
import pynndescent

In [None]:
# Compute Indicators

def compute_indicators(input_dataset, result_df, embbeded, pad_length=32):
  """Add the 'para' and 'knn' indicator columns to `result_df` in place.

  Args:
    input_dataset: mapping/dataset with 'en-th' (candidate) and 'th' (source) columns.
    result_df: DataFrame that receives the two indicator columns.
    embbeded: tokenized dataset whose 'input_ids' sequences are used as
      feature vectors for the nearest-neighbour index.
    pad_length: length every 'input_ids' sequence is zero-padded to; it must be
      at least the longest sequence (defaults to 32, the previous hard-coded value).
  """
  # ParaScorer
  scorer = ParaScorer(lang="th", model_type = 'bert-base-uncased')
  cands = [str(input_str) for input_str in input_dataset['en-th']]
  sources = [str(input_str) for input_str in input_dataset['th']]
  score = scorer.free_score(cands, sources)

  # Write to the frame that was passed in. This used to assign to the global
  # `df`, which only worked when the caller happened to pass that same object.
  # NOTE(review): only element 0 of the returned scores is stored, as before —
  # confirm the return shape of free_score.
  result_df['para'] = score[0]

  # KNN
  # Right-pad every token-id sequence with zeros so they form a rectangular array.
  input_data = np.array([
    np.pad(np.array(e), pad_width=(0, pad_length - len(e)), constant_values=0)
    for e in embbeded['input_ids']
  ])

  index = pynndescent.NNDescent(input_data)
  # Column 6 of the second neighbor_graph array (the distances, per the pynndescent docs).
  result_df['knn'] = index.neighbor_graph[1][:, 6]

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,Seq2SeqTrainer

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Seq2seq batch collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Token cap used by preprocess_function and passed to trainer.evaluate.
max_length = 32
# One dict per subset is appended here by the training loop in this cell.
results = []
def preprocess_function(dataset):
  """Tokenize a batch: 'en-th' values are the inputs, 'th' values the targets.

  Every value is converted with str() before tokenization, and both sides
  are truncated to the module-level `max_length`.
  """
  source_texts = list(map(str, dataset['en-th']))
  target_texts = list(map(str, dataset['th']))

  return tokenizer(
    source_texts,
    text_target=target_texts,
    max_length=max_length,
    truncation=True,
  )


# Training configuration shared by every per-subset fine-tuning run.
args = Seq2SeqTrainingArguments(
    evaluation_strategy="no",  # evaluation is triggered explicitly via trainer.evaluate
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # needed so compute_metrics receives generated token ids
    output_dir="/results"  # NOTE(review): absolute path at the filesystem root — confirm intended
)

# For each subset: fine-tune, score the validation split, then regress the
# per-sample BERTScore precision on the 'knn' and 'para' indicators.
for subset in subsets:
  # 90/10 split with a fixed seed so repeated runs see the same validation rows.
  split_datasets = subset.train_test_split(train_size=0.9, seed=42)
  split_datasets["validation"] = split_datasets.pop("test")

  tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
  )

  # Per-sample indicator table for this subset (module-level name `df`).
  df = pd.DataFrame()

  # Start every run from the pretrained checkpoint. Reusing one model object
  # meant each subset continued training from the weights left by the
  # previous subsets, so the per-subset scores were not comparable.
  subset_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

  trainer = Seq2SeqTrainer(
    subset_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  #finetune
  trainer.train()

  # compute_metrics returns the per-sample precision list, exposed as 'eval_precision'.
  df['bertscore'] = trainer.evaluate(max_length=max_length)['eval_precision']
  compute_indicators(split_datasets['validation'],df,tokenized_datasets['validation'])

  X = df[['knn','para']]
  y = df['bertscore']

  reg = LinearRegression().fit(X, y)
  # Record model performance (R^2 on the same data the regression was fitted on)
  r2 = reg.score(X, y)
  coeff = reg.coef_
  results.append({
    'subset': subset,
    'r2': r2,
    'coeff': coeff
  })

  # Drop references so memory can be reclaimed before the next subset.
  del trainer
  del subset_model
  del tokenized_datasets
  del split_datasets


The ParaScore and KNN indicators do not appear to be significant predictors of dataset quality.

More data and further experiments are required.

# Augment Dataset (optional)

In [None]:
! pip install openai

In [None]:
import openai
import pandas as pd
import re
import time

from getpass import getpass

# Read the key without echoing it: input() would leave the typed secret
# visible in the saved cell output.
openai_key = getpass('openai API key')

In [None]:
def generate_paraphrases(api_key, input_csv_path, output_csv_path, num_rows=99, chunk_size=3):
    """Add two GPT-generated Thai paraphrases ("GPT1", "GPT2") to a CSV.

    Reads `input_csv_path`, keeps the first `num_rows` rows without missing
    values, requests paraphrases of the "th" column one row at a time, writes
    the result to `output_csv_path` and copies it to the Drive folder.

    Args:
        api_key: OpenAI API key.
        input_csv_path: CSV containing a "th" column.
        output_csv_path: where the augmented CSV is written (no index column).
        num_rows: number of leading rows to consider before dropping NaNs.
        chunk_size: rows are walked in slices of this size.

    NOTE(review): this uses the legacy `openai.Completion` interface and the
    "text-davinci-003" engine — confirm both are still available in the
    installed client.
    """
    import shutil  # local import: only needed for the final Drive copy

    # Setting
    openai.api_key = api_key
    column_name = "th"
    num_paraphrases = 2
    # Shared by the first attempt and the retry. The retry used to omit
    # max_tokens, falling back to the API default.
    request_kwargs = {"engine": "text-davinci-003", "max_tokens": 3000}
    x = 0

    # Load CSV
    df = pd.read_csv(input_csv_path)
    df = df.head(num_rows).dropna()  # Process only the first `num_rows` rows

    # Add new columns for paraphrases
    df["GPT1"] = ""
    df["GPT2"] = ""

    # Process data in chunks
    for chunk_start in range(0, len(df), chunk_size):
        chunk_df = df.iloc[chunk_start:chunk_start + chunk_size]

        for index, row in chunk_df.iterrows():
            thai_word = row[column_name]

            # Prompt
            prompt = f'Generate {num_paraphrases} paraphrases with the same meaning for the Thai word "{thai_word}" in Thai language without changing any meaning'

            # Request, with a single retry after a rate-limit error
            try:
                response = openai.Completion.create(prompt=prompt, **request_kwargs)
            except openai.error.RateLimitError:
                print(f"Rate limit reached. Waiting for 60 seconds before retrying.")
                time.sleep(60)
                response = openai.Completion.create(prompt=prompt, **request_kwargs)

            # Get paraphrases: one per non-empty line of the first choice
            paraphrases = []
            if "choices" in response and response["choices"]:
                paraphrases = [
                    phrase.strip()
                    for phrase in response["choices"][0]["text"].split("\n")
                    if phrase.strip()
                ]

            # Strip only a leading "1. " style enumeration. The unanchored
            # pattern also removed digits inside the sentence ("3.5" -> "5").
            cleaned_paraphrases = [
                re.sub(r"^\d+\.\s*", "", phrase) for phrase in paraphrases
            ]

            # Save paraphrases in the new columns
            if len(cleaned_paraphrases) >= 2:
                df.at[index, "GPT1"] = cleaned_paraphrases[0]
                df.at[index, "GPT2"] = cleaned_paraphrases[1]

            # Pause for a moment before the next iteration
            time.sleep(20) # (RPM): Limit 3
            x += 1
            print(f"เสร็จแล้วนิดนึง({x})")

    # Save to CSV
    df.to_csv(output_csv_path, index=False)
    # Copy the file that was just written. The former `!cp "$title"` copied
    # whatever the global `title` pointed at and is not valid inside plain Python.
    try:
        shutil.copy(output_csv_path, "drive/My Drive/")
    except OSError as e:
        print(f"Could not copy {output_csv_path} to Drive: {e}")
    print("เสร็จแล้วโว้ยยยย")


In [None]:
api_key = openai_key
# The current `title` becomes the input; `title` is then re-pointed at a new
# timestamped output file.
# NOTE(review): not idempotent — re-running this cell turns the previous
# output path into the new input path.
input_csv_path = title
title = datetime.datetime.now().strftime("%d%H%M") + "_YUP_translated_GPT.csv"
output_csv_path = title
# NOTE(review): this value is not passed on; the generate_paraphrases call in
# the next cell hard-codes num_rows=54.
num_rows = 100

In [None]:
generate_paraphrases(api_key, input_csv_path, output_csv_path, num_rows=54, chunk_size=3)

# Fine tuning

In [None]:
!pip install datasets
!pip install sentence-transformers

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from sentence_transformers import SentenceTransformer, models
from sentence_transformers import InputExample
from torch.utils.data import DataLoader
from sentence_transformers import losses

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
df = pd.read_csv(title)
# Drop the saved index column if present; errors="ignore" keeps the cell
# working when the CSV was written without an index.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df = df.dropna() # Drop rows containing NaN values

In [None]:
# Encode the category label as an integer (Education -> 0, Science and Technology -> 1).
# NOTE(review): Series.map yields NaN for any label missing from the dict
# (the scraping config also uses "Entertainment", "Comedy" and "Vlogging"),
# and re-running this cell maps the already-encoded integers to NaN.
category_mapping = {'Education': 0, 'Science and Technology': 1}
df['category'] = df['category'].map(category_mapping)

In [None]:
#Implement dataset from dataframe to DatasetDict
dataset = Dataset.from_pandas(df)
dataset_dict = DatasetDict({'train': dataset})

In [None]:
from sentence_transformers import InputExample
# Two training pairs are built per dataset row:
#   (en, th)    - English sentence with its Thai counterpart
#   (th, en-th) - Thai sentence with the Thai translation of the English text
train_data = dataset_dict['train']
n_examples = train_data.num_rows # number of training rows

train_examples = []
for i in range(n_examples):
  example = train_data[i]
  train_examples.extend((
    InputExample(texts=[example['en'], example['th']]),
    InputExample(texts=[example['th'], example['en-th']]),
  ))

In [None]:
train_dataloader = DataLoader(train_examples,shuffle = True, batch_size = 64) # Load data by DataLoader
train_loss = losses.MultipleNegativesRankingLoss(model=model) #Using loss function

In [None]:
num_epochs = 3
# Warm-up spans every training step: batches per epoch * epochs * 1.0.
# NOTE(review): a 100% warm-up presumably means the learning rate is still
# ramping up when training ends — confirm this is intended.
warmup_steps = int(len(train_dataloader) * num_epochs * 1) # 1 means 100%

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)],epochs=num_epochs,warmup_steps=warmup_steps) #fit model

In [None]:
from datasets import load_dataset
# load dataset for evaluate
Evaluate_dataset = load_dataset("Patt/copa_th")

In [None]:
Evaluate_dataset['train'] # Attribute in dataset

In [None]:
# Build [premise, premise_th, score_premise] triples from the first rows of
# the evaluation set's train split.
train_eva_data = Evaluate_dataset['train']

n_examples = 100 # number of rows used for evaluation

train_eva_examples = []
for i in range(n_examples):
    example = train_eva_data[i]
    # English premise, Thai premise, reference similarity score
    example_inside = [
        example['premise'],
        example['premise_th'],
        example['score_premise'],
    ]
    train_eva_examples.append(example_inside)

In [None]:
train_eva_examples[0] # This is the characteristic of this variable

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Evaluate our fine-tuned model
overall = 0 # running sum of |predicted similarity - expected score|
for j in range(len(train_eva_examples)):
    # Each entry is [premise, premise_th, score_premise]. Encode only the two
    # sentences; passing the whole entry also embedded the numeric score as
    # if it were a third sentence.
    sen_embedding = model.encode(train_eva_examples[j][:2])
    compare = cosine_similarity(sen_embedding[0].reshape(1,-1),sen_embedding[1].reshape(1,-1))[0][0] # cosine similarity between the en and th embeddings
    score_premise = train_eva_examples[j][2] # expected similarity score
    different = abs(compare - score_premise) # absolute error for this pair
    overall = overall + different
    print(f"This is the result of {j+1} data: {compare} this is predict expected: {score_premise} different around: {different}\n")

In [None]:
print(f"The total difference between prediction and expected will be: {overall}")
# The total of difference between our prediction and exact value for our YUP dataset

In [None]:
modelB = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2') #model when it's still not fine-tune by our dataset

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Evaluate pre-trained model
overall_normal = 0 # running sum of |predicted similarity - expected score|
for j in range(len(train_eva_examples)):
    # Each entry is [premise, premise_th, score_premise]. Encode only the two
    # sentences; passing the whole entry also embedded the numeric score as
    # if it were a third sentence.
    sen_embedding = modelB.encode(train_eva_examples[j][:2])
    compare = cosine_similarity(sen_embedding[0].reshape(1,-1),sen_embedding[1].reshape(1,-1))[0][0] # cosine similarity between the en and th embeddings
    score_premise = train_eva_examples[j][2] # expected similarity score
    different = abs(compare - score_premise) # absolute error for this pair
    overall_normal = overall_normal + different
    print(f"This is the result of {j+1} data: {compare} this is predict expected: {score_premise} different around: {different}\n")

In [None]:
print(f"The total difference between prediction and expected will be: {overall_normal}")
# The total of difference between pre-trained model prediction and exact value without fine-tuned with our model

In [None]:
import torch
# Relative path: the file is written to the current working directory.
save_path = 'model.pth'
torch.save(model.state_dict(), save_path) # Save the state dict of the model fine-tuned on our dataset