# Stratechery Translater

## Read Stratechery file

In [None]:
import sys

def get_file_name():
    # Check the number of command-line arguments
    if len(sys.argv) != 2:
        print("使用方法： python3 myscript.py <filename.html>")
        print("範例： python3 myscript.py example.html")
        sys.exit()

    # Get the first command-line argument
    filename = sys.argv[1]

    # Check if the argument is an HTML file
    if not filename.endswith('.html'):
        print("錯誤：檔案必須是.html檔案")
        sys.exit()

    return filename

In [None]:
def read_html_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as post_file:
        whole_html = post_file.read()
    return whole_html

In [None]:
# file_path = get_file_name()
file_path = '0504.html'
whole_html = read_html_file(file_path)
# print(whole_html)

## Extract article tag from the html

In [None]:
from bs4 import BeautifulSoup

def get_article_tag(html):
    soup = BeautifulSoup(html, 'lxml')

    article_tag = soup.find('article')

    return str(article_tag)

In [None]:
article_tag = get_article_tag(whole_html)

# # Test function for testing the insert functionality

# from IPython.display import display, HTML

# with open('.\\0502.html', 'r', encoding='utf-8') as f:
#     html = f.read()
#     output = insert_modified_article_tag(html, translated)
#     display(HTML(output))

## Apply the article tag to chatGPT to translate

### Import openai library and basic functions

In [None]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAI_API_KEY')

In [None]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model's output
    )
#     print(str(response.choices[0].message))
    return response.choices[0].message["content"]

### main functions to interact with chatgpt api to translate the html

In [None]:
def read_string_in_chunks(input_string, chunk_size=3500):
    for i in range(0, len(input_string), chunk_size):
        yield input_string[i:i + chunk_size]

def process_with_chatgpt_api(chunk, chatgpt_api_func):
    # Here you would call your chatgpt api function with the chunk as input.
    response = chatgpt_api_func(chunk)
    return response

In [None]:
import threading
import time
from typing import List
from queue import Queue

def translate_article(article_tag: str, num_threads: int) -> List[str]:
    chunks = list(read_string_in_chunks(article_tag))
    length = len(chunks)

    translated = [None] * length
    q = Queue()
    progress = {i: 0 for i in range(length)}

    def worker():
        while True:
            index, chunk = q.get()
            if chunk is None:
                break
            translated[index] = translate_chunk(chunk, index, length, progress)
            q.task_done()

    threads = []
    for _ in range(num_threads):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)

    for i, chunk in enumerate(chunks):
        q.put((i, chunk))

    progress_thread = threading.Thread(target=progress_tracker, args=(article_tag, num_threads, progress))
    progress_thread.start()

    q.join()

    for _ in threads:
        q.put((None, None))

    for t in threads:
        t.join()

    progress_thread.join()

    return translated

def translate_chunk(chunk: str, index: int, total: int, progress: dict) -> str:
    messages =  [  
    {'role':'system', 'content':'You are an technology article professional translater at translating article from English to zh-hant-tw.'},
    {'role':'assistant', 'content':'Ok, I am a professional translator from English to zh-hant-tw.'}
    ]
    
    messages.append({'role':'user', 'content':f"""
    You are being provided a part of html code of an article, it is most likely a part of technology column, \
    but some times it will be something other than that, the content of the html is delimited in three backticks below.
    
    The text you translate will be concat to other translated passage, \
    so make sure to output the full text containing original html code, \
    and do not quote in three backticks.
    
    You have to:
    1. Read through the html code, remember, the passage might seems being cut in half, which is totally normal.
    2. Translate the article inside into zh-hant-TW.
    3. Rewrite the translated article to make it more readible for ZH-HANT-TW reader \
    by changing the word or the word sequence.
    3. Output the translated html code without three backticks quoted.
    
    part of the passage: {index+1} / {total}
    content: ```{chunk}```
    """
    })
    
    response = get_completion_from_messages(messages, temperature=0)
    progress[index] = 100

    return response

def read_string_in_chunks(input_string: str, chunk_size: int = 3500) -> str:
    for i in range(0, len(input_string), chunk_size):
        yield input_string[i:i + chunk_size]

def progress_tracker(article_tag: str, num_threads: int, progress: dict):
    total_chars = len(article_tag)
    char_per_minute = 3500
    estimated_time = (total_chars / char_per_minute) * 60 / num_threads
    
    print(f"Estimated time to complete: {estimated_time:.2f} seconds")
    print(f"Total token spent: {total_chars}")
    
    start_time = time.time()
    while sum(progress.values()) < len(progress) * 100:
        time.sleep(10)
        elapsed_time = time.time() - start_time
        completion_percentage = sum(progress.values()) / len(progress)
        print(f"Current progress: {completion_percentage:.2f}%")
        print(f"Time spent: {elapsed_time:.2f} seconds")

In [22]:
num_threads = 10
translated_article_chunks = translate_article(article_tag, num_threads)
translated_article = "".join(translated_article_chunks)

Estimated time to complete: 44.38 seconds
Total token spent: 25887
Current progress: 0.00%
Time spent: 10.01 seconds
Current progress: 12.50%
Time spent: 20.01 seconds
Current progress: 25.00%
Time spent: 30.03 seconds
Current progress: 37.50%
Time spent: 40.04 seconds
Current progress: 37.50%
Time spent: 50.05 seconds
Current progress: 37.50%
Time spent: 60.06 seconds
Current progress: 37.50%
Time spent: 70.06 seconds
Current progress: 37.50%
Time spent: 80.06 seconds
Current progress: 75.00%
Time spent: 90.08 seconds
Current progress: 100.00%
Time spent: 100.09 seconds


## Parse the array (trimming the unnecessary line break and backticks)

### Trim the backticks and linebreaks

In [23]:
def trim_strings(array):
    return [
        s.lstrip('`\n').rstrip('`\n')
        for s in array
    ]

In [24]:
trimmed_translated_array = trim_strings(translated)
print(trimmed_translated_array)

['<', 'a', 'r', 't', 'i', 'c', 'l', 'e', ' ', 'c', 'l', 'a', 's', 's', '=', '"', 'p', 'o', 's', 't', '-', '1', '0', '9', '1', '5', ' ', 'p', 'o', 's', 't', ' ', 't', 'y', 'p', 'e', '-', 'p', 'o', 's', 't', ' ', 's', 't', 'a', 't', 'u', 's', '-', 'p', 'u', 'b', 'l', 'i', 's', 'h', ' ', 'f', 'o', 'r', 'm', 'a', 't', '-', 's', 't', 'a', 'n', 'd', 'a', 'r', 'd', ' ', 'h', 'e', 'n', 't', 'r', 'y', ' ', 'c', 'a', 't', 'e', 'g', 'o', 'r', 'y', '-', 'd', 'a', 'i', 'l', 'y', '-', 'e', 'm', 'a', 'i', 'l', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'a', 'i', '-', 'm', 'a', 'c', 'h', 'i', 'n', 'e', '-', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'a', 'p', 'p', '-', 't', 'r', 'a', 'c', 'k', 'i', 'n', 'g', '-', 't', 'r', 'a', 'n', 's', 'p', 'a', 'r', 'e', 'n', 'c', 'y', '-', 'a', 't', 't', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'e', 'a', 'r', 'n', 'i', 'n', 'g', 's', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'e', 'v', 'e', 'n', 't', 's', ' ', 't', 'o', 'p', 'i', 'c',

### Transcribe them form simplified chinese to Traditional (just a quick double check)

In [25]:
import opencc

def simplified_to_traditional(simplified_strings):
    converter = opencc.OpenCC('s2t.json')
    traditional_strings = []

    for string in simplified_strings:
        traditional_string = converter.convert(string)
        traditional_strings.append(traditional_string)

    return traditional_strings

In [26]:
traditional_trimmed_translated_array = simplified_to_traditional(trimmed_translated_array)
print(traditional_trimmed_translated_array)

['<', 'a', 'r', 't', 'i', 'c', 'l', 'e', ' ', 'c', 'l', 'a', 's', 's', '=', '"', 'p', 'o', 's', 't', '-', '1', '0', '9', '1', '5', ' ', 'p', 'o', 's', 't', ' ', 't', 'y', 'p', 'e', '-', 'p', 'o', 's', 't', ' ', 's', 't', 'a', 't', 'u', 's', '-', 'p', 'u', 'b', 'l', 'i', 's', 'h', ' ', 'f', 'o', 'r', 'm', 'a', 't', '-', 's', 't', 'a', 'n', 'd', 'a', 'r', 'd', ' ', 'h', 'e', 'n', 't', 'r', 'y', ' ', 'c', 'a', 't', 'e', 'g', 'o', 'r', 'y', '-', 'd', 'a', 'i', 'l', 'y', '-', 'e', 'm', 'a', 'i', 'l', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'a', 'i', '-', 'm', 'a', 'c', 'h', 'i', 'n', 'e', '-', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'a', 'p', 'p', '-', 't', 'r', 'a', 'c', 'k', 'i', 'n', 'g', '-', 't', 'r', 'a', 'n', 's', 'p', 'a', 'r', 'e', 'n', 'c', 'y', '-', 'a', 't', 't', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'e', 'a', 'r', 'n', 'i', 'n', 'g', 's', ' ', 't', 'o', 'p', 'i', 'c', 's', '-', 'e', 'v', 'e', 'n', 't', 's', ' ', 't', 'o', 'p', 'i', 'c',

### Combine the array into a string

In [27]:
def insert_modified_article_tag(html, modified_article_tag):
    soup = BeautifulSoup(html, 'lxml')

    original_article_tag = soup.find('article')
    if original_article_tag:
        new_article_tag = BeautifulSoup(modified_article_tag, 'lxml').article
        if new_article_tag:
            original_article_tag.replace_with(new_article_tag)

    return str(soup)

In [28]:
translated = ''.join(traditional_trimmed_translated_array)
translated_html = insert_modified_article_tag(whole_html, translated)

### Output the function to a file

In [29]:
from datetime import datetime

def write_to_file(text, optional_name=""):
    
    # Get the current date
    current_date = datetime.now()

    # Format the date as a string in the format "yymmdd"
    date_string = current_date.strftime('%y%m%d')

    # Create the filename
    filename = f"{date_string}-Stratechery{optional_name}.html"

    # Write the text to the file
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)      

In [30]:
write_to_file(translated_html, 'fjsdalkferwjksldaf')