# Stratechery Translator

## Read Stratechery file

In [1]:
import sys

def get_file_name():
    """Return the HTML file name given as the only command-line argument.

    Reads ``sys.argv``. When the argument count is wrong, or the name does
    not end with ``.html``, a message is printed to stderr and the process
    exits with status 1.

    Returns:
        The file name taken from ``sys.argv[1]``.

    Raises:
        SystemExit: With exit code 1 on either usage error.
    """
    # Exactly one argument is expected after the script name.
    if len(sys.argv) != 2:
        print("使用方法： python3 myscript.py <filename.html>", file=sys.stderr)
        print("範例： python3 myscript.py example.html", file=sys.stderr)
        # Non-zero status so shells and calling scripts can detect the failure;
        # a bare sys.exit() would report success.
        sys.exit(1)

    filename = sys.argv[1]

    # Only names ending in '.html' are accepted.
    if not filename.endswith('.html'):
        print("錯誤：檔案必須是.html檔案", file=sys.stderr)
        sys.exit(1)

    return filename

In [3]:
def read_html_file(file_path):
    """Read the file at *file_path* as UTF-8 text and return its full content."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

In [5]:
# Notebook run: the input path is hard-coded. The commented-out line is the
# command-line variant, which takes the path from sys.argv instead.
# file_path = get_file_name()
file_path = '0504.html'
# Full text of the input file; reused later when the translated article is
# inserted back into the page.
whole_html = read_html_file(file_path)
# print(whole_html)

## Extract article tag from the html

In [None]:
from bs4 import BeautifulSoup

def get_article_tag(html):
    """Return the first ``<article>`` element of *html* as an HTML string.

    Args:
        html: Markup of the whole page.

    Returns:
        The ``<article>`` element converted with ``str()``.

    Raises:
        ValueError: If the parsed document contains no ``<article>`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    article_tag = soup.find('article')

    # find() returns None when nothing matches; str(None) would pass the
    # literal text 'None' down the pipeline as if it were article markup.
    if article_tag is None:
        raise ValueError("no <article> element found in the HTML document")

    return str(article_tag)

In [6]:
# HTML string of the page's <article> element; this is the text that gets
# translated.
article_tag = get_article_tag(whole_html)

# # Test function for testing the insert functionality

# from IPython.display import display, HTML

# with open('.\\0502.html', 'r', encoding='utf-8') as f:
#     html = f.read()
#     output = insert_modified_article_tag(html, translated)
#     display(HTML(output))

## Send the article tag to ChatGPT for translation

### Import openai library and basic functions

In [7]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
# Load variables from a .env file located by find_dotenv(); the return value
# is not needed, hence the throwaway name.
_ = load_dotenv(find_dotenv()) # read local .env file

# API key for the openai module, read from the OPENAI_API_KEY environment
# variable. os.getenv returns None when the variable is not set.
openai.api_key  = os.getenv('OPENAI_API_KEY')

In [8]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send *prompt* as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The ``"content"`` of the first choice's message.
    """
    reply = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # degree of randomness of the model's output
    )
    first_choice = reply.choices[0]
    return first_choice.message["content"]

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0):
    """Send a prepared message list and return the reply text.

    Args:
        messages: Message list passed unchanged to
            ``openai.ChatCompletion.create``.
        model: Model name for the request.
        temperature: Degree of randomness of the model's output.

    Returns:
        The ``"content"`` of the first choice's message.
    """
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    reply = openai.ChatCompletion.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message["content"]

### main functions to interact with chatgpt api to translate the html

In [9]:
def read_string_in_chunks(input_string, chunk_size=3500):
    """Yield consecutive slices of *input_string*, each at most *chunk_size* long.

    Slices are cut purely by character position; the final slice holds
    whatever remains and may be shorter.
    """
    yield from (
        input_string[start:start + chunk_size]
        for start in range(0, len(input_string), chunk_size)
    )

def process_with_chatgpt_api(chunk, chatgpt_api_func):
    """Call *chatgpt_api_func* with *chunk* and return whatever it returns."""
    return chatgpt_api_func(chunk)

In [10]:
def translate_article(article_tag):
    """Translate an article's HTML into zh-hant-TW, one chunk at a time.

    The HTML string is split with ``read_string_in_chunks`` and every chunk
    is sent in its own request through ``get_completion_from_messages``.

    Args:
        article_tag: The article HTML as one string.

    Returns:
        A list holding one reply string per chunk, in chunk order. The list
        is empty when ``article_tag`` is empty.
    """
    chunks = list(read_string_in_chunks(article_tag))
    length = len(chunks)

    translated = []
    for i, chunk in enumerate(chunks):
        # Each chunk gets a fresh conversation; nothing from earlier chunks
        # is carried over. The prompt text is whitespace-sensitive runtime
        # data, so its lines keep their original layout. The numbered steps
        # now run 1-4 (the last step was previously a second "3.").
        messages = [
            {'role':'system', 'content':'You are an technology article professional translater at translating article from English to zh-hant-tw.'},
            {'role':'assistant', 'content':'Ok, I am a professional translator from English to zh-hant-tw.'},
            {'role':'user', 'content':f"""
        You are being provided a part of html code of an article, it is most likely a part of technology column, \
        but some times it will be something other than that, the content of the html is delimited in three backticks below.
        
        The text you translate will be concat to other translated passage, \
        so make sure to output the full text containing original html code, \
        and do not quote in three backticks.
        
        You have to:
        1. Read through the html code, remember, the passage might seems being cut in half, which is totally normal.
        2. Translate the article inside into zh-hant-TW.
        3. Rewrite the translated article to make it more readible for ZH-HANT-TW reader \
        by changing the word or the word sequence.
        4. Output the translated html code without three backticks quoted.
       
        part of the passage: {i+1} / {length}
        content: ```{chunk}```
        """
            },
        ]

        response = get_completion_from_messages(messages, temperature=0)
        translated.append(response)

    return translated

In [11]:
# One reply string per chunk of the article, in order. Each chunk triggers
# one API request.
translated_array = translate_article(article_tag)

## Parse the array (trimming the unnecessary line break and backticks)

### Trim the backticks and linebreaks

In [21]:
def trim_strings(array):
    """Strip code-fence residue from both ends of every string in *array*.

    Leading and trailing backticks and newlines are removed. An opening
    fence that carries an ``html`` language tag (three backticks followed by
    ``html`` on the same line) is removed as a whole, so the tag does not
    remain as a stray first line.

    Args:
        array: Iterable of strings.

    Returns:
        A new list with the trimmed strings, in the same order.
    """
    import re  # local import keeps this cell independent of other cells

    # Optional leading backticks/newlines, then the tagged fence and its
    # line break. Matching is anchored at the very start of the string.
    opening_fence = re.compile(r'\A[`\n]*```html[ \t]*\r?\n', re.IGNORECASE)

    return [
        opening_fence.sub('', s, count=1).strip('`\n')
        for s in array
    ]

In [22]:
# Remove stray backticks and line breaks from the ends of every reply.
trimmed_translated_array = trim_strings(translated_array)
# Printed for manual inspection of the result.
print(trimmed_translated_array)

['<article class="post-10915 post type-post status-publish format-standard hentry category-daily-email topics-ai-machine-learning topics-app-tracking-transparency-att topics-earnings topics-events topics-messaging topics-networks topics-open-source topics-social concepts-advertising concepts-business-models companies-facebook companies-meta" id="post-10915">\n<header class="entry-header">\n<h1 class="entry-title">Facebook收益、生成式人工智慧和消息盈利、開放原始碼和人工智慧</h1>\n<div class="entry-meta">\n<span class="posted-on"><span class="screen-reader-text">發佈於</span><time class="entry-date published" datetime="2023-05-03T05:24:12-07:00">2023年5月3日星期三</time><time class="updated" datetime="2023-05-03T08:58:18-07:00">2023年5月3日星期三</time></span> <span class="byline"><span class="author vcard"><span class="screen-reader-text">作者</span>作者：<a class="url fn n" href="https://stratechery.com/author/stratechery/">本·湯普森</a></span></span> </div><!-- .entry-meta -->\n</header><!-- .entry-header -->\n<div class="entry-conte

### Convert them from Simplified Chinese to Traditional Chinese (just a quick double check)

In [23]:
import opencc

def simplified_to_traditional(simplified_strings):
    """Run every string through an OpenCC converter built from 's2t.json'.

    Args:
        simplified_strings: Iterable of strings to convert.

    Returns:
        A new list with the converted strings, in the same order.
    """
    s2t = opencc.OpenCC('s2t.json')
    return [s2t.convert(text) for text in simplified_strings]

In [24]:
# Pass every trimmed reply through the OpenCC 's2t.json' conversion.
traditional_trimmed_translated_array = simplified_to_traditional(trimmed_translated_array)
# Printed for manual inspection of the result.
print(traditional_trimmed_translated_array)

['<article class="post-10915 post type-post status-publish format-standard hentry category-daily-email topics-ai-machine-learning topics-app-tracking-transparency-att topics-earnings topics-events topics-messaging topics-networks topics-open-source topics-social concepts-advertising concepts-business-models companies-facebook companies-meta" id="post-10915">\n<header class="entry-header">\n<h1 class="entry-title">Facebook收益、生成式人工智慧和消息盈利、開放原始碼和人工智慧</h1>\n<div class="entry-meta">\n<span class="posted-on"><span class="screen-reader-text">發佈於</span><time class="entry-date published" datetime="2023-05-03T05:24:12-07:00">2023年5月3日星期三</time><time class="updated" datetime="2023-05-03T08:58:18-07:00">2023年5月3日星期三</time></span> <span class="byline"><span class="author vcard"><span class="screen-reader-text">作者</span>作者：<a class="url fn n" href="https://stratechery.com/author/stratechery/">本·湯普森</a></span></span> </div><!-- .entry-meta -->\n</header><!-- .entry-header -->\n<div class="entry-conte

### Combine the array into a string

In [17]:
def insert_modified_article_tag(html, modified_article_tag):
    """Replace the first ``<article>`` of *html* with the one in *modified_article_tag*.

    Both arguments are parsed with the 'lxml' parser. The document is
    returned without the replacement when either side yields no usable
    ``<article>`` element.

    Args:
        html: Markup of the whole page.
        modified_article_tag: Markup expected to contain the new ``<article>``.

    Returns:
        The parsed page converted back with ``str()``.
    """
    page = BeautifulSoup(html, 'lxml')

    target = page.find('article')
    if not target:
        return str(page)

    replacement = BeautifulSoup(modified_article_tag, 'lxml').article
    if replacement:
        target.replace_with(replacement)

    return str(page)

In [25]:
# Concatenate the per-chunk translations back into one string, with no
# separator.
translated = ''.join(traditional_trimmed_translated_array)
# Put the translated article in place of the original one in the full page.
translated_html = insert_modified_article_tag(whole_html, translated)

### Write the translated HTML to a file

In [19]:
from datetime import datetime

def write_to_file(text, optional_name=""):
    """Write *text* to a date-stamped HTML file in the current directory.

    The file is named ``<yymmdd>-Stratechery<optional_name>.html``, using
    today's date. It is opened in 'w' mode with UTF-8 encoding.

    Args:
        text: Content to write.
        optional_name: Suffix placed directly after "Stratechery" in the
            file name.
    """
    # Two-digit year, month and day of the current local date.
    stamp = datetime.now().strftime('%y%m%d')
    target = f"{stamp}-Stratechery{optional_name}.html"

    with open(target, mode='w', encoding='utf-8') as out:
        out.write(text)

In [26]:
# Save the translated page. The second argument is appended to the file name
# right after "Stratechery".
# NOTE(review): the suffix looks like a throwaway placeholder - confirm the
# intended output name.
write_to_file(translated_html, 'fjsdalkferwjksldaf')