Library imports

In [47]:
!pip install nltk
!pip install spacy
!pip install stanfordnlp



Download the Stanford Core NLP Chinese and English model

Note: You have to manually type 'Y' and press enter to run this cell below


In [48]:
import stanfordnlp
stanfordnlp.download('zh')

Using the default treebank "zh_gsd" for language "zh".
Would you like to download the models for: zh_gsd now? (Y/n)
y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: zh_gsd
Download location: /root/stanfordnlp_resources/zh_gsd_models.zip


100%|██████████| 234M/234M [00:34<00:00, 5.60MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/zh_gsd_models.zip
Extracting models file for: zh_gsd
Cleaning up...Done.


Note: You have to manually type 'Y' and press enter to run this cell below


In [49]:
import stanfordnlp
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [01:01<00:00, 3.72MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


## Setup

Download datasets:

In [0]:
from os.path import exists

if not exists('enzh_data.zip'):
    !wget -O enzh_data.zip https://competitions.codalab.org/my/datasets/download/03e23bd7-8084-4542-997b-6a1ca6dd8a5f
    !unzip enzh_data.zip

Check data downloaded successfully:

In [51]:
with open("./train.enzh.src", "r") as enzh_src:
  print("Source: ",enzh_src.readline())
with open("./train.enzh.mt", "r") as enzh_mt:
  print("Translation: ",enzh_mt.readline())
with open("./train.enzh.scores", "r") as enzh_scores:
  print("Score: ",enzh_scores.readline())

Source:  The last conquistador then rides on with his sword drawn.

Translation:  最后的征服者骑着他的剑继续前进.

Score:  -1.5284005772625449



### English Models Setup

Download English models:

In [52]:
!spacy download en_core_web_md
!spacy link en_core_web_md en300

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')

[38;5;1m✘ Link 'en300' already exists[0m
To overwrite an existing link, use the --force flag



In [0]:
import torchtext
import spacy

nlp_en = spacy.load('en300')

Functions for processing English dataset:

In [54]:
import numpy as np
import torch
from nltk import download
from nltk.corpus import stopwords

#downloading stopwords from the nltk package
download('stopwords') #stopwords dictionary, run once
stop_words_en = set(stopwords.words('english'))

def get_sentence_array(f):
    file = open(f) 
    lines = file.readlines() 
    return lines 

def preprocess_en(sentence, nlp):
    text = sentence.lower()
    doc = [token.lemma_ for token in  nlp.tokenizer(text)]
    doc = [word for word in doc if word not in stop_words_en]
    doc = [word for word in doc if word.isalpha()] #restricts string to alphabetic characters only
    return doc

def get_english_sentences(f, nlp, preprocess=True):
  file = open(f) 
  lines = file.readlines() 
  sentences = []

  for l in lines:
    if preprocess: 
      sentence = preprocess_en(l, nlp)
    else: 
      sentence = [token.lemma_ for token in  nlp.tokenizer(l)]
      sentence = list(filter(None, sentence))
    sentences.append(sentence)

  return sentences


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Chinese Models Setup

Download Chinese stopwords:

In [55]:
!wget -c https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt

--2020-02-27 21:14:21--  https://github.com/Tony607/Chinese_sentiment_analysis/blob/master/data/chinese_stop_words.txt
Resolving github.com (github.com)... 192.30.253.113
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘chinese_stop_words.txt’

chinese_stop_words.     [   <=>              ] 419.55K   480KB/s    in 0.9s    

2020-02-27 21:14:23 (480 KB/s) - ‘chinese_stop_words.txt’ saved [429623]



Functions for processing Chinese dataset:

In [0]:
import string
import jieba
import gensim 
import spacy
import numpy as np

stop_words = [ line.rstrip() for line in open('./chinese_stop_words.txt',"r", encoding="utf-8") ]

def processing_zh(sentence):
  seg_list = jieba.lcut(sentence,cut_all=True)
  doc = [word for word in seg_list if word not in stop_words]
  docs = [e for e in doc if e.isalnum()]
  return docs


def get_chinese_sentences(f, preprocess=True):
  file = open(f) 
  lines = file.readlines() 
  sentences =[]
  for l in lines:
    if preprocess: 
      sent  = processing_zh(l)
    else: 
      sent = jieba.lcut(l,cut_all=True)
      sent = list(filter(None, sent))
    sentences.append(sent)
  return sentences

## Feature Extraction: Complexity / Fluency of Translation

Extract number of tokens in source segment and target segment


In [0]:
def get_num_token(data_set_token):
  num_tokens = []
  for sentence in data_set_token: 
    num_tokens.append(len(sentence))
  return num_tokens

N-gram language model probability of source segment using the source side of the parallel corpus used to train the MT system as LM

In [0]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

def get_lm_prob_src(dataset):

  num_src_prob = []
  model = defaultdict(lambda: defaultdict(lambda: 0))

  # Count frequency of co-occurance and store in dictionary 
  for sentence in dataset:
      for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
          model[(w1, w2)][w3] += 1

  # Transform the counts to probabilities over total count 
  for w1_w2 in model:
      total_count = float(sum(model[w1_w2].values()))
      for w3 in model[w1_w2]:
          model[w1_w2][w3] /= total_count

  # Calculate the trigram estimate of sentence probabilities  
  # p(Sentence) = P(W1, ... Wn)
  for sentence in dataset:
    sentence_prob = 1
    for i in range(len(sentence) - 2):
      sentence_prob = sentence_prob * dict(model[sentence[i], sentence[i + 1]])[sentence[i + 2]]
    num_src_prob.append(sentence_prob)

  return num_src_prob


Load a large corpus of the target language to build the LM

In [59]:
# Load the Chinese Corpus

!wget http://pcai056.informatik.uni-leipzig.de/downloads/corpora/zho-simp-tw_web_2014_10K.tar.gz
!tar -zxvf zho-simp-tw_web_2014_10K.tar.gz

zho_sentence_path = "./zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-sentences.txt"

--2020-02-27 21:14:25--  http://pcai056.informatik.uni-leipzig.de/downloads/corpora/zho-simp-tw_web_2014_10K.tar.gz
Resolving pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)... 139.18.2.216
Connecting to pcai056.informatik.uni-leipzig.de (pcai056.informatik.uni-leipzig.de)|139.18.2.216|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3915364 (3.7M) [application/x-gzip]
Saving to: ‘zho-simp-tw_web_2014_10K.tar.gz.2’


2020-02-27 21:14:28 (1.70 MB/s) - ‘zho-simp-tw_web_2014_10K.tar.gz.2’ saved [3915364/3915364]

zho-simp-tw_web_2014_10K/
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-sources.txt
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-words.txt
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-inv_w.txt
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-sentences.txt
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-import.sql
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_10K-co_n.txt
zho-simp-tw_web_2014_10K/zho-simp-tw_web_2014_

In [0]:
def pre_process_zho_chinese_sentences(f):
  file = open(f) 
  lines = file.readlines() 
  lines = lines[1:]
  sentences = []
  chop_counter = 1
  for i, l in enumerate(lines):
    if i >= pow(10, chop_counter): 
      chop_counter += 1
    processed_sentence = processing_zh(l[(chop_counter + 1):])
    sentences.append(processed_sentence)
  return sentences


Build N Gram Language Model probability of target segment with larger dataset and input mt corpus 

In [0]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

def get_lm_prob_target(dataset):

  processed_sentences = pre_process_zho_chinese_sentences(zho_sentence_path)

  processed_sentences.extend(dataset)

  num_mt_prob = []
  model = defaultdict(lambda: defaultdict(lambda: 0))

  # Count frequency of co-occurance and store in dictionary
  for sentence in processed_sentences:
      for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
          model[(w1, w2)][w3] += 1

  # Transform the counts to probabilities over total count 
  for w1_w2 in model:
      total_count = float(sum(model[w1_w2].values()))
      for w3 in model[w1_w2]:
          model[w1_w2][w3] /= total_count

  # Calculate the trigram estimate of sentence probabilities  
  # p(Sentence) = P(W1, ... Wn)
  for sentence in dataset:
    for i in range(len(sentence) - 2):
      sentence_prob = 1
      sentence_prob = dict(model[sentence[i], sentence[i + 1]])[sentence[i + 2]]
    num_mt_prob.append(sentence_prob)

  return num_mt_prob

# Feature Extraction: Adequacy of Translation

Ratio of number of tokens in source and target segments

In [0]:
def get_ratio_num_token(num_src_tokens, num_mt_tokens):
  ratio_token_src_mt = list(map(lambda src, mt: src / mt, num_src_tokens, num_mt_tokens)) 
  return ratio_token_src_mt

Ratio of brackets and punctuation symbols in source and target segments

In [0]:
import collections as ct

def get_punc_ratio_sent(src_sentence, mt_sentence):

  src_dict = {c:val for c, val in ct.Counter(src_sentence).items() if c in string.punctuation}
  mt_dict = {c:val for c, val in ct.Counter(mt_sentence).items() if c in string.punctuation}

  if sum(mt_dict.values())!= 0: 
    return sum(src_dict.values()) / sum(mt_dict.values()) 
  else:
    return 0

def get_ratio_punc(pro_dataset_src, pro_dataset_mt):
  ratio_punc_src_mt = list(map(get_punc_ratio_sent, pro_dataset_src, pro_dataset_mt))
  return ratio_punc_src_mt

Ratio of open class words, closed class words and other in the source & target segments defined in https://universaldependencies.org/u/pos/

Ratio of percentage of nouns / verbs etc... in the source and target segments

Difference between the depth of the syntactic trees of the source and target segments 

In [0]:
import stanfordnlp
import warnings
from itertools import chain 
from collections import Counter, defaultdict

warnings.filterwarnings("ignore", category=UserWarning)

open_class_words = ["ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"]
closed_class_words = ["ADP", "AUX", "CCONJ", "DET", "NUM", "PART", "PRON", "SCONJ"]
other_words = ["PUNCT", "SYM", "X"]

all_tag_list = list(chain(open_class_words, closed_class_words, other_words))

def get_coarse_tag_count(sentence): 

  sent_open_class_words = 0
  sent_closed_class_words = 0
  sent_other_words = 0

  for word in sentence:
    if word.upos in open_class_words:
      sent_open_class_words += 1
    elif word.upos in closed_class_words:
      sent_closed_class_words += 1
    else: 
      sent_other_words += 1

  return sent_open_class_words, sent_closed_class_words, sent_other_words

def get_fine_tag_count(sentence): 

  tag_dict = {
    "ADJ": 0, 
    "ADV": 0,
    "INTJ": 0, 
    "NOUN": 0, 
    "PROPN": 0, 
    "VERB": 0,
    "ADP": 0, 
    "AUX": 0,
    "CCONJ": 0, 
    "DET": 0, 
    "NUM": 0, 
    "PART": 0, 
    "PRON": 0, 
    "SCONJ": 0,
    "PUNCT": 0, 
    "SYM": 0, 
    "X": 0
  }

  tag_list = [word.upos for word in sentence] 

  sentence_dict = dict(Counter(tag_list))
  
  tag_dict.update(sentence_dict) 
  return tag_dict

def gen_tree(nodes, root):
  tree = defaultdict(dict)

  for child, parent in nodes: 
    tree[parent][child] = tree[child]

  try:
    children, parents = zip(*nodes)
    root = set(parents).difference(children).pop()
  except:
    print(nodes)
    raise NameError('HiThere')

  return {root: tree[root]}

def dict_depth(tree_dict): 
    if isinstance(tree_dict, dict):    
        return 1 + (max(map(dict_depth, tree_dict.values())) 
                                    if tree_dict else 0) 
    return 0

def get_dependency_count(sentence): 

  root_relation = "root"
  nodes = []
  for word in sentence: 
    if word.dependency_relation == 'root':
      root_relation = word.text
    else: 
      nodes.append((word.text, word.governor))
  
  if not nodes: 
    return {}

  tree_dict = gen_tree(nodes, root_relation)
  return tree_dict


def get_tag_dep_ratio(src_sentence, mt_sentence):

  # Process sentences
  src_doc = nlp_stan_en(src_sentence)
  mt_doc = nlp_stan_chinese(mt_sentence)

  # Get dependency counts 
  src_tree_dict = get_dependency_count(src_doc.sentences[0].words)
  mt_tree_dict = get_dependency_count(mt_doc.sentences[0].words) 

  depth_diff = abs(dict_depth(src_tree_dict) - dict_depth(mt_tree_dict))

  # Get course tag ratios
  src_oc_w, src_cc_w, src_o_w = get_coarse_tag_count(src_doc.sentences[0].words)
  mt_oc_w, mt_cc_w, mt_o_w = get_coarse_tag_count(mt_doc.sentences[0].words)

  oc_ratio = src_oc_w / mt_oc_w if mt_oc_w != 0 else 0 
  cc_ratio = src_cc_w / mt_cc_w if mt_cc_w != 0 else 0 
  o_ratio = src_o_w / mt_o_w if mt_o_w != 0 else 0 

  # Get fine tag ratios
  src_tag_dict = get_fine_tag_count(src_doc.sentences[0].words)
  mt_tag_dict = get_fine_tag_count(mt_doc.sentences[0].words)

  ratio_tags_fine_grain_sent = []
  for tag in all_tag_list:
    ratio = src_tag_dict[tag] / mt_tag_dict[tag] if mt_tag_dict[tag] != 0 else 0 
    ratio_tags_fine_grain_sent.append(ratio)

  ratio_dict = {
      "oc_ratio": oc_ratio, 
      "cc_ratio": cc_ratio,
      "o_ratio": o_ratio,
      "ratio_tags_fine_grain_sent": ratio_tags_fine_grain_sent,
      "depth_diff": depth_diff
  }

  return ratio_dict

def get_ratio_tags_depend(raw_dataset_src, raw_dataset_mt):

  ratio_list_of_dict = list(map(get_tag_dep_ratio, raw_dataset_src, raw_dataset_mt))

  ratio_oc_tags = [] 
  ratio_cc_tags = []
  ratio_o_tags = []
  ratio_tags_fine = []
  depth_diff = []

  for ratio_dict in ratio_list_of_dict:
    ratio_oc_tags.append(ratio_dict['oc_ratio'])
    ratio_cc_tags.append(ratio_dict['cc_ratio'])
    ratio_o_tags.append(ratio_dict['o_ratio'])
    ratio_tags_fine.append(ratio_dict['ratio_tags_fine_grain_sent'])
    depth_diff.append(ratio_dict['depth_diff'])

  return ratio_oc_tags, ratio_cc_tags, ratio_o_tags, ratio_tags_fine, depth_diff



# Generate the feature vector

Load the data sets to generate the feature vectors 

In [0]:
import spacy
import torchtext
import numpy as np
from torchtext import data

# Train 
zh_train_mt = get_chinese_sentences("./train.enzh.mt", True)
zh_train_src = get_english_sentences("./train.enzh.src", nlp_en, True)

f_train_scores = open("./train.enzh.scores", 'r')
zh_train_scores = f_train_scores.readlines()

pro_zh_train_mt = get_chinese_sentences("./train.enzh.mt", False)
pro_zh_train_src = get_english_sentences("./train.enzh.src", nlp_en, False)

raw_zh_train_mt = get_sentence_array("./train.enzh.mt")
raw_zh_train_src = get_sentence_array("./train.enzh.src")

# Validation
zh_val_mt = get_chinese_sentences("./dev.enzh.mt")
zh_val_src = get_english_sentences("./dev.enzh.src", nlp_en)

f_val_scores = open("./dev.enzh.scores", 'r')
zh_val_scores = f_val_scores.readlines()

pro_zh_val_mt = get_chinese_sentences("./dev.enzh.mt", False)
pro_zh_val_src = get_english_sentences("./dev.enzh.src", nlp_en, False)

raw_zh_val_mt = get_sentence_array("./dev.enzh.mt")
raw_zh_val_src = get_sentence_array("./dev.enzh.src")


Generate the complexity feature set

In [0]:
# Train 
num_token_src_train = get_num_token(zh_train_src)
# Normalise 
num_token_src_train = (num_token_src_train - np.min(num_token_src_train)) / (np.max(num_token_src_train) - np.min(num_token_src_train))
lm_prob_src_train = get_lm_prob_src(zh_train_src)

# Validation
num_token_src_val = get_num_token(zh_val_src)
lm_prob_src_val = get_lm_prob_src(zh_val_src)

Generate the fluency feature set

In [0]:
# Train 
num_token_mt_train = get_num_token(zh_train_mt)
# Normalise 
num_token_mt_train = (num_token_mt_train - np.min(num_token_mt_train)) / (np.max(num_token_mt_train) - np.min(num_token_mt_train))
lm_prob_mt_train = get_lm_prob_target(zh_train_mt)

# Validation
num_token_mt_val = get_num_token(zh_val_mt)
lm_prob_mt_val = get_lm_prob_target(zh_val_mt)

Generate the adaquecy feature set

In [129]:
# Train 
ratio_token_src_mt_train = get_ratio_num_token(num_token_src_train, num_token_mt_train)
ratio_punc_src_mt_train = get_ratio_punc(pro_zh_train_src, pro_zh_train_mt)

# Validation
ratio_token_src_mt_val = get_ratio_num_token(num_token_src_val, num_token_mt_val)
ratio_punc_src_mt_val = get_ratio_punc(pro_zh_val_src, pro_zh_val_mt)

  


Note: The cell below may take 10 mins to run 

In [0]:
# Train 
ratio_oc_tags_train, ratio_cc_tags_train, ratio_o_tags_train, \
 ratio_tags_fine_train, depth_diff_train =  \
 get_ratio_tags_depend(raw_zh_train_src, raw_zh_train_mt)

# Validation 
ratio_oc_tags_val, ratio_cc_tags_val, ratio_pao_tags_val, \
 ratio_tags_fine_val, depth_diff_val =  \
 get_ratio_tags_depend(raw_zh_val_src, raw_zh_val_mt)

Combine the features to create an single feature vector

In [0]:
def get_feature_vectors(num_token_src, lm_prob_src, 
                        num_token_mt, lm_prob_mt, 
                        ratio_token_src_mt, ratio_punc_src_mt, 
                        ratio_oc_tags, ratio_cc_tags, ratio_o_tags, 
                        ratio_tags_fine, depth_diff):
  feature_vector_list = []
  for i in range(0, len(num_token_src)):
    feature_vector = [
      num_token_src[i],
      lm_prob_src[i],
      num_token_mt[i],
      lm_prob_mt[i],
      ratio_token_src_mt[i],
      ratio_punc_src_mt[i],
      ratio_oc_tags[i],
      ratio_cc_tags[i],
      ratio_o_tags[i],    
      depth_diff[i]          
    ]
    feature_vector.extend(ratio_tags_fine[i])
    feature_vector_list.append(feature_vector)
  return feature_vector_list
  
feature_vector_train = get_feature_vectors(num_token_src_train, lm_prob_src_train, 
                        num_token_mt_train, lm_prob_mt_train, 
                        ratio_token_src_mt_train, ratio_punc_src_mt_train,  
                        ratio_oc_tags_train, ratio_cc_tags_train, ratio_o_tags_train, 
                        ratio_tags_fine_train, depth_diff_train)
                    
feature_vector_val = get_feature_vectors(num_token_src_val, lm_prob_src_val, 
                        num_token_mt_val, lm_prob_mt_val, 
                        ratio_token_src_mt_val, ratio_punc_src_mt_val,  
                        ratio_oc_tags_val, ratio_cc_tags_val, ratio_o_tags_val, 
                        ratio_tags_fine_val, depth_diff_val)

# print(num_token_src_train[0])
# print(lm_prob_src_train[0])

# print(num_token_mt_train[0])
# print(lm_prob_mt_train[0])

# print(ratio_token_src_mt_train[0])
# print(ratio_punc_src_mt_train[0])

# print(ratio_oc_tags_train[0])
# print(ratio_cc_tags_train[0])
# print(ratio_o_tags_train[0])

# print(ratio_tags_fine_train[0])
# print(depth_diff_train[0])

# print(feature_vector_train[0])
# print(feature_vector_val[0])




Create the training and label sets for train and validation

In [0]:
X_train_zh = np.array(feature_vector_train).transpose()
y_train_zh = np.array(zh_train_scores).astype(float)

X_val_zh = np.array(feature_vector_val).transpose()
y_val_zh = np.array(zh_val_scores).astype(float)

In [0]:
import math 

#print(np.where(np.isnan(X_train_zh)))
for count, val in enumerate(X_train_zh):
  if not val.any(): 
    for values in val: 
      if not val: 
        print(values)
    X_train_zh[count] = X_train_zh[0]

for count, val in enumerate(y_train_zh):
  if not val.any(): 
    print(count)


# Support Vector Regression Experiment

In [0]:
import numpy as np

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [169]:
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr

for k in ['linear','poly','rbf','sigmoid']:
    clf_t = SVR(kernel=k)
    clf_t.fit(X_train_zh, y_train_zh)
    print(k)
    predictions = clf_t.predict(X_val_zh)
    pearson = pearsonr(y_val_zh, predictions)
    print(f'RMSE: {rmse(predictions,y_val_zh)} Pearson {pearson[0]}')
    print()

ValueError: ignored

Setup input and predicted outputs:

In [0]:


# X_train_100 = [np.array(zh_train_src_100), np.array(zh_train_mt_100)]
# X_train_zh_100 = np.array(X_train_100).transpose()

# X_val_100 = [np.array(zh_val_src_100),np.array(zh_val_mt_100)]
# X_val_zh_100 = np.array(X_val_100).transpose()

# X_train_300 = [np.array(zh_train_src_300), np.array(zh_train_mt_300)]
# X_train_zh_300 = np.array(X_train_300).transpose()

# X_val_300 = [np.array(zh_val_src_300),np.array(zh_val_mt_300)]
# X_val_zh_300 = np.array(X_val_300).transpose()

# #Scores
# train_scores = np.array(zh_train_scores).astype(float)
# y_train_zh = train_scores

# val_scores = np.array(zh_val_scores).astype(float)
# y_val_zh = val_scores

## Methods

**TODO** e.g. SVM, random forest etc

In [0]:
import os
from google.colab import files
from zipfile import ZipFile

def writeScores(scores):
    fn = "predictions.txt"
    print("")
    with open(fn, 'w') as output_file:
        for idx,x in enumerate(scores):
            #out =  metrics[idx]+":"+str("{0:.2f}".format(x))+"\n"
            #print(out)
            output_file.write(f"{x}\n")


def downloadScores(method_name, scores):
  writeScores(scores)
  with ZipFile(f"en-zh_{method_name}.zip", "w") as newzip:
    newzip.write("predictions.txt")
  
  files.download(f"en-zh_{method_name}.zip")