#### Connecting to the Drive

In [None]:
# Mount Google Drive at /content/gdrive so the project files stored there become reachable
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Switch to the project folder on the mounted Drive; the relative paths in the shell commands below are resolved from here
%cd /content/gdrive/My Drive/Projects/MWP

#### Installing requirements

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [None]:
!pip install -r 'MWP/requirements.txt'

##### Parameters and directories

In [None]:
import os
from pathlib import Path

In [None]:
# Project root on the mounted Drive. Keep the trailing slash: a later cell builds
# paths from it with plain string concatenation.
working_dir = "/content/gdrive/My Drive/Projects/MWP/"
# Folder that holds the input text files and the processed (seeded) outputs
data_dir = os.path.join(working_dir, "data")

#### Prepare dataset

In [None]:
def generate_seeded_dataset(input_file, output_file, ratio):
    """Write a truncated ("seeded") copy of every line of ``input_file``.

    Each input line is split on whitespace and only its first
    ``round(word_count * ratio)`` words are kept, re-joined with single
    spaces. Exactly one output line is written per input line, so the two
    files stay aligned line by line (a blank input line gives a blank
    output line).

    Args:
        input_file: Path of the text file holding the full sentences.
        output_file: Path of the file to create; overwritten if it exists.
        ratio: Fraction of each line's words to keep, e.g. ``0.5``.

    Note:
        Python's built-in ``round`` resolves ties to the nearest even
        number, so with ``ratio=0.5`` a one-word line keeps 0 words and a
        five-word line keeps 2.
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(input_file, 'r', encoding='utf-8') as f_target, \
            open(output_file, 'w', encoding='utf-8') as f_source:
        for target_line in f_target:
            # split() with no argument already ignores leading/trailing whitespace
            words = target_line.split()
            keep = round(len(words) * ratio)
            f_source.write('%s\n' % " ".join(words[:keep]))

In [None]:

# Experiment configuration
experiment = "A"
mwp_type = "simple"  # "simple" or "algebraic"
language = "en"
seed = 0.5  # ratio passed to generate_seeded_dataset: share of each line's words that is kept

# Input corpus and the seeded file derived from it
datain_file = f"{mwp_type}_{language}.txt"
dataout_file = f"target_{experiment}_{seed}_{language}.txt"
input_file = os.path.join(data_dir, datain_file)
output_file = os.path.join(data_dir, language, experiment, str(seed), dataout_file)

# Make sure the folder for the processed data exists before writing into it
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

generate_seeded_dataset(input_file, output_file, seed)

#### Finetune the T5 model for MWP

In [None]:
# Launch the training script in a subprocess (path is relative to the current working directory)
!python 'MWP/train.py'

#### Load saved weights 

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# The tokenizer and the model weights are both read from the same checkpoint folder
model_dir = working_dir + "save_weights/path_to_model/"
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

In [None]:
text = "input text"

# Tokenize the input string, padding/truncating it to exactly 50 tokens and returning a PyTorch tensor
input_ids = tokenizer.encode(text, max_length=50, padding="max_length", return_tensors="pt", truncation=True)

# Generate the output token ids. max_length/min_length are passed as None
# (presumably so the model's own generation defaults apply — TODO confirm).
greedy_output = model.generate(input_ids,max_length=None, min_length=None)

# Decode the first returned sequence back to text, dropping special tokens
print("Output sentence:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

#### Split the data and save the text as .source and .target files

In [None]:
import pandas as pd

In [None]:
# Move into the output folder: split() below writes train/val/test files using names relative to the current directory
%cd '/content/0.25'

In [None]:
# List the current directory's contents as a quick sanity check
%ls

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

def split(file1,file2,train_size=0.8,test_size=0.1):
  """Split two line-aligned text files into train/val/test partitions.

  Line i of ``file1`` (the full text) and line i of ``file2`` (its seed) are
  treated as one pair and always land in the same partition. The partitions
  are written to the current working directory as ``train``, ``val`` and
  ``test`` files: full texts go to ``*.target`` and seeds to ``*.source``.
  The shape of every partition is printed.

  Args:
    file1: Path of the text file with the full sentences, one per line.
    file2: Path of the text file with the matching seeds, one per line.
    train_size: Fraction of all pairs used for training.
    test_size: Fraction of all pairs used for testing. Whatever is left
      after train and test (``1 - train_size - test_size``) becomes the
      validation set.
  """
  # The second split only sees the rows left over after training rows are
  # removed, so express test_size as a fraction of that remainder.
  test_size = test_size/(1-train_size)

  # Context managers close the files promptly; explicit UTF-8 keeps reading
  # independent of the platform's default encoding.
  with open(file1, encoding='utf-8') as original_file:
    original = original_file.read().splitlines()
  with open(file2, encoding='utf-8') as seed_file:
    seed = seed_file.read().splitlines()

  ori_df = pd.DataFrame(original,columns=["Original"])
  seed_df = pd.DataFrame(seed,columns=["Seed"])

  # Fixed random_state keeps the partitions reproducible between runs
  X_train, X_rem, y_train, y_rem = train_test_split(ori_df,seed_df, train_size=train_size, random_state=1)
  X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=test_size,random_state=1)

  partitions = (
    ('train', X_train, y_train),
    ('val', X_valid, y_valid),
    ('test', X_test, y_test),
  )

  for _, targets, sources in partitions:
    print(targets.shape)
    print(sources.shape)

  for name, targets, sources in partitions:
    # np.savetxt encodes as latin1 by default, which fails on text outside
    # Latin-1; write UTF-8 explicitly instead.
    np.savetxt(name + '.target', targets.values, fmt='%s', encoding='utf-8')
    np.savetxt(name + '.source', sources.values, fmt='%s', encoding='utf-8')

In [None]:
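# NOTE: legacy helper kept for reference only; it iterates over `english`, which is not defined in the visible cells.
# def seed_split(file1,file2,seed_size=0.5):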
#   original = open(file1,'w')
#   seed = open(file2,'w')

#   for index,row in english.iterrows():
#     original.write(row[0]+'\n')

#     lst = row[0].split(' ')
#     size = round(int(len(lst)) * seed_size)

#     new_lst = []
#     for p in range(size):
#       new_lst.append(lst[p])
      
#     combined_txt = ' '.join(new_lst)
#     final_text = combined_txt +'\n'
    
#     seed.write(final_text)

#   original.close()
#   seed.close()

In [None]:
# seed_split('A-long.txt','A-seed.txt',0.5)

In [None]:
# Both inputs must be plain-text files with one example per line.
# With these sizes: 40% train, 50% test, and the remaining 10% validation.
split(
    file1='/content/TC-long.txt',
    file2='/content/TC-0.25 seed.txt',
    train_size=0.4,
    test_size=0.5,
)