In [1]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()

ImportError: cannot import name 'load_dataset' from 'datasets' (C:\Users\Admin\anaconda3\lib\site-packages\datasets\__init__.py)

In [2]:
# Checkpoint name passed to `from_pretrained`, and the local file whose
# state dict is loaded into the model later in the notebook.
model_repo, model_path = 'google/mt5-base', 'mt5_translation.pt'

# Sequence-length setting. NOTE(review): not referenced by the cells visible
# here (translator uses model.config.max_length instead) — confirm it is still needed.
max_seq_len = 20

In [3]:
# Load the tokenizer for the `model_repo` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_repo)



In [4]:
# Load the sequence-to-sequence model for the `model_repo` checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)

In [5]:
# Language code -> tag string that is prepended to the input text to select
# the target language. Keep the insertion order stable: the values are
# registered as special tokens in this order.
LANG_TOKEN_MAPPING = dict(en='<en>', fi='<fi>')

In [6]:
# Register the language tags as additional special tokens on the tokenizer.
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
# Resize the model's token embeddings to the tokenizer's new length.
# This must run before the saved state dict is loaded into the model.
model.resize_token_embeddings(len(tokenizer))

Embedding(250102, 768)

In [7]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  """Encode ``text`` for the model, prefixed with the target-language tag.

  Args:
    text: Source sentence to encode.
    target_lang: Key into ``lang_token_map``; a missing key raises KeyError.
    tokenizer: Tokenizer whose ``encode`` method is called.
    seq_len: Length the encoding is padded / truncated to.
    lang_token_map: Mapping from language code to tag string.

  Returns:
    The first row of the tensor returned by ``tokenizer.encode``.
  """
  # The tag goes directly in front of the text, with no separator.
  tagged_text = lang_token_map[target_lang] + text

  encoded = tokenizer.encode(
      text=tagged_text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=seq_len,
  )

  # `encode` returns a batch of one; hand back the single sequence.
  return encoded[0]
  


In [8]:
# Load the saved weights from `model_path` into the model, mapping tensors to the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (newer PyTorch releases offer `weights_only=True` — confirm
# the installed version before switching).
model.load_state_dict(torch.load(model_path,map_location = torch.device('cpu')))

<All keys matched successfully>

In [9]:
def translator(input_text1,output_language1):
    """Translate ``input_text1`` into ``output_language1`` and print the result.

    Relies on the notebook-level ``model``, ``tokenizer`` and
    ``LANG_TOKEN_MAPPING``.

    Args:
        input_text1: Text to translate.
        output_language1: Target language code (a key of ``LANG_TOKEN_MAPPING``).

    Returns:
        None. The source text and its translation are printed.
    """
    encoded = encode_input_str(
        text=input_text1,
        target_lang=output_language1,
        tokenizer=tokenizer,
        seq_len=model.config.max_length,
        lang_token_map=LANG_TOKEN_MAPPING,
    )
    # Add a leading dimension of size 1 so a single sequence is passed as a batch.
    batch = encoded.unsqueeze(0)

    generated = model.generate(batch, num_beams=20, length_penalty=0.2)

    print("TRANSLATION:")
    translation = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(input_text1 + '  ->  ' + translation)

In [10]:
import cv2
from PIL import Image
import pytesseract
import matplotlib.pyplot as plt
import numpy as np
import sys

In [11]:
def display(im_path):
    """Show the image stored at ``im_path`` in a borderless matplotlib figure.

    Args:
        im_path: Path of the image file to read and show.
    """
    dpi = 80
    pixels = plt.imread(im_path)
    height, width = pixels.shape[:2]

    # Figure size in inches derived from the image size in pixels and `dpi`.
    fig = plt.figure(figsize=(width / float(dpi), height / float(dpi)))

    # One axes covering the entire figure, with spines and ticks hidden.
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis('off')

    ax.imshow(pixels, cmap='gray')
    plt.show()


In [12]:
def remove_borders(image):
    """Crop ``image`` to the bounding box of its largest external contour.

    Args:
        image: Image array in BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY``).

    Returns:
        The cropped region (a slice of ``image``), or ``image`` itself when
        no contour is found.
    """
    # cvtColor returns a new array, so the input does not need to be copied first.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # findContours returns (contours, hierarchy) in OpenCV 4 but
    # (image, contours, hierarchy) in OpenCV 3; index -2 is the contour
    # list in both layouts.
    contours = cv2.findContours(gray, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if len(contours) == 0:
        # Nothing to crop to (e.g. an all-black image); previously this
        # case raised IndexError.
        return image

    # Largest contour by area; the stable sort keeps the original tie-breaking.
    largest = sorted(contours, key=cv2.contourArea)[-1]
    x, y, w, h = cv2.boundingRect(largest)
    return image[y:y+h, x:x+w]

In [13]:
import speech_recognition as sr

In [14]:
def audio_to_text(audiopath1,output_language1):
    """Transcribe the audio file at ``audiopath1`` with ``recognize_google``.

    Args:
        audiopath1: Path of the audio file to open with ``sr.AudioFile``.
        output_language1: Passed as the ``language`` argument of
            ``recognize_google``. Despite the name, the caller in this
            notebook passes the language spoken in the recording, not the
            translation target.

    Returns:
        The text returned by ``recognize_google``.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audiopath1) as source:
        recording = recognizer.record(source)
        transcript = recognizer.recognize_google(recording, language=output_language1)
    return transcript

In [17]:
# Interactive driver: ask for an input kind (image / text / audio) and a
# target language, extract the source text, and print its translation.

def _prompt_output_language():
    """Ask for the target language; return 'fi' or 'en', or None if invalid."""
    print("What is the output language?")
    answer = input("fi for finnish or en for english: ").strip().lower()
    if answer not in ('fi', 'en'):
        print("Please enter only en or fi!")
        return None
    return answer


def _as_sentence(text):
    """Strip surrounding whitespace and terminate ``text`` with a full stop."""
    # OCR output in particular ends with newlines, which would otherwise
    # leave the appended '.' on a line of its own.
    return text.strip() + '.'


choice = input("Enter 1 for image, 2 for text or 3 for audio ")

try:
    choice = int(choice)
except ValueError:
    print("Not a number!")
else:
    # For image input
    if choice == 1:
        imagepath = input("Enter image path(eg. 'IMG.jpg'): ")
        output_language = _prompt_output_language()

        if output_language is not None:
            img = cv2.imread(imagepath)
            if img is None:
                # cv2.imread signals an unreadable or missing file by
                # returning None rather than raising.
                print("Could not read image: " + imagepath)
            else:
                # NOTE(review): the original also called remove_borders(img)
                # here but never used the result, so OCR has always run on
                # the uncropped image. That behaviour is kept; pass the
                # cropped image to pytesseract if border removal is wanted.
                if output_language == 'en':
                    # Translating to English: read the image as Finnish text.
                    input_text = pytesseract.image_to_string(img, lang="fin")
                else:
                    input_text = pytesseract.image_to_string(img)

                input_text = _as_sentence(input_text)
                display(imagepath)
                translator(input_text, output_language)

    # For text input
    elif choice == 2:
        input_text = _as_sentence(input("Enter sentence (less than 20 words): "))
        output_language = _prompt_output_language()

        if output_language is not None:
            translator(input_text, output_language)

    # For audio input
    elif choice == 3:
        audiopath = input("Enter audio path(eg. 'audio.wav'): ")
        output_language = _prompt_output_language()

        if output_language is not None:
            # The recording is in the language that is *not* the target.
            if output_language == 'en':
                input_text = audio_to_text(audiopath, 'fi')
            else:
                input_text = audio_to_text(audiopath, 'en-GB')

            input_text = _as_sentence(input_text)
            translator(input_text, output_language)

    else:
        print("Please enter 1,2 or 3 only!")

Enter 1 for image, 2 for text or 3 for audio 2
Enter sentence (less than 20 words): Olette typerä
What is the output language?
fi for finnish or en for english: en
TRANSLATION:
Olette typerä.  ->  You are stupid.
