In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
file_path = "/content/jpn.txt"

# Every usable line is tab-separated into exactly these three fields:
# English sentence, Japanese sentence, attribution/licence note.
column_names = ['source', 'target', 'col3']

with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Split each line into its tab-separated fields.
rows = [line.strip().split('\t') for line in lines]

# Keep only well-formed rows. A blank or truncated line yields fewer fields
# (leaving missing values that break tokenization later), and a line with
# extra tabs yields more fields than there are column names.
data = [fields for fields in rows if len(fields) == len(column_names)]

# Report dropped lines instead of discarding them silently.
skipped = len(rows) - len(data)
if skipped:
    print(f"Skipped {skipped} malformed line(s) in {file_path}")

# Create DataFrame
df = pd.DataFrame(data, columns=column_names)

In [None]:
# Sanity check: (number of sentence pairs, number of columns).
df.shape

(55639, 3)

In [None]:
# Show the loaded sentence pairs (the rendered output elides the middle rows).
df

Unnamed: 0,source,target,col3
0,Go.,行け。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,行きなさい。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Hi.,こんにちは。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Hi.,もしもし。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,やっほー。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
55634,The bus now arriving is going to Domestic Term...,ただ今到着のバスは、国内線第1ターミナル行きです。国際線ターミナルにお越しの方は、しばらくそ...,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
55635,A child who is a native speaker usually knows ...,ネイティブの子どもは、何年も学んだ非ネイティブが知らず今後も知り得ないたくさんのことを自身の...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
55636,"I do many things at the same time, so not only...",色々並行してやってるから芥川ばかり読んでるのでもないのだよ。今は英語読んでる時間が増えてる。...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
55637,The small crowd at Hiroshima Peace Memorial Pa...,アメリカ軍用機エノラ・ゲイから「リトルボーイ」と名付けられた原子爆弾が投下された午前８時１５...,CC-BY 2.0 (France) Attribution: tatoeba.org #8...


In [None]:
#df=df.sample(20000)

In [None]:
# Pull the two sentence columns out of the frame as plain Python lists;
# the attribution column is not used for training.
source_text, target_text = (df[column].tolist() for column in ('source', 'target'))

In [None]:
# Build the source-side vocabulary from the English sentences.
source_tokenizer = Tokenizer()
source_tokenizer.fit_on_texts(source_text)

In [None]:
# One extra slot on top of the learned vocabulary: id 0 is not assigned to
# any word, so the embedding table needs len(word_index) + 1 rows.
source_vocab_size = 1 + len(source_tokenizer.word_index)

# Encode every source sentence as a list of integer token ids.
source_sequences = source_tokenizer.texts_to_sequences(source_text)

In [None]:
# Japanese is written without spaces between words, so a whitespace-splitting
# tokenizer would treat almost every whole sentence as a single "word" and the
# model would degenerate into picking one memorised sentence. Tokenize the
# target side per character instead.
target_tokenizer = Tokenizer(char_level=True)
target_tokenizer.fit_on_texts(target_text)

# Encode every target sentence as a list of integer character ids.
target_sequences = target_tokenizer.texts_to_sequences(target_text)

# +1 because id 0 is not assigned to any token (it is used for padding).
target_vocab_size = len(target_tokenizer.word_index) + 1

In [None]:
# Longest encoded sentence on each side; these become the fixed
# encoder-input and decoder-output lengths of the model.
max_source_length = max(map(len, source_sequences))
max_target_length = max(map(len, target_sequences))

# Right-pad every sequence up to its side's maximum length so the data
# forms rectangular arrays.
source_sequences_padded = pad_sequences(
    source_sequences,
    padding='post',
    maxlen=max_source_length,
)
target_sequences_padded = pad_sequences(
    target_sequences,
    padding='post',
    maxlen=max_target_length,
)

In [None]:
# Hold out 20% of the sentence pairs for validation; the fixed random_state
# makes the split reproducible across runs.
source_train, source_val, target_train, target_val = train_test_split(
    source_sequences_padded,
    target_sequences_padded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model hyper-parameters: size of each token embedding vector and the number
# of hidden units in each LSTM layer.
embedding_dim, units = 256, 100

In [None]:
# Encoder-decoder translation model, assembled layer by layer.
model = tf.keras.Sequential()

# Encoder: embed the source token ids (id 0 is masked as padding) and
# compress the whole sentence into a single vector with an LSTM.
model.add(
    tf.keras.layers.Embedding(
        source_vocab_size,
        embedding_dim,
        input_length=max_source_length,
        mask_zero=True,
    )
)
model.add(tf.keras.layers.LSTM(units))

# Bridge: feed that sentence vector to the decoder once per output position.
model.add(tf.keras.layers.RepeatVector(max_target_length))

# Decoder: an LSTM that emits one state per output position, followed by a
# softmax over the target vocabulary at each position.
model.add(tf.keras.layers.LSTM(units, return_sequences=True))
model.add(tf.keras.layers.Dense(target_vocab_size, activation='softmax'))

In [None]:
# The targets are integer token ids (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, evaluating on the held-out split after each one.
model.fit(
    source_train,
    target_train,
    validation_data=(source_val, target_val),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b578b9fce50>

In [None]:
# Example English sentence to translate (a plain one-line string).
input_sentence = "Hi."

# Tokenize and pad the input sentence to the fixed length the model expects
input_sequence = source_tokenizer.texts_to_sequences([input_sentence])
input_sequence_padded = pad_sequences(input_sequence, maxlen=max_source_length, padding='post')

In [None]:


input_sentence = "Go"

# Convert the sentence to source token ids, then right-pad it to the fixed
# input length the model was built with.
input_sequence = source_tokenizer.texts_to_sequences([input_sentence])
input_sequence_padded = pad_sequences(
    input_sequence,
    padding='post',
    maxlen=max_source_length,
)

In [None]:
# Make prediction: one probability distribution over the target vocabulary
# for each output position of the (single) input sentence.
predicted_sequence = model.predict(input_sequence_padded)

# Most probable token id at each output position, kept in sequence order.
predicted_sequence_indices = np.argmax(predicted_sequence[0], axis=-1).tolist()

# Convert ids back to tokens position by position. Looking each id up in
# index_word preserves the predicted order and any repeated tokens; ids with
# no vocabulary entry (such as the 0 used for padding) are dropped.
index_to_word = target_tokenizer.index_word
predicted_words = [
    index_to_word[index]
    for index in predicted_sequence_indices
    if index in index_to_word
]

# Join the predicted tokens to form the translated sentence: no separator for
# a character-level tokenizer, a space for a word-level one.
token_separator = '' if target_tokenizer.char_level else ' '
translated_sentence = token_separator.join(predicted_words)



In [None]:
# Display the decoded translation of `input_sentence`.
translated_sentence

'すごい！'

In [None]:
# Persist the trained model to disk. The result of save() is deliberately not
# assigned back to `model`: save() returns None, which would discard the
# in-memory model and break any later cell that uses it.
model.save("Transalter.h5")

  saving_api.save_model(
