<a href="https://colab.research.google.com/github/TranQuocViet236/Somethings_on_Colab/blob/main/Detect_language_by_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

#Import Keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

import re
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import OneHotEncoder

import numpy as np
import pickle
import sys



Remove all special characters and
convert the text to lower case.

In [None]:

#Hyperparameters:
max_sentence_length = 200  # maxlen given to pad_sequences and input_length of the Embedding layer
embedding_vector_length = 300  # second argument of the Embedding layer (size of each word vector)
dropout = 0.5  # used as both dropout and recurrent_dropout in the LSTM layers
sentence = "   I love) you?  "  # sample text for trying process_sentence; this global is not read by the code shown here
def process_sentence(sentence):
  """Return `sentence` lower-cased, with punctuation deleted and edges trimmed.

  Args:
    sentence: the raw text to clean.

  Returns:
    The cleaned string. Characters that are not in the punctuation class
    below (parentheses, for example) are left untouched.
  """
  # The class lists the marks to delete: backslash, slash, colon, asterisk,
  # the « quote, backtick, both quote characters, ? ¿ ; ! < > , . and |.
  # It must be closed by exactly one "]": a second one would be a literal that
  # has to follow each mark, so ordinary punctuation would never match.
  cleaned = re.sub(r'[\\/:*«`\'?¿";!<>,.|]', '', sentence.lower())
  # Trim last so whitespace exposed by a deleted mark (as in "hi !") goes too.
  return cleaned.strip()


Create a lookup table for the vocabulary

In [None]:
def create_lookup_table(text):
  """Build two-way lookup tables between items and integer ids.

  Args:
    text: an iterable of hashable items (words that are already split, or
      labels). Nothing is split here; a plain string is treated as a
      sequence of characters.

  Returns:
    (vocab_to_int, int_to_vocab). Ids start at 0 and follow the order in
    which each distinct item first appears in `text`.
  """
  # dict.fromkeys removes duplicates while keeping first-seen order.
  # Enumerating a set instead would hand out string ids that change from one
  # interpreter run to the next (hash randomization), so the tables would not
  # match a model saved in an earlier session.
  vocab = dict.fromkeys(text)

  vocab_to_int = {word: i for i, word in enumerate(vocab)}
  int_to_vocab = dict(enumerate(vocab))
  return (vocab_to_int, int_to_vocab)

In [None]:
# print(create_lookup_table('Anh yeu em nhieu lam'))
# #Convert text into number
def convert_to_int(data, data_int):
  """Encode each sentence in `data` as a list of integer ids.

  Every sentence is split on whitespace and each token is looked up in
  `data_int`. A token that is missing from the table is replaced by the id
  stored under the "unk" key (a KeyError is raised if that key is absent).

  Returns:
    A list with one list of ids per sentence, in input order.
  """
  return [
    [data_int[token if token in data_int else "unk"] for token in line.split()]
    for line in data
  ]

      

In [None]:
#Load data from file
def load_data(file_path):
  """Read a header-less, two-column CSV of (sentence, language) rows.

  Prints the frame's describe() summary as a quick sanity check and returns
  the DataFrame with columns 'sentence' and 'language'.
  """
  column_names = ['sentence', 'language']
  frame = pd.read_csv(file_path, header=None, names=column_names)
  summary = frame.describe()
  print(summary)
  return frame


In [None]:
#Building model
def get_model():
  """Build and compile the two-layer LSTM language classifier.

  Reads the module-level names vocab_to_int, languages,
  embedding_vector_length, max_sentence_length and dropout, so all of them
  must be defined before this is called.

  Returns:
    A compiled Sequential model: Embedding -> LSTM(256) -> LSTM(256) ->
    softmax Dense with one unit per language.
  """
  model = Sequential()

  # One embedding row per vocabulary id.
  model.add(Embedding(len(vocab_to_int), embedding_vector_length, input_length=max_sentence_length))
  # return_sequences=True passes the whole sequence on to the second LSTM.
  model.add(LSTM(256, return_sequences=True, dropout=dropout, recurrent_dropout=dropout))
  model.add(LSTM(256, dropout=dropout, recurrent_dropout=dropout))
  model.add(Dense(len(languages), activation='softmax'))
  # The labels are one-hot and mutually exclusive, so the softmax output is
  # trained with categorical cross-entropy. Binary cross-entropy would treat
  # each output unit as its own yes/no problem and report an inflated 'acc'.
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

  return model

In [None]:
#Convert an input sentence to integer vector and predict base on model
def predict_sentence(model, sentence, vocab_to_int, idx_to_language):
  """Predict the language of a single sentence.

  Args:
    model: trained model exposing predict().
    sentence: raw text to classify.
    vocab_to_int: word -> id table; convert_to_int falls back to its "unk"
      entry for words that are not in the table.
    idx_to_language: mapping from output index to language label.

  Returns:
    The label of the most probable language, or 'Unknown' when that
    probability is below 0.3. The winning probability is also printed.
  """
  #Clean the sentence
  sentence = process_sentence(sentence)
  # convert_to_int expects an iterable of sentences. A bare string would be
  # walked character by character, so wrap the sentence in a list.
  sen_encode = convert_to_int([sentence], vocab_to_int)

  #Pad to the fixed length the model was built with: one row of max_sentence_length ids
  X = sequence.pad_sequences(sen_encode, maxlen=max_sentence_length)

  prediction = model.predict(X)

  #Get the highest prediction (row 0 is the only sentence in the batch)
  probabilities = prediction[0]
  lang_index = int(np.argmax(probabilities))
  print(probabilities[lang_index])

  # If the probability is below 0.3, the language cannot be determined
  if probabilities[lang_index] < 0.3:
    return 'Unknown'
  return idx_to_language[lang_index]
    




In [None]:
# Colab-only step: mount Google Drive at /content/gdrive, the directory the dataset path below points into.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#Load the dataframe and create the stratified train/test splitter
filepath = '/content/gdrive/MyDrive/ML/LSTM/language_data.csv'
data = load_data(filepath)

# 20% of the rows go to the test side; random_state=0 fixes the shuffling seed.
# n_splits is not passed, so the splitter keeps the library default number of splits.
sss = StratifiedShuffleSplit(test_size = 0.2, random_state=0)


In [None]:
#Process sentences:
X = data['sentence'].apply(process_sentence)
y = data['language']

In [None]:
# Flatten the corpus into one list of words: join all sentences with spaces, then split on whitespace
elements = ' '.join(X).split()

In [None]:
# Placeholders for the train/test subsets; the real values are assigned when the split is taken.
X_train = X_test = y_train = y_test = None

In [None]:
# Debug output only: this prints the object returned by split(), not the index arrays it produces when iterated.
print(sss.split(X,y))

In [None]:
# Take one stratified split. Looping over sss.split() would compute every
# split the splitter is configured for and keep only the last one.
train_index, test_index = next(sss.split(X, y))
# split() yields positional indices, so select the rows by position with .iloc.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Distinct language labels; their count sizes the model's output layer.
languages = set(y)
# Add the fallback token so the vocabulary has an id for unseen words.
elements.append('unk')

In [None]:
# Inspect the label set and the word list (a notebook cell displays only its last expression).
languages
elements

In [None]:
# Word <-> id tables over every word of the corpus plus the 'unk' fallback token.
# NOTE(review): ids start at 0 and pad_sequences pads with 0 by default, so one real
# word appears to share its id with padding — confirm whether that is intended.
(vocab_to_int, int_to_vocab) = create_lookup_table(elements)
# Language label <-> id tables built from the label column.
(languages_to_int, int_to_languages) = create_lookup_table(y)

In [None]:
# Dump the four lookup tables, one per line, for a visual check.
for table in (vocab_to_int, int_to_vocab, languages_to_int, int_to_languages):
  print(table)

In [None]:
# Turn every sentence into a list of vocabulary ids (the 'unk' id stands in for words missing from the table).
X_train_encode = convert_to_int(X_train, vocab_to_int)
X_test_encode = convert_to_int(X_test, vocab_to_int)

# Label ids for the test split; no later cell shown here reads y_data.
y_data = convert_to_int(y_test, languages_to_int)

In [None]:
enc = OneHotEncoder()
# Learn the label -> column layout from the training labels only.
y_train_encode = enc.fit_transform(convert_to_int(y_train, languages_to_int)).toarray()
# Reuse the fitted encoder for the test labels. Refitting on the test split
# could give a different column layout than the one the model is trained on.
y_test_encode = enc.transform(convert_to_int(y_test, languages_to_int)).toarray()

In [None]:

# Bring every encoded sentence to the same length (max_sentence_length ids) so the model receives rectangular arrays.
X_train_pad = sequence.pad_sequences(X_train_encode,maxlen= max_sentence_length)
X_test_pad = sequence.pad_sequences(X_test_encode, maxlen= max_sentence_length)


In [None]:
# Visual check of the padded id matrices.
print(X_train_pad)
print(X_test_pad)

In [None]:
# Build the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

In [None]:
# Train on the padded training sentences and one-hot labels: 5 passes over the data, 256 samples per batch.
model.fit(X_train_pad, y_train_encode, epochs=5, batch_size=256)

In [None]:
# Evaluate the model on the held-out test split
scores = model.evaluate(X_test_pad, y_test_encode, verbose=0)
# scores[1] is the 'acc' metric requested in compile(); scores[0] is the loss
print("Accuracy: %.2f%%" % (scores[1]*100))

# Save the model to a file
model.save("model.h5")
print("Model trained and saved!")

In [None]:
# load_weights restores the weights into the existing model in place. Its
# return value is not the model, so assigning it back would leave `model`
# unusable for prediction.
model.load_weights("model.h5")
print("Model loaded!")

In [None]:
# predict_sentence(model, "I love you", vocab_to_int, int_to_languages)