# 1. Import and Install Dependencies

In [None]:
!pip install psycopg2-binary
!pip install torch torchvision torchaudio
!pip install sentence-transformers psycopg2-binary faiss-cpu
!pip install language_tool_python
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install imageio
!pip install python-dotenv


In [None]:
import psycopg2
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import faiss
import language_tool_python
import spacy
import os
from moviepy.editor import VideoFileClip, concatenate_videoclips
from dotenv import load_dotenv

# 2. Storing Word Embeddings using PostgreSQL database

In [None]:
# Base directory for the notebook's files: 'words.txt', the 'videos' folder and
# the concatenated output video are all resolved relative to ROOT.
ROOT = '.'

In [None]:
# Read the .env file so its entries become available through os.getenv
load_dotenv()

# Database settings, one variable per environment key
db_host, db_name, db_user, db_password, db_port = (
    os.getenv(key)
    for key in ("DB_HOST", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT")
)

# Seed numpy and torch with the same fixed value for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Open the PostgreSQL connection and a cursor on it; both are reused by later cells
conn = psycopg2.connect(host=db_host, dbname=db_name, user=db_user, password=db_password, port=db_port)
cur = conn.cursor()

# Make sure the embeddings table exists, then commit the DDL statement
create_table_query = '''
CREATE TABLE IF NOT EXISTS word_embeddings (
    id SERIAL PRIMARY KEY,
    word TEXT NOT NULL,
    embedding FLOAT8[]
);
'''
cur.execute(create_table_query)
conn.commit()

In [None]:
# Read the vocabulary, one word per line.
# - The encoding is given explicitly so the result does not depend on the
#   platform's default codec.
# - Blank lines are skipped so that empty strings are never embedded or stored
#   as words.
with open(os.path.join(ROOT, 'words.txt'), 'r', encoding='utf-8') as file:
    words = [word for word in (line.strip() for line in file) if word]

# Print the number of words to verify
print(len(words))


2000


In [None]:
# Load a pre-trained sentence transformer model.
# `model` is also used by semantic_search() to encode queries, so queries and
# stored words are embedded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the words; they are paired with `words` via zip()
# when the rows are inserted into the database.
embeddings = model.encode(words)

In [None]:
# Insert words and their embeddings into the database.
#
# psycopg2 runs these statements inside a transaction, so they must be committed
# explicitly; otherwise the rows are discarded when the connection is closed.
# On failure the transaction is rolled back so the connection stays usable.
#
# NOTE: this is a plain INSERT into a table keyed only by a serial id, so
# re-running the cell appends the same words again.
insert_query = "INSERT INTO word_embeddings (word, embedding) VALUES (%s, %s)"
try:
    cur.executemany(
        insert_query,
        # Each embedding is converted to a Python list of floats for the FLOAT8[] column
        [(word, embedding.tolist()) for word, embedding in zip(words, embeddings)],
    )
    conn.commit()
except Exception:
    conn.rollback()
    raise

# 3. Convert Text to Sign Videos

## 3.1 Grammar Correction

In [None]:
def correct_english_grammar(text):
    """Return ``text`` with LanguageTool's suggested corrections applied.

    A new ``en-US`` LanguageTool instance is created for the call and always
    closed afterwards, so the process backing it is not left running.

    Args:
        text (str): The sentence to check.

    Returns:
        str: The corrected sentence.
    """
    tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = tool.check(text)
        return language_tool_python.utils.correct(text, matches)
    finally:
        # Release the tool even when check()/correct() raises
        tool.close()


In [None]:
# Quick check of the grammar corrector on a sentence that contains errors
text = "I lives in Taxiss"
corrected_text = correct_english_grammar(text)
print(corrected_text)


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [01:12<00:00, 3.39MB/s] 
Unzipping C:\Users\islam\AppData\Local\Temp\tmpgj8wsyy2.zip to C:\Users\islam\.cache\language_tool_python.
Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to C:\Users\islam\.cache\language_tool_python.


I live in Taxis


## 3.2 Lemmatization

In [None]:
# Loaded spaCy pipelines keyed by model name, so a model is loaded only once
# instead of on every call.
_SPACY_PIPELINES = {}


def get_lemmatized_words(sentence, excluded_pos=('AUX', 'ADP', 'SYM', 'PUNCT', 'SPACE')):
    """Lemmatize ``sentence`` and drop tokens that should not be signed.

    Args:
        sentence (str): The sentence to process.
        excluded_pos (collection of str): Coarse part-of-speech tags whose
            tokens are removed. Besides auxiliaries, adpositions and symbols,
            punctuation and whitespace tokens are excluded by default so they
            are not passed on as words to look up.

    Returns:
        list[str]: Lemmas of the remaining tokens, in sentence order.
    """
    model_name = 'en_core_web_sm'
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        nlp = _SPACY_PIPELINES[model_name] = spacy.load(model_name)

    doc = nlp(sentence)
    return [token.lemma_ for token in doc if token.pos_ not in excluded_pos]


## 3.3 Semantic Search

In [None]:
def get_words_and_embeddings_from_db():
    """Fetch every stored word together with its embedding.

    Uses the notebook-level cursor ``cur``.

    Returns:
        tuple: ``(words, embeddings)`` - two parallel tuples. ``words`` holds
        the stored strings and ``embeddings`` holds one numpy array per word.
        Both tuples are empty when the table has no rows.
    """
    cur.execute("SELECT word, embedding FROM word_embeddings")
    records = cur.fetchall()
    if not records:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise ValueError
        return (), ()
    words, embeddings = zip(*[(word, np.array(embedding)) for word, embedding in records])
    return words, embeddings

In [None]:
def semantic_search(query, top_k=5, threshold=0.75):
    """Find stored words whose embeddings are close to ``query``.

    The query is encoded with the notebook-level ``model`` and compared with
    every stored embedding through a FAISS ``IndexFlatL2`` index. Distances are
    turned into similarities with ``1 - distance / 2``.
    NOTE(review): that formula equals cosine similarity only if the index
    returns squared L2 distances and all embeddings are unit-length - confirm
    for the model in use.

    Args:
        query (str): Word or phrase to look up.
        top_k (int): Maximum number of nearest neighbours to consider.
        threshold (float): Minimum similarity a neighbour needs to be kept.

    Returns:
        list: The matching words in the order the index returned them. When
        nothing qualifies (including when no words are stored), a one-element
        list holding the upper-cased characters of ``query``, e.g.
        ``[['H', 'I']]``.
    """
    query_embedding = model.encode([query])
    words, embeddings = get_words_and_embeddings_from_db()

    results = []
    print(f"Semantic Results of {query} :")

    # With no stored words there is nothing to index; fall through to the fallback.
    if len(words) > 0:
        # Give FAISS float32, C-contiguous input explicitly rather than relying
        # on an implicit conversion (the stored arrays come back as float64).
        vectors = np.ascontiguousarray(embeddings, dtype='float32')
        query_vector = np.ascontiguousarray(query_embedding, dtype='float32')

        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors)
        # Never ask for more neighbours than there are stored vectors
        distances, indices = index.search(query_vector, min(top_k, len(words)))

        similarities = 1 - distances / 2

        for sim, idx in zip(similarities[0], indices[0]):
            # A negative index marks an unfilled result slot; it must not be
            # used to index `words` (it would silently pick the last word).
            if idx >= 0 and sim >= threshold:
                print(words[idx])
                results.append(words[idx])

    if not results:
        return [list(query.upper())]
    return results


In [None]:
def semantic_search_multiword_glosses(lemmas):
    """Map every lemma to a single gloss token.

    Each lemma is passed to semantic_search() and only the first entry of the
    returned list is kept.

    Args:
        lemmas: Iterable of lemma strings.

    Returns:
        list: One token per lemma, in input order.
    """
    return [semantic_search(lemma)[0] for lemma in lemmas]


## 3.4 Video Concatenation

In [None]:
def get_video_paths(tokens):
    """Resolve gloss tokens to the video files that exist on disk.

    A string token maps to ``<ROOT>/videos/<token>.mp4``. A list token is
    treated as a sequence of letters, each mapping to
    ``<ROOT>/videos/video_letters/<letter>.mp4``. Missing files are reported
    with a printed message and left out of the result.

    Args:
        tokens: Iterable of strings and/or lists of letters.

    Returns:
        list[str]: Existing video paths, in token order.
    """
    videos_dir = os.path.join(ROOT, 'videos')
    letters_dir = os.path.join(videos_dir, 'video_letters')

    found = []
    for token in tokens:
        # Spelled-out tokens expand to one candidate per letter
        if isinstance(token, list):
            candidates = [os.path.join(letters_dir, letter + '.mp4') for letter in token]
        else:
            candidates = [os.path.join(videos_dir, token + '.mp4')]

        for candidate in candidates:
            if not os.path.exists(candidate):
                print("Path not found at", candidate)
                continue
            found.append(candidate)
    return found

In [None]:
def concatenate_videos(video_paths, output_path):
    """Join the given video files end to end and write the result to disk.

    Every opened clip, and the concatenated clip, is closed before returning,
    whether or not writing succeeds, so readers and file handles are not left
    open.

    Args:
        video_paths (list[str]): Paths of the clips to join, in playback order.
        output_path (str): Destination file, encoded with the libx264 codec.

    Raises:
        ValueError: If ``video_paths`` is empty.
    """
    if not video_paths:
        # Fail with a clear message instead of an obscure error from the concatenation call
        raise ValueError("No video paths were provided to concatenate")

    video_clips = []
    concatenated_clip = None
    try:
        for video_path in video_paths:
            video_clips.append(VideoFileClip(video_path))

        concatenated_clip = concatenate_videoclips(video_clips)
        concatenated_clip.write_videofile(output_path, codec='libx264')
    finally:
        # Close the combined clip first, then each source clip
        if concatenated_clip is not None:
            concatenated_clip.close()
        for clip in video_clips:
            clip.close()



# 4. Conversion Pipeline

In [None]:
# Input sentence and the location of the concatenated output video
sentence = 'I went to school yesterday'
output_path = os.path.join(ROOT, 'concatenated.mp4')

# Step 1: grammar correction
corrected_sentence = correct_english_grammar(sentence)
print("corrected: ", corrected_sentence)

# Step 2: lemmatization of the corrected sentence
lemmas = get_lemmatized_words(corrected_sentence)
print("lemmatized: ", lemmas)

# Step 3: map each lemma to a gloss token via semantic search
glosses = semantic_search_multiword_glosses(lemmas)
print("glosses: ", glosses)

# Step 4: resolve the glosses to video files and join them into one video.
# Despite its name, `gif_paths` holds the paths returned by get_video_paths().
gif_paths = get_video_paths(glosses)
concatenate_videos(gif_paths, output_path)

# 5. Close Database Connection

In [None]:
# Close the cursor first, then the connection it was created from
cur.close()
conn.close()