In [None]:
!pip install openai streamlit --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!apt-get update -qq
!apt-get install -y ffmpeg


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 36 not upgraded.


In [None]:
!apt-get update -qq
!apt-get install -y ffmpeg
!pip install transformers moviepy pillow --quiet

from transformers import pipeline
from moviepy.editor import VideoFileClip, concatenate_videoclips, ImageClip
from PIL import Image, ImageDraw, ImageFont
import os
import string
import numpy as np

# Shared Hugging Face text-generation pipeline backed by the `gpt2`
# checkpoint; created once at cell level and reused by refine_text_local().
generator = pipeline('text-generation', model='gpt2')

def refine_text_local(text):
    """Ask the GPT-2 pipeline for a "refined" version of ``text``.

    The prompt ends with ``Refined:`` and only the first line the model
    produces after it is kept.

    Args:
        text: Sentence to refine.

    Returns:
        The first generated line with surrounding whitespace removed, or the
        stripped input when the model produces nothing usable.
    """
    prompt = f"Refine this sentence for sign language translation clarity:\n{text}\nRefined:"
    result = generator(
        prompt,
        # Budget only the continuation. The previous ``max_length=50`` also
        # counted the prompt tokens and was overridden by the pipeline's
        # default ``max_new_tokens`` (see the warning in the cell output).
        max_new_tokens=30,
        num_return_sequences=1,
        do_sample=False,
        truncation=True,
        # Return just the continuation instead of searching the full text
        # for the "Refined:" marker.
        return_full_text=False,
    )
    refined_text = result[0]['generated_text'].strip().split("\n")[0].strip()
    # An empty generation would produce no glosses; fall back to the input.
    return refined_text or text.strip()

def clean_text(text):
    """Return ``text`` with every ASCII punctuation character removed."""
    punctuation = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation)

# Lookup table from a lowercase English word to its gloss label.
# Note that "thank" and "thanks" share the single gloss "THANK".
sign_gloss_dict = dict(
    help="HELP",
    you="YOU",
    how="HOW",
    can="CAN",
    i="I",
    today="TODAY",
    thank="THANK",
    thanks="THANK",
    please="PLEASE",
    hello="HELLO",
    goodbye="GOODBYE",
    yes="YES",
    no="NO",
)

def text_to_gloss(text):
    """Convert a sentence into a list of gloss labels, one per word.

    Punctuation is stripped and the text lower-cased before it is split on
    whitespace. Words present in ``sign_gloss_dict`` use the mapped gloss;
    any other word is simply upper-cased.
    """
    gloss_list = []
    for token in clean_text(text).lower().split():
        gloss_list.append(sign_gloss_dict.get(token, token.upper()))
    return gloss_list

def create_dummy_clip_pil(gloss, duration=2, size=(320, 240), bgcolor=(50, 150, 250)):
    """Render ``gloss`` as centred white text and save it as a short MP4.

    The clip is written to ``./sign_videos/<gloss>.mp4``; the directory is
    created when it is missing.

    Args:
        gloss: Text to draw; also used as the output file name.
        duration: Clip length handed to ``set_duration``.
        size: ``(width, height)`` of the frame in pixels.
        bgcolor: RGB background colour of the frame.
    """
    img = Image.new('RGB', size, color=bgcolor)
    draw = ImageDraw.Draw(img)

    # Start at the original 70 pt and step down until the text fits the frame
    # width, so long glosses are not cut off at the edges.
    margin = 10
    for font_size in (70, 60, 50, 40, 30, 20):
        try:
            font = ImageFont.truetype("DejaVuSans-Bold.ttf", font_size)
        except OSError:
            # Font file could not be opened: use Pillow's built-in font.
            font = ImageFont.load_default()
        left, top, right, bottom = draw.textbbox((0, 0), gloss, font=font)
        if right - left <= size[0] - 2 * margin:
            break

    # The bounding box does not necessarily start at (0, 0); subtract its
    # offset so the visible glyphs, not the text origin, end up centred.
    text_x = (size[0] - (right - left)) / 2 - left
    text_y = (size[1] - (bottom - top)) / 2 - top
    draw.text((text_x, text_y), gloss, font=font, fill=(255, 255, 255))

    clip = ImageClip(np.array(img)).set_duration(duration)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("sign_videos", exist_ok=True)
    output_path = f"./sign_videos/{gloss}.mp4"
    clip.write_videofile(output_path, codec="libx264", fps=24, audio=False, verbose=False, logger=None)
    print(f"Dummy clip created: {output_path}")

def create_dummy_clips_for_glosses(glosses):
    """Create one placeholder clip per distinct gloss in ``glosses``."""
    unique_glosses = set(glosses)  # duplicates would rewrite the same file
    for label in unique_glosses:
        create_dummy_clip_pil(label)

def create_sign_video(glosses, output_filename="output_sign_language.mp4"):
    """Concatenate the per-gloss clips into one video file.

    For each gloss, ``./sign_videos/<gloss>.mp4`` is loaded when it exists;
    missing clips are reported and skipped.

    Args:
        glosses: Gloss labels in the order they should appear.
        output_filename: Path of the combined video to write.

    Returns:
        ``output_filename`` on success, or ``None`` when no clip was found.
    """
    clips = []
    final_clip = None
    try:
        for gloss in glosses:
            video_path = f"./sign_videos/{gloss}.mp4"
            if os.path.exists(video_path):
                clips.append(VideoFileClip(video_path))
            else:
                print(f"Warning: Video clip for '{gloss}' not found. Skipping.")
        if not clips:
            print("No video clips found for the glosses.")
            return None
        final_clip = concatenate_videoclips(clips, method="compose")
        final_clip.write_videofile(output_filename, codec="libx264", fps=24, audio=False)
        return output_filename
    finally:
        # Each VideoFileClip keeps a reader open until it is closed; release
        # them even when loading or writing fails part-way.
        if final_clip is not None:
            final_clip.close()
        for clip in clips:
            clip.close()


# Demo run of the whole pipeline on a fixed sentence.
input_text = "Can you help me today?"
print("Input text:", input_text)

# Step 1: let GPT-2 produce a "refined" sentence from the prompt.
refined = refine_text_local(input_text)
print("Refined text:", refined)

# Step 2: map the refined sentence to gloss labels, one per word.
glosses = text_to_gloss(refined)
print("Sign language glosses:", glosses)

# Step 3: write a text-only placeholder clip for every distinct gloss.
create_dummy_clips_for_glosses(glosses)

# Step 4: stitch the clips together; the result is None when no clip exists.
output_video = create_sign_video(glosses)
if output_video:
    print(f"Video saved to {output_video}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 36 not upgraded.


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input text: Can you help me today?
Refined text: Yes, I can.
Sign language glosses: ['YES', 'I', 'CAN']
Dummy clip created: ./sign_videos/CAN.mp4
Dummy clip created: ./sign_videos/I.mp4
Dummy clip created: ./sign_videos/YES.mp4
Moviepy - Building video output_sign_language.mp4.
Moviepy - Writing video output_sign_language.mp4



                                                               

Moviepy - Done !
Moviepy - video ready output_sign_language.mp4
Video saved to output_sign_language.mp4




You're absolutely right — the current output video displays English words like CAN, I, YES, not real sign language visuals. That's because this version only creates dummy video clips with text overlays representing glosses (which are used to label sign language meaning), not the actual signed gestures.

✅ What you're seeing:
Text-based video placeholders using glosses (e.g., YES, I, CAN) to represent signs.

This is useful for prototyping structure but not useful for real communication with Deaf users.

❌ What’s missing:
Real animated sign language (either video snippets of a signer or a generative avatar signing).

The actual signs using hands/facial expressions, not just the gloss text.

✅ What you can do next:
Option 1: Use pre-recorded sign videos
You could:

Download or collect small video clips (e.g., .mp4) for each gloss (e.g., CAN.mp4, YES.mp4).

Replace dummy video generation with loading real sign clips from a folder.

Stitch those real clips together based on glosses.

I can help you rewrite the code for this. Just ask.

Option 2: Use a generative avatar for signing (harder, but scalable)
Tools like SignAll, SignAvatar, or OpenAI's upcoming video models (like Sora) could eventually generate sign videos from text.

There are also the RWTH-PHOENIX-Weather datasets, which contain aligned gloss/video pairs (for German Sign Language) — useful for training.

Building this from scratch would require a pose-to-video or gloss-to-sign-avatar model (we can guide this process too).

✅ Immediate Step (Most Practical Right Now):
Let’s go with Option 1: Use real sign videos if your goal is to show actual signs.

Would you like me to:

Update the code to load real sign videos from a folder (one per gloss)?

Provide a directory structure and example logic?

Example directory layout:

sign_videos/
├── YES.mp4
├── I.mp4
└── CAN.mp4
Let me know — I can give you ready-to-run code for that.


In [None]:
!apt-get update -qq
!apt-get install -y ffmpeg
!pip install transformers moviepy pillow nltk --quiet

from transformers import pipeline
from moviepy.editor import VideoFileClip, concatenate_videoclips, ImageClip
from PIL import Image, ImageDraw, ImageFont
import os
import string
import numpy as np
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

# Shared Hugging Face text-generation pipeline backed by the `gpt2`
# checkpoint; created once at cell level and reused by refine_text_local().
generator = pipeline('text-generation', model='gpt2')

def refine_text_local(text):
    """Ask the GPT-2 pipeline for a "refined" version of ``text``.

    The prompt ends with ``Refined:`` and only the first line the model
    produces after it is kept.

    Args:
        text: Sentence to refine.

    Returns:
        The first generated line with surrounding whitespace removed, or the
        stripped input when the model produces nothing usable.
    """
    prompt = f"Refine this sentence for sign language translation clarity:\n{text}\nRefined:"
    result = generator(
        prompt,
        # Budget only the continuation. The previous ``max_length=50`` also
        # counted the prompt tokens and was overridden by the pipeline's
        # default ``max_new_tokens`` (see the warning in the cell output).
        max_new_tokens=30,
        num_return_sequences=1,
        do_sample=False,
        truncation=True,
        # Return just the continuation instead of searching the full text
        # for the "Refined:" marker.
        return_full_text=False,
    )
    refined_text = result[0]['generated_text'].strip().split("\n")[0].strip()
    # An empty generation would produce no glosses; fall back to the input.
    return refined_text or text.strip()

def clean_text(text):
    """Return ``text`` with every ASCII punctuation character removed."""
    punctuation = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation)

# Lookup table from a lowercase English word to its gloss label.
# Note that "thank" and "thanks" share the single gloss "THANK".
sign_gloss_dict = dict(
    help="HELP",
    you="YOU",
    how="HOW",
    can="CAN",
    i="I",
    me="ME",
    today="TODAY",
    thank="THANK",
    thanks="THANK",
    please="PLEASE",
    hello="HELLO",
    goodbye="GOODBYE",
    yes="YES",
    no="NO",
)

def get_synonym(word):
    """Return the lower-cased first lemma name of ``word``'s first WordNet synset.

    When WordNet has no synset for ``word`` the word is returned unchanged.
    """
    matches = wn.synsets(word)
    if not matches:
        return word
    first_lemma = matches[0].lemmas()[0]
    return first_lemma.name().lower()

def text_to_gloss(text):
    """Convert a sentence into a list of gloss labels, one per word.

    Punctuation is stripped and the text lower-cased before it is split on
    whitespace. A word found in ``sign_gloss_dict`` uses its mapped gloss.
    Otherwise the word's WordNet synonym is looked up in the dictionary, and
    if that also misses, the original word is upper-cased.
    """
    gloss_list = []
    for token in clean_text(text).lower().split():
        label = sign_gloss_dict.get(token)
        if label is None:
            # No direct entry: try the synonym, else fall back to the word.
            label = sign_gloss_dict.get(get_synonym(token), token.upper())
        gloss_list.append(label)
    return gloss_list

def create_dummy_clip_pil(gloss, duration=2, size=(320, 240), bgcolor=(50, 150, 250)):
    """Render ``gloss`` as centred white text and save it as a short MP4.

    The clip is written to ``./sign_videos/<gloss>.mp4``; the directory is
    created when it is missing.

    Args:
        gloss: Text to draw; also used as the output file name.
        duration: Clip length handed to ``set_duration``.
        size: ``(width, height)`` of the frame in pixels.
        bgcolor: RGB background colour of the frame.
    """
    img = Image.new('RGB', size, color=bgcolor)
    draw = ImageDraw.Draw(img)

    # Start at the original 70 pt and step down until the text fits the frame
    # width, so long glosses are not cut off at the edges.
    margin = 10
    for font_size in (70, 60, 50, 40, 30, 20):
        try:
            font = ImageFont.truetype("DejaVuSans-Bold.ttf", font_size)
        except OSError:
            # Font file could not be opened: use Pillow's built-in font.
            font = ImageFont.load_default()
        left, top, right, bottom = draw.textbbox((0, 0), gloss, font=font)
        if right - left <= size[0] - 2 * margin:
            break

    # The bounding box does not necessarily start at (0, 0); subtract its
    # offset so the visible glyphs, not the text origin, end up centred.
    text_x = (size[0] - (right - left)) / 2 - left
    text_y = (size[1] - (bottom - top)) / 2 - top
    draw.text((text_x, text_y), gloss, font=font, fill=(255, 255, 255))

    clip = ImageClip(np.array(img)).set_duration(duration)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("sign_videos", exist_ok=True)

    output_path = f"./sign_videos/{gloss}.mp4"
    clip.write_videofile(output_path, codec="libx264", fps=24, audio=False, verbose=False, logger=None)
    print(f"Dummy clip created: {output_path}")

def create_dummy_clips_for_glosses(glosses):
    """Create one placeholder clip per distinct gloss in ``glosses``."""
    unique_glosses = set(glosses)  # duplicates would rewrite the same file
    for label in unique_glosses:
        create_dummy_clip_pil(label)

def create_sign_video(glosses, output_filename="output_sign_language.mp4"):
    """Concatenate the per-gloss clips into one video file.

    For each gloss, ``./sign_videos/<gloss>.mp4`` is loaded when it exists;
    missing clips are reported and skipped.

    Args:
        glosses: Gloss labels in the order they should appear.
        output_filename: Path of the combined video to write.

    Returns:
        ``output_filename`` on success, or ``None`` when no clip was found.
    """
    clips = []
    final = None
    try:
        for gloss in glosses:
            video_path = f"./sign_videos/{gloss}.mp4"
            if os.path.exists(video_path):
                clips.append(VideoFileClip(video_path))
            else:
                print(f"Warning: Video for '{gloss}' not found.")
        if not clips:
            print("No clips created.")
            return None
        final = concatenate_videoclips(clips, method="compose")
        final.write_videofile(output_filename, codec="libx264", fps=24, audio=False)
        return output_filename
    finally:
        # Each VideoFileClip keeps a reader open until it is closed; release
        # them even when loading or writing fails part-way.
        if final is not None:
            final.close()
        for clip in clips:
            clip.close()


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 36 not upgraded.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
Device set to use cpu


In [None]:
# Interactive run of the pipeline: read one sentence from the user.
user_input = input("Enter any sentence: ")
print("User input:", user_input)

# Step 1: let GPT-2 produce a "refined" sentence from the prompt.
refined = refine_text_local(user_input)
print("Refined text:", refined)

# Step 2: map the refined sentence to gloss labels, one per word.
glosses = text_to_gloss(refined)
print("Glosses:", glosses)

# Step 3: write a text-only placeholder clip for every distinct gloss.
create_dummy_clips_for_glosses(glosses)

# Step 4: stitch the clips together; the result is None when no clip exists.
output_video = create_sign_video(glosses)
if output_video:
    print(f"Final video saved to {output_video}")


Enter any sentence: What's your guilty pleasure?


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


User input: What's your guilty pleasure?
Refined text: The sentence is:
Glosses: ['THE', 'SENTENCE', 'IS']
Dummy clip created: ./sign_videos/THE.mp4
Dummy clip created: ./sign_videos/IS.mp4
Dummy clip created: ./sign_videos/SENTENCE.mp4
Moviepy - Building video output_sign_language.mp4.
Moviepy - Writing video output_sign_language.mp4



                                                               

Moviepy - Done !
Moviepy - video ready output_sign_language.mp4
Final video saved to output_sign_language.mp4




Converts any user sentence into a simulated sign language video using simplified "glosses."

🔧 Steps:
Refine Text
→ Uses GPT-2 to simplify the sentence for sign clarity.

Convert to Glosses
→ Translates each word into a sign language gloss (e.g., “thank you” → “THANK YOU”).

Create Dummy Video Clips
→ Generates a short video showing each gloss word (text-only).

Combine Clips
→ Stitches all gloss clips into one final video.

In [None]:
!apt-get update -qq
!apt-get install -y ffmpeg
!pip install transformers moviepy pillow nltk --quiet

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from moviepy.editor import VideoFileClip, concatenate_videoclips, ImageClip
from PIL import Image, ImageDraw, ImageFont
import os
import string
import numpy as np
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

# GPT-2 text-generation pipeline, reused by refine_text_local().
generator = pipeline('text-generation', model='gpt2')
# DialoGPT-small tokenizer and causal LM, reused by generate_response().
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")


def refine_text_local(text):
    """Ask the GPT-2 pipeline for a "refined" version of ``text``.

    The prompt ends with ``Refined:`` and only the first line the model
    produces after it is kept.

    Args:
        text: Sentence to refine.

    Returns:
        The first generated line with surrounding whitespace removed, or the
        stripped input when the model produces nothing usable.
    """
    prompt = f"Refine this sentence for sign language translation clarity:\n{text}\nRefined:"
    result = generator(
        prompt,
        # Budget only the continuation. The previous ``max_length=50`` also
        # counted the prompt tokens and was overridden by the pipeline's
        # default ``max_new_tokens`` (see the warning in the cell output).
        max_new_tokens=30,
        num_return_sequences=1,
        do_sample=False,
        truncation=True,
        # Return just the continuation instead of searching the full text
        # for the "Refined:" marker.
        return_full_text=False,
    )
    refined_text = result[0]['generated_text'].strip().split("\n")[0].strip()
    # An empty generation would produce no glosses; fall back to the input.
    return refined_text or text.strip()

def generate_response(user_input):
    """Generate a conversational reply to ``user_input`` with DialoGPT.

    Args:
        user_input: The user's message.

    Returns:
        The decoded text of the tokens generated after the prompt, with
        special tokens removed.
    """
    # Calling the tokenizer (rather than ``encode``) also yields the
    # attention mask. Without it ``generate`` cannot infer the mask because
    # the pad token equals the EOS token, and it emits a warning.
    encoded = tokenizer(user_input + tokenizer.eos_token, return_tensors="pt")
    input_ids = encoded["input_ids"]
    output_ids = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],
        max_length=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated reply is decoded.
    response = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response

def clean_text(text):
    """Return ``text`` with every ASCII punctuation character removed."""
    punctuation = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation)

# Lookup table from a lowercase English word to its gloss label.
# Note that "thank" and "thanks" share the single gloss "THANK".
# The "can" entry appears once; a repeated key in a dict literal silently
# overwrites the earlier one and is flagged by linters.
sign_gloss_dict = {
    "help": "HELP",
    "you": "YOU",
    "how": "HOW",
    "can": "CAN",
    "i": "I",
    "me": "ME",
    "today": "TODAY",
    "thank": "THANK",
    "thanks": "THANK",
    "please": "PLEASE",
    "hello": "HELLO",
    "goodbye": "GOODBYE",
    "yes": "YES",
    "no": "NO",
    "sure": "SURE",
    "do": "DO",
}

def get_synonym(word):
    """Return the lower-cased first lemma name of ``word``'s first WordNet synset.

    When WordNet has no synset for ``word`` the word is returned unchanged.
    """
    matches = wn.synsets(word)
    if not matches:
        return word
    first_lemma = matches[0].lemmas()[0]
    return first_lemma.name().lower()

def text_to_gloss(text):
    """Convert a sentence into a list of gloss labels, one per word.

    Punctuation is stripped and the text lower-cased before it is split on
    whitespace. A word found in ``sign_gloss_dict`` uses its mapped gloss.
    Otherwise the word's WordNet synonym is looked up in the dictionary, and
    if that also misses, the original word is upper-cased.
    """
    gloss_list = []
    for token in clean_text(text).lower().split():
        label = sign_gloss_dict.get(token)
        if label is None:
            # No direct entry: try the synonym, else fall back to the word.
            label = sign_gloss_dict.get(get_synonym(token), token.upper())
        gloss_list.append(label)
    return gloss_list

def create_dummy_clip_pil(gloss, duration=2, size=(320, 240), bgcolor=(50, 150, 250)):
    """Render ``gloss`` as centred white text and save it as a short MP4.

    The clip is written to ``./sign_videos/<gloss>.mp4``; the directory is
    created when it is missing.

    Args:
        gloss: Text to draw; also used as the output file name.
        duration: Clip length handed to ``set_duration``.
        size: ``(width, height)`` of the frame in pixels.
        bgcolor: RGB background colour of the frame.
    """
    img = Image.new('RGB', size, color=bgcolor)
    draw = ImageDraw.Draw(img)

    # Start at the original 70 pt and step down until the text fits the frame
    # width, so long glosses are not cut off at the edges.
    margin = 10
    for font_size in (70, 60, 50, 40, 30, 20):
        try:
            font = ImageFont.truetype("DejaVuSans-Bold.ttf", font_size)
        except OSError:
            # Font file could not be opened: use Pillow's built-in font.
            font = ImageFont.load_default()
        left, top, right, bottom = draw.textbbox((0, 0), gloss, font=font)
        if right - left <= size[0] - 2 * margin:
            break

    # The bounding box does not necessarily start at (0, 0); subtract its
    # offset so the visible glyphs, not the text origin, end up centred.
    text_x = (size[0] - (right - left)) / 2 - left
    text_y = (size[1] - (bottom - top)) / 2 - top
    draw.text((text_x, text_y), gloss, font=font, fill=(255, 255, 255))

    clip = ImageClip(np.array(img)).set_duration(duration)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("sign_videos", exist_ok=True)

    output_path = f"./sign_videos/{gloss}.mp4"
    clip.write_videofile(output_path, codec="libx264", fps=24, audio=False, verbose=False, logger=None)
    print(f"Dummy clip created: {output_path}")

def create_dummy_clips_for_glosses(glosses):
    """Create one placeholder clip per distinct gloss in ``glosses``."""
    unique_glosses = set(glosses)  # duplicates would rewrite the same file
    for label in unique_glosses:
        create_dummy_clip_pil(label)

def create_sign_video(glosses, output_filename="output_sign_language.mp4"):
    """Concatenate the per-gloss clips into one video file.

    For each gloss, ``./sign_videos/<gloss>.mp4`` is loaded when it exists;
    missing clips are reported and skipped.

    Args:
        glosses: Gloss labels in the order they should appear.
        output_filename: Path of the combined video to write.

    Returns:
        ``output_filename`` on success, or ``None`` when no clip was found.
    """
    clips = []
    final_clip = None
    try:
        for gloss in glosses:
            video_path = f"./sign_videos/{gloss}.mp4"
            if os.path.exists(video_path):
                clips.append(VideoFileClip(video_path))
            else:
                print(f"Warning: Video clip for '{gloss}' not found.")
        if not clips:
            print("No video clips found.")
            return None
        final_clip = concatenate_videoclips(clips, method="compose")
        final_clip.write_videofile(output_filename, codec="libx264", fps=24, audio=False)
        return output_filename
    finally:
        # Each VideoFileClip keeps a reader open until it is closed; release
        # them even when loading or writing fails part-way.
        if final_clip is not None:
            final_clip.close()
        for clip in clips:
            clip.close()


# Interactive run: read a sentence, generate a reply, and sign the reply.
user_input = input("Enter any sentence: ")
print("User input:", user_input)

# The refined input is only printed; the reply below is generated from the
# raw user input, not from this refined text.
refined_input = refine_text_local(user_input)
print("Refined input for sign clarity:", refined_input)

# DialoGPT produces the conversational reply.
response = generate_response(user_input)
print("AI Response:", response)

# The reply goes through the same GPT-2 "refine" prompt before glossing.
refined_response = refine_text_local(response)
print("Refined response:", refined_response)

# Map the refined reply to gloss labels, one per word.
glosses = text_to_gloss(refined_response)
print("Sign language glosses:", glosses)

# Write a text-only placeholder clip for every distinct gloss.
create_dummy_clips_for_glosses(glosses)

# Stitch the clips together; the result is None when no clip exists.
output_video = create_sign_video(glosses)
if output_video:
    print(f"Final video saved to: {output_video}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 36 not upgraded.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Device set to use cpu


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Enter any sentence: "What's something you're really good at


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


User input: "What's something you're really good at


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Refined input for sign clarity: "What's something you're really good


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


AI Response: What's a good trick?
Refined response: The word "refined" is a word that is used to describe a change in the meaning of a word. It is used to describe a change in the meaning of a word.
Sign language glosses: ['THE', 'WORD', 'REFINED', 'IS', 'A', 'WORD', 'THAT', 'IS', 'USED', 'TO', 'DESCRIBE', 'A', 'CHANGE', 'IN', 'THE', 'MEANING', 'OF', 'A', 'WORD', 'IT', 'IS', 'USED', 'TO', 'DESCRIBE', 'A', 'CHANGE', 'IN', 'THE', 'MEANING', 'OF', 'A', 'WORD']
Dummy clip created: ./sign_videos/IN.mp4
Dummy clip created: ./sign_videos/WORD.mp4
Dummy clip created: ./sign_videos/TO.mp4
Dummy clip created: ./sign_videos/IS.mp4
Dummy clip created: ./sign_videos/A.mp4
Dummy clip created: ./sign_videos/USED.mp4
Dummy clip created: ./sign_videos/MEANING.mp4
Dummy clip created: ./sign_videos/THAT.mp4
Dummy clip created: ./sign_videos/OF.mp4
Dummy clip created: ./sign_videos/REFINED.mp4
Dummy clip created: ./sign_videos/CHANGE.mp4
Dummy clip created: ./sign_videos/THE.mp4
Dummy clip created: ./sign

                                                                 

Moviepy - Done !
Moviepy - video ready output_sign_language.mp4
Final video saved to: output_sign_language.mp4




User enters a question (e.g., "Can you help me today?")

AI generates a response (e.g., "Sure, I can help you.")

Response is simplified and glossed word by word (e.g., SURE I CAN HELP YOU)

Dummy video clips are created and stitched together into a sign-style response video.
