<a href="https://colab.research.google.com/github/Tullsokk/text-to-movie/blob/main/Text_to_movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to text-to-movie

This notebook allows you to make an AI-generated movie from a single prompt. Make sure to change the Runtime to GPU. The free tier should give enough juice for a movie or two; after that you might have to pay for extra compute units. You need API tokens for:

*   openAI
*   elevenlabs

Do not share your tokens! The final movie is saved under `/content` and can be downloaded, or you can mount your Google Drive and save it there.

In [None]:
#@title Install dependencies
#Installing imagemagick
!apt install imagemagick

!sudo apt-get install libportaudio2
#Installing pip packages
!pip install openai elevenlabs diffusers transformers accelerate pytorch-lightning git+https://github.com/huggingface/diffusers

#You may have to remove or comment out the line   <policy domain="path" rights="none" pattern="@*"/>
# in the file /etc/ImageMagick-6/policy.xml for subtitles to work

#removes a path policy from imagemagick that causes problems

xml_string = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE policymap [
  <!ELEMENT policymap (policy)+>
  <!ATTLIST policymap xmlns CDATA #FIXED ''>
  <!ELEMENT policy EMPTY>
  <!ATTLIST policy xmlns CDATA #FIXED '' domain NMTOKEN #REQUIRED
    name NMTOKEN #IMPLIED pattern CDATA #IMPLIED rights NMTOKEN #IMPLIED
    stealth NMTOKEN #IMPLIED value CDATA #IMPLIED>
]>
<!--
  Configure ImageMagick policies.

  Domains include system, delegate, coder, filter, path, or resource.

  Rights include none, read, write, execute and all.  Use | to combine them,
  for example: "read | write" to permit read from, or write to, a path.

  Use a glob expression as a pattern.

  Suppose we do not want users to process MPEG video images:

    <policy domain="delegate" rights="none" pattern="mpeg:decode" />

  Here we do not want users reading images from HTTP:

    <policy domain="coder" rights="none" pattern="HTTP" />

  The /repository file system is restricted to read only.  We use a glob
  expression to match all paths that start with /repository:

    <policy domain="path" rights="read" pattern="/repository/*" />

  Lets prevent users from executing any image filters:

    <policy domain="filter" rights="none" pattern="*" />

  Any large image is cached to disk rather than memory:

    <policy domain="resource" name="area" value="1GP"/>

  Define arguments for the memory, map, area, width, height and disk resources
  with SI prefixes (.e.g 100MB).  In addition, resource policies are maximums
  for each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
  exceeds policy maximum so memory limit is 1GB).

  Rules are processed in order.  Here we want to restrict ImageMagick to only
  read or write a small subset of proven web-safe image types:

    <policy domain="delegate" rights="none" pattern="*" />
    <policy domain="filter" rights="none" pattern="*" />
    <policy domain="coder" rights="none" pattern="*" />
    <policy domain="coder" rights="read|write" pattern="{GIF,JPEG,PNG,WEBP}" />
-->
<policymap>
  <!-- <policy domain="system" name="shred" value="2"/> -->
  <!-- <policy domain="system" name="precision" value="6"/> -->
  <!-- <policy domain="system" name="memory-map" value="anonymous"/> -->
  <!-- <policy domain="system" name="max-memory-request" value="256MiB"/> -->
  <!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
  <policy domain="resource" name="memory" value="256MiB"/>
  <policy domain="resource" name="map" value="512MiB"/>
  <policy domain="resource" name="width" value="16KP"/>
  <policy domain="resource" name="height" value="16KP"/>
  <!-- <policy domain="resource" name="list-length" value="128"/> -->
  <policy domain="resource" name="area" value="128MB"/>
  <policy domain="resource" name="disk" value="1GiB"/>
  <!-- <policy domain="resource" name="file" value="768"/> -->
  <!-- <policy domain="resource" name="thread" value="4"/> -->
  <!-- <policy domain="resource" name="throttle" value="0"/> -->
  <!-- <policy domain="resource" name="time" value="3600"/> -->
  <!-- <policy domain="coder" rights="none" pattern="MVG" /> -->
  <!-- <policy domain="module" rights="none" pattern="{PS,PDF,XPS}" /> -->
  <!-- <policy domain="delegate" rights="none" pattern="HTTPS" /> -->
  <!-- <policy domain="path" rights="none" pattern="@*" /> -->
  <!-- <policy domain="cache" name="memory-map" value="anonymous"/> -->
  <!-- <policy domain="cache" name="synchronize" value="True"/> -->
  <!-- <policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/> -->
  <!-- <policy domain="system" name="pixel-cache-memory" value="anonymous"/> -->
  <!-- <policy domain="system" name="shred" value="2"/> -->
  <!-- <policy domain="system" name="precision" value="6"/> -->
  <!-- not needed due to the need to use explicitly by mvg: -->
  <!-- <policy domain="delegate" rights="none" pattern="MVG" /> -->
  <!-- use curl -->
  <policy domain="delegate" rights="none" pattern="URL" />
  <policy domain="delegate" rights="none" pattern="HTTPS" />
  <policy domain="delegate" rights="none" pattern="HTTP" />
  <!-- in order to avoid to get image with password text -->
  <!-- disable ghostscript format types -->
  <policy domain="coder" rights="none" pattern="PS" />
  <policy domain="coder" rights="none" pattern="PS2" />
  <policy domain="coder" rights="none" pattern="PS3" />
  <policy domain="coder" rights="none" pattern="EPS" />
  <policy domain="coder" rights="none" pattern="PDF" />
  <policy domain="coder" rights="none" pattern="XPS" />
</policymap>"""

with open("/etc/ImageMagick-6/policy.xml", "w") as f:
    f.write(xml_string)

In [None]:
#@title Import packages
import cv2, openai, os, json, moviepy, string, torch, urllib.request, soundfile as sf, wave, numpy as np, nltk, math, scipy, shutil, pathlib, transformers, gc

from moviepy.editor import *
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.tools.subtitles import SubtitlesClip
from elevenlabs import voices, generate, set_api_key
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video
from diffusers import MusicLDMPipeline
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from huggingface_hub import snapshot_download




In [None]:
#@title Insert your openai and elevelabs api tokens.
# Credentials entered through the Colab form; both default to empty strings.
# openai_key is assigned to openai.api_key inside generate_movie_script.
openai_key = ''#@param {type:"string"}
# elevenlabstoken is handed to elevenlabs' set_api_key in the "Create movie" cell.
elevenlabstoken = ""#@param {type:"string"}



In [None]:
#@title set up pipelines. Chose modelscope or zeroscope (better resolution)
model = "Zeroscope" #@param ["Modelscope", "Zeroscope"]

# Text-to-video pipeline: load the checkpoint that matches the form selection.
if model == 'Modelscope':
  pipe = DiffusionPipeline.from_pretrained(
      "damo-vilab/text-to-video-ms-1.7b",
      torch_dtype=torch.float16,
      variant="fp16",
  )
elif model == 'Zeroscope':
  pipe = DiffusionPipeline.from_pretrained(
      "cerspense/zeroscope_v2_576w",
      torch_dtype=torch.float16,
  )

# Swap in the DPM-Solver multistep scheduler, built from the current scheduler's config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# GPU memory optimizations for the video pipeline.
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()

# Text-to-music pipeline, placed on the GPU.
repo_id = "cvssp/musicldm"
music_pipe = MusicLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16).to("cuda")

# Tokenizer data used by nltk.sent_tokenize when splitting scene descriptions.
nltk.download('punkt')

In [None]:
#@title Define functions

#returns list of cast
def persons(movie_script):
    """Return the distinct speaker names of a movie script, in order of first appearance.

    Args:
        movie_script: JSON text holding a list of scenes; every scene must
            have a "dialogue" list whose items carry a "person" key.

    Returns:
        A list of the "person" values with duplicates removed.
    """
    scenes = json.loads(movie_script)
    # A dict keeps insertion order, so it doubles as an ordered set of names.
    cast = {}
    for scene in scenes:
        for entry in scene["dialogue"]:
            cast.setdefault(entry['person'], None)
    return list(cast)

# Sends the elevenlabs list of voice actors and casts the characters. It tends to do a good job with both age, gender and description,
def casting(persons):
  """Ask GPT-4 to assign one ElevenLabs voice to each character.

  Args:
      persons: list of character names; sent to the model as str(persons).

  Returns:
      The model's reply text, requested in the form
      [{"character": ..., "actor": ...}]. The reply is returned as-is and is
      not validated here.

  Reads the module-level `voices_json` (the trimmed per-voice summary built
  before this function is called) rather than the raw `all_voices` objects,
  so the prompt carries only the name and label fields and spends fewer tokens.
  """
  response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": 'Here is a list of descriptions of characters. ' + str(voices_json)  + ' I will give you a list of characters names. I want you to assign a suitable voice actor to each of the characters and return only text in a json format: [{"character":"Han Solo", "actor": "Clyde"}]'},
                {"role": "user", "content": str(persons)},
            ]
        )
  message = response.choices[0]["message"]["content"]
  return message

#Function for creating scene from prompt. default is 8 frames per second, 40 frames = five seconds
#def createScene(prompt, duration):
#  video_frames = pipe(prompt, num_inference_steps=25, num_frames=duration).frames
#  # convert to video
#  video_path = export_to_video(video_frames)
#  return video_path

#def createScene(prompt, duration):
#  video_frames = pipe(prompt, num_inference_steps=25, height=320, width=576, num_frames=duration).frames
#  # convert to video
#  video_path = export_to_video(video_frames)
#  torch.cuda.empty_cache()
#
#  # Collect garbage
#  gc.collect()
#  return video_path
#
#  return video_path

def createScene(prompt, duration):
    """Render a video clip for `prompt`, shrinking it until generation succeeds.

    Args:
        prompt: text prompt passed to the video pipeline.
        duration: requested number of frames (the notebook treats 8 frames
            as one second).

    Returns:
        Path of the video file produced by export_to_video.

    Raises:
        RuntimeError: if generation still fails once the frame count has been
            reduced below one frame.
    """
    last_error = None
    while True:
        # Without this guard a duration of 0 would be retried forever,
        # because int(0 * 0.9) stays 0.
        if duration < 1:
            raise RuntimeError('could not create a clip for prompt: ' + str(prompt)) from last_error
        print('Trying to create clip with duration: ' + str(duration))
        try:
            video_frames = pipe(prompt, num_inference_steps=25, height=320, width=576, num_frames=duration).frames
            break
        except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell
            last_error = err
            print('failed at duration : ' + str(duration) + '. reducing duration by 10 percent.')
            print('reason: ' + repr(err))
            duration = int(duration * 0.9)
            # Free whatever the failed attempt left behind before retrying.
            torch.cuda.empty_cache()
            gc.collect()
    # convert to video
    video_path = export_to_video(video_frames)
    # Collect garbage
    torch.cuda.empty_cache()
    gc.collect()
    return video_path


# Generate an audio object from the voice name and the text
def generateSpeech(line, voice, output):
  """Synthesize `line` with `voice` and store the result as `<output>.wav`.

  Args:
      line: text to speak.
      voice: voice passed through to elevenlabs' generate().
      output: file name without extension; ".wav" is appended.

  Returns:
      None. The bytes returned by generate() are written to disk unchanged.
  """
  speech = generate(text=line, voice=voice)
  target = output + ".wav"
  with open(target, "wb") as handle:
    handle.write(speech)

# The model I use can sometimes return just a noisy pattern.
# The following tries to remedy this by checking if the returned video is just noise
# If too much noise, it retries up to n times
def is_video_noisy(video_path,threshold):
    """Decide whether a video is mostly noise from its mean Laplacian variance.

    Each frame is converted to grayscale, the variance of its Laplacian is
    computed, and those variances are averaged over all frames read.

    Args:
        video_path: path of the video file to inspect.
        threshold: the video counts as noisy when the average is strictly
            greater than this value.

    Returns:
        A tuple (is_noisy, avg_laplacian). When no frame can be read the
        average is 0.0, so the video is reported as not noisy.
    """
    cap = cv2.VideoCapture(video_path)
    total = 0.0
    frame_count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            total += cv2.Laplacian(gray, cv2.CV_64F).var()
            frame_count += 1
    finally:
        # Release the capture handle even if a frame fails to process.
        cap.release()

    # Divide by the number of frames actually read; starting the counter at 1
    # would divide by frames + 1.
    avg_laplacian = total / frame_count if frame_count else 0.0
    print("avg laplacian:" + str(avg_laplacian))
    return bool(avg_laplacian > threshold), avg_laplacian

def test_for_noise(line, max_tries, duration, noise_threshold):
  """Generate a clip for `line`, retrying while the result looks like noise.

  Args:
      line: prompt handed to createScene.
      max_tries: maximum number of generation attempts.
      duration: frame count handed to createScene.
      noise_threshold: threshold handed to is_video_noisy.

  Returns:
      The clip with the lowest average Laplacian seen so far. It is returned
      as soon as an attempt passes the noise check, otherwise after the last
      attempt. None when max_tries is 0.

  Temporary files are named from the module-level `clipname`.
  """
  best_score = float('inf')
  best_clip = None
  for attempt in range(max_tries):
    print(f"try number {attempt}")
    scene_path = createScene(line, duration)
    clip = VideoFileClip(scene_path)
    tmp_name = clipname + "_tmp" + str(attempt) + ".mp4"
    clip.write_videofile(tmp_name, fps=clip.fps)
    too_noisy, score = is_video_noisy(tmp_name, noise_threshold)
    if score < best_score:
      best_score, best_clip = score, clip
    if not too_noisy:
      print("video passed noise threshold!")
      return best_clip
    print("video is too noisy, trying again")
    if attempt == max_tries - 1:
      print("Was not able to create a clip in " + str(max_tries) + " attempts")

  return best_clip

# Function for generating music
def generateMusic(music,length,scene):
  """Generate music for one scene and save it as `Music_<scene>.wav`.

  Args:
      music: text prompt for the music pipeline.
      length: requested audio length in seconds.
      scene: scene identifier used in the output file name.

  Returns:
      None. The first generated audio is written at a 16000 Hz sample rate.
  """
  result = music_pipe(music, num_inference_steps=200, audio_length_in_s=length)
  waveform = result.audios[0]
  out_path = "Music_" + str(scene) + ".wav"
  scipy.io.wavfile.write(out_path, rate=16000, data=waveform)

def generateSilence(silence_duration):
  """Write a silent mono 16-bit 44.1 kHz file and return it as an audio clip.

  Args:
      silence_duration: length of the silence in seconds.

  Returns:
      An AudioFileClip opened on "silence.wav". The file is overwritten on
      every call.
  """
  filename = "silence.wav"
  framerate = 44100
  nframes = int(framerate * silence_duration)

  with wave.open(filename, "w") as wav_out:
      wav_out.setnchannels(1)   # mono
      wav_out.setsampwidth(2)   # 2 bytes per sample, matching the int16 data below
      wav_out.setframerate(framerate)
      wav_out.setnframes(nframes)

      # All-zero samples
      samples = np.zeros(nframes, dtype=np.int16)
      wav_out.writeframes(samples.tobytes())
  return AudioFileClip("silence.wav")



In [None]:
#@title Create movie
#@markdown Enter a desired movie title.
movietitle =  "Game of Thrones - The Kraken returns" #@param {type:"string"}
video_model = 'zeroscope'#@param {type:"string"}
#@markdown To get good results, use a description of familiar casts for the movie. A generic John doe will be rendered differently each scene, but Luke Skywalker will be more consistent.
plot = "A familiar cast of characters from the game of thrones universe reconcile their differences to assemble a great army and defeat the Kraken"#@param {type:"string"}

#@markdown Generating a 5 scene movie can take about half an hour on a T4 GPU, or 6 minutes on an A100, and give a 1-2 minute movie depending on the plot. I have not tried much longer movies

scene_count = 5#@param {type:"number"}
# User message for the script request. The separators keep the title from
# running straight into the surrounding words ("...movie isGame of ...returnsThe plot...").
prompt = "The title of the movie is " + movietitle + ". The plot of the movie: " + plot
# System message: describes the required nested JSON layout and the minimum scene count.
systemprompt = 'I want you to write a movie script for a short movie. You should provide a funny, comedic script that is original and surprising. I want you to answer in a nested json format. Always use the whole name, e.g. Jon Snow, never just Jon, Snow or he, to describe the scene. The movie should have at least ' + str(scene_count) + ' scenes. Describe in detail what happens in each scene. Each dialogue item should only contain one sentance. If a character says several sentences in a row, each line should be its own item. Desired output is this json format:[{"scene": "description of the scene","music": "description of the music","dialogue": [{"person": "Person A","line":"The first line person A says"},{"person":"Person A","line":"The second line Person A says"}, {"person": ...}]},{"scene": ...}]'
# Register the ElevenLabs token before any voice call is made.
set_api_key(elevenlabstoken)

# Adjust the systemprompt text if you wish to experiment with the format.
# Sometimes the response is not properly formatted.
# I implemented a loop that sends the answer back if it is not proper JSON; it runs until the reply is valid, or gives up after 10 tries.
# This version uses GPT-4. GPT-3.5-Turbo is faster and might provide sufficient results.
# This was originally developed using the davinci-002 model (GPT-3).
# One issue is that a scene description might reference "he" or "she" instead of the actual character name.
# This causes issues for generating video. I have tried instructing GPT not to do this, but it still does.
# This can be fixed by further system prompting, or by implementing a check for he/she/they that returns the prompt and asks GPT to replace them with the names, but this is not yet implemented.

def generate_movie_script(prompt, counter=0):
    """Ask GPT-4 for a movie script and return the reply once it parses as JSON.

    Args:
        prompt: user message describing the movie. When a reply is not valid
            JSON, a request to reformat is appended before the next attempt.
        counter: number of attempts already used (default 0).

    Returns:
        The raw reply text on success, or the fixed apology string once 10
        attempts have been used. Callers receive a string in both cases.

    Retries run in a loop rather than by recursion. The limit check uses >=
    so that a starting counter above the limit also terminates.
    """
    failure_message = "Sorry, I could not generate a valid response after 10 attempts."
    attempt = counter
    while True:
        print("counter: " + str(attempt))
        if attempt >= 10:
            print(failure_message)
            return failure_message
        openai.api_key = openai_key
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "user", "content": prompt},
                {"role": "system", "content": systemprompt},
            ],
        )
        message = response.choices[0]["message"]["content"]
        print(message)
        if message:
            try:
                # Parsed only to validate; callers get the raw text.
                json.loads(message)
            except ValueError:
                print('reply not valid json, asking gpt to try again')
                prompt += "\nThe reply you gave was not valid JSON. Can you please format the response as I have instructed you?"
            else:
                return message
        # An empty reply is retried with the prompt unchanged.
        attempt += 1

movie_script = generate_movie_script(prompt)

#get all persons from script
# Kept under its own name: assigning the list back to `persons` would replace
# the persons() function and make a second run of this cell fail.
cast_members = persons(movie_script)
print(cast_members)

# Get a list of all premade voices from elevenlabs - along with descriptions of the voices
all_voices = voices()

# Compact per-voice summary (name plus a few labels), serialized to JSON.
# Every label is optional here: a voice without e.g. an "accent" label gets an
# empty string instead of raising KeyError.
voices_list = []
for voice in all_voices:
  labels = voice.labels or {}
  voices_list.append({
      "name": str(voice.name),
      "accent": str(labels.get("accent", "")),
      "description": str(labels.get("description", "")),
      "age": str(labels.get("age", "")),
      "gender": str(labels.get("gender", "")),
      "use_case": str(labels.get("use case", "")),
  })
voices_json = json.dumps(voices_list)
#print(voices_json)

# Sends the elevenlabs list of voice actors and casts the characters. It tends to do a good job with both age, gender and description,
cast = casting(cast_members)

print(cast)

#reformats the response for later use: character name -> voice name
json_object = json.loads(cast)
voicematch = {item["character"]:item["actor"] for item in json_object}
print(voicematch)

# Main render loop: for every scene in the script, create one silent video clip
# per sentence of the scene description, one voiced clip per dialogue line, and
# one music file per scene. Finished clips are collected in rendered_scenes.

#@markdown Noise threshold determines the threshold for re-trying video generation.
noise_threshold = 800#@param {type:"number"}

#@markdown Max_retries determies the maximum number of retries before moving on to the next scene
max_retries = 5#@param {type:"number"}
# Factory handed to SubtitlesClip: turns a subtitle string into a TextClip.
generator = lambda txt: TextClip(txt, font='Georgia-Regular', size = (1800,100), fontsize=24, color='white', method='caption')
data=json.loads(movie_script)
scenenumber = 0
clips = {}            # clip name -> prompt used to generate it
scene_cuts = {}       # not written to in this cell
scene_lengths = {}    # scene number -> scene length in frames
dialouges = []        # not written to in this cell
soundfiles = []       # not written to in this cell
musicfiles = []       # "Music_<n>" base names, one per scene with a music entry
rendered_scenes = []  # finished clips in playback order
movielength = 0       # running total in frames
for item in data:
  # Each item is one scene dict. The "music" branch below reads the
  # scene_duration computed by the "scene" branch, so the "scene" key has to
  # come before the "music" key in the dict.
  for key, value in item.items():
    substring = "scene"
    if substring.lower() in key.lower():
      scenenumber += 1
      scene_duration = 0
      cuts = 0
      #print(value)
      # One video clip per sentence of the scene description.
      lines = nltk.sent_tokenize(value)
      line_count = len(lines)
      print("scene" + str(scenenumber) + " has " + str(line_count) + " clips")
      print("scene is " + str(line_count*5) + " seconds long")
      for line in lines:
        if len(line) > 0:
          cuts += 1
          # clipname is also read as a global by test_for_noise.
          clipname = f"scene_{scenenumber}_" + f"cut_{cuts}"
          prompt = line
          clips.update({clipname:prompt})
          print("creating video: " + line)
          #40 frames, 8 fps, = 5 sek
          duration = 40

          video=test_for_noise(line,max_retries, duration, noise_threshold)
          video = video.resize(height=1080)
          # Show the sentence as a caption for the whole clip (frames / 8 fps = seconds).
          start_time = 0
          end_time = duration/8
          sub = [((start_time, end_time),line)]
          captions = SubtitlesClip(sub, generator)
          final = CompositeVideoClip([video, captions.set_pos(('center','top'))])
          final.write_videofile(clipname + ".mp4", fps=video.fps)
          rendered_scenes.append(final)
          movielength += duration
          scene_duration += duration

      # Dialogue clips for this scene are rendered here, inside the "scene" branch.
      for dialogue_item in item["dialogue"]:
          # NOTE(review): key and value are the loop variables of the enclosing
          # `for key, value in item.items()`. After this loop, the "music" check
          # further down in the same iteration sees the last speaker's name in
          # `key` and the last spoken line in `value`.
          cuts += 1
          key = dialogue_item['person']
          value = dialogue_item['line']
          line = key + ' : ' + value
          voice = voicematch[key]
          print("person " +  key + " will use the voice " + voice + " to say " + value)
          clipname = f"scene_{scenenumber}_" + f"cut_{cuts}"

          # Speech is written to <clipname>.wav, re-encoded to <clipname>_s.wav,
          # and measured in seconds.
          generateSpeech(value, voice, clipname)
          soundfile=AudioFileClip(clipname + ".wav")
          soundfile.write_audiofile(clipname + "_s.wav")
          # NOTE(review): this SoundFile handle is never closed.
          f = sf.SoundFile(clipname + "_s.wav")
          cliplength = (len(f)/f.samplerate)

          # Pad the speech with silence on both sides.
          # NOTE(review): cliplength is in seconds here, while 40 is the frame
          # count used for narration clips; confirm the intended unit.
          soundclips = []
          if cliplength <= 40:
            silence = generateSilence(0.1)
          elif cliplength > 40:
            silence = generateSilence(1)

          soundclips.append(silence)
          soundclips.append(soundfile)
          soundclips.append(silence)
          combined = concatenate_audioclips(soundclips)
          combined.write_audiofile(clipname + "_s.wav")

          # Re-measure the padded file to size the video clip.
          f = sf.SoundFile(clipname + "_s.wav")
          cliplength = (len(f)/f.samplerate)

          # Seconds -> frames at 8 fps, rounded up.
          duration = math.ceil(cliplength*8)
          print ("dialouge is " + str(cliplength) + " seconds and " + str(duration)  + " frames long")
          clipname = f"scene_{scenenumber}_" + f"cut_{cuts}"
          prompt = f"midshot of {key} talking"
          clips.update({clipname:prompt})
          print("creating video "+ clipname + ". with prompt: " + prompt)
          video=test_for_noise(prompt,max_retries, duration, noise_threshold)

          # NOTE(review): the audio attached here is the unpadded <clipname>.wav,
          # while duration was derived from the padded <clipname>_s.wav; confirm
          # which file is intended.
          dialogue = AudioFileClip(clipname + ".wav")
          video = video.set_audio(dialogue)
          video = video.resize(height=1080)
          start_time = 0
          end_time = cliplength
          sub = [((start_time, end_time),line)]

          subtitles = SubtitlesClip(sub, generator)

          final = CompositeVideoClip([video, subtitles.set_pos(('center','bottom'))])
          final.write_videofile(clipname + ".mp4", fps=video.fps)

          rendered_scenes.append(final)
          movielength += duration
          scene_duration += duration
    substring_ = "music"
    if substring_.lower() in key.lower():
      # Music length in seconds = scene length in frames / 8 fps.
      length = scene_duration/8
      print("Scene " + str(scenenumber) + " is " + str(scene_duration) + " frames long/" + str(length) + " seconds long" )
      print("The music prompt:" + value)
      generateMusic(value,length, scenenumber)
      musicfile = "Music_" + str(scenenumber)
      musicfiles.append(musicfile)


  # Recorded once per scene, after all of its keys have been processed.
  scene_lengths.update({scenenumber:scene_duration})


# Join the per-scene music files into a single soundtrack file.
music_append = [AudioFileClip(music + ".wav") for music in musicfiles]
combined = concatenate_audioclips(music_append)
combined.write_audiofile("soundtrack.wav")

# Concatenate the rendered clips and write an intermediate movie (dialogue audio only).
final = concatenate_videoclips(rendered_scenes)
final.write_videofile(f"{movietitle}_.mp4")

# Reload the intermediate movie and lay the soundtrack, scaled by 0.1, over its audio.
video = VideoFileClip(f"{movietitle}_.mp4")
soundtrack = AudioFileClip("soundtrack.wav")
soundtrack = soundtrack.volumex(0.1)
final_audio = CompositeAudioClip([video.audio, soundtrack])
final_video = video.set_audio(final_audio)
final_video.write_videofile(f"{movietitle}.mp4")


In [None]:
# Optional: mount Google Drive at /content/drive so the movie can be saved
# straight to Drive. Skip this cell if you just want to download the file.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Run this if you don't have the folder already
!mkdir -p "/content/drive/My Drive/Text-to-movie"

In [None]:
#@title copy movie to text-to-movie folder in google drive
# Source is the finished movie under /content; destination is the Text-to-movie folder on the mounted Drive.
shutil.copy(f"/content/{movietitle}.mp4", f"/content/drive/My Drive/Text-to-movie/{movietitle}.mp4")