<a href="https://colab.research.google.com/github/RushikuwarRK/Video-Downloader/blob/main/python_code_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('text_to_python_code_dataset.csv')

# Tokenize the descriptions and code
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(df['Description'].tolist() + df['Code'].tolist())

# Convert text to sequences
description_sequences = tokenizer.texts_to_sequences(df['Description'])
code_sequences = tokenizer.texts_to_sequences(df['Code'])

# Pad the sequences
max_length_desc = max([len(seq) for seq in description_sequences])
max_length_code = max([len(seq) for seq in code_sequences])

description_padded = pad_sequences(description_sequences, maxlen=max_length_desc, padding='post')
code_padded = pad_sequences(code_sequences, maxlen=max_length_code, padding='post')

# Split the data into training and testing sets
desc_train, desc_test, code_train, code_test = train_test_split(description_padded, code_padded, test_size=0.2)


In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Define the model
embedding_dim = 256
latent_dim = 512
vocab_size = len(tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(max_length_desc,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_length_code,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Display model summary
model.summary()


In [7]:
import numpy as np

# Prepare decoder input data by shifting the target sequence by one time step
code_train_input = code_train[:, :-1]
code_train_target = code_train[:, 1:]

# Ensure the shapes are correct
code_train_input = pad_sequences(code_train_input, maxlen=max_length_code - 1, padding='post')
code_train_target = pad_sequences(code_train_target, maxlen=max_length_code - 1, padding='post')

# Expand the target sequences to match the expected shape for the loss function
code_train_target = np.expand_dims(code_train_target, -1)

# Train the model
batch_size = 64
epochs = 50

model.fit([desc_train, code_train], code_train_target,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)



Epoch 1/50


ValueError: Arguments `target` and `output` must have the same shape up until the last dimension: target.shape=(None, 3), output.shape=(None, 4, 34)

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('text_to_python_code_dataset.csv')

# Tokenize the descriptions and code
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(df['Description'].tolist() + df['Code'].tolist())

# Convert text to sequences
description_sequences = tokenizer.texts_to_sequences(df['Description'])
code_sequences = tokenizer.texts_to_sequences(df['Code'])

# Pad the sequences
max_length_desc = max([len(seq) for seq in description_sequences])
max_length_code = max([len(seq) for seq in code_sequences]) + 1  # Add 1 for start/end token

description_padded = pad_sequences(description_sequences, maxlen=max_length_desc, padding='post')
code_padded = pad_sequences(code_sequences, maxlen=max_length_code, padding='post')

# Split the data into training and testing sets
desc_train, desc_test, code_train, code_test = train_test_split(description_padded, code_padded, test_size=0.2)

# Define the model
embedding_dim = 256
latent_dim = 512
vocab_size = len(tokenizer.word_index) + 1

# Encoder
encoder_inputs = Input(shape=(max_length_desc,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_length_code,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare decoder input data by shifting the target sequence by one time step
code_train_input = code_train[:, :-1]
code_train_target = code_train[:, 1:]

# Ensure the shapes are correct
code_train_input = pad_sequences(code_train_input, maxlen=max_length_code, padding='post')
code_train_target = pad_sequences(code_train_target, maxlen=max_length_code, padding='post')

# Expand the target sequences to match the expected shape for the loss function
code_train_target = np.expand_dims(code_train_target, -1)

# Train the model
batch_size = 64
epochs = 50

history = model.fit(
    [desc_train, code_train_input],
    code_train_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 3.5224 - val_loss: 3.4448
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 685ms/step - loss: 3.4192 - val_loss: 3.3394
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - loss: 3.2770 - val_loss: 3.1537
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 585ms/step - loss: 3.0167 - val_loss: 2.7994
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - loss: 2.5012 - val_loss: 2.1444
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step - loss: 1.7376 - val_loss: 1.5523
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 322ms/step - loss: 1.5059 - val_loss: 1.6045
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 598ms/step - loss: 1.6940 - val_loss: 1.5469
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [9]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [10]:
# Define the encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Define the decoder model for inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Use a new variable name for the embedding layer in the decoder inference model
decoder_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
decoder_embedding2 = decoder_embedding_layer(decoder_inputs)  # Apply the embedding layer

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding2, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start token
    target_seq[0, 0] = 1  # Assuming 1 is the index for the start token

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tokenizer.index_word.get(sampled_token_index, '')

        decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find stop token
        if sampled_token == 'endtoken' or len(decoded_sentence) > max_length_code:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Get user input
user_input = input("Describe the Python code you need: ")

# Preprocess user input
user_input_seq = tokenizer.texts_to_sequences([user_input])
user_input_padded = pad_sequences(user_input_seq, maxlen=max_length_desc, padding='post')

# Generate Python code
generated_code = decode_sequence(user_input_padded)
print("Generated Python Code:", generated_code)


Describe the Python code you need: display hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step
Generated Python Code:  world")
