In [None]:
!pip install transformers
!pip install taipy

import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle
from taipy.gui import Gui
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer



In [None]:
df = pd.read_csv("final.csv")

In [None]:
df.shape

(1150, 3)

In [None]:
df.head()

Unnamed: 0,Patterns,Target,Target_class
0,The column col1 has unique values.,df['col1'].nunique(),0
1,The column col1 has all unique values.,df['col1'].nunique(),0
2,"In the column col1, display all unique values.",df['col1'].nunique(),0
3,"In the column col1, display all the unique val...",df['col1'].nunique(),0
4,The column col1 contains all unique values.,df['col1'].nunique(),0


In [None]:
# Fixed seed so the row order, and therefore the train/validation/test split
# taken from it further down, is the same after every kernel restart.
df = shuffle(df, random_state=42)

In [None]:
df.head()

Unnamed: 0,Patterns,Target,Target_class
302,The dataframe is sorted by col1 and col2.,"df.sort_values(by=['col1', 'col2'],inplace =True)",2
772,Is there a list of col1 and col2.,"new_df=df.loc[:, ['col1','col2']]",5
585,The col1 column was dropped.,"df.drop(columns = 'col1',inplace = True)",4
484,Let's drop the col1 column.,"df.drop(columns = 'col1',inplace = True)",4
1037,You can display a bar chart of col1 and col2,<|{dataset}|chart|type=bar|x=col1|y=col2|heigh...,7


In [None]:
# Count the classes from Target_class, the integer label column the model is
# trained on, instead of from the Target code strings.
num_classes = df["Target_class"].nunique()
print(num_classes)

10


In [None]:
# 60/20/20 split by row position: the validation slice follows the training
# slice and the test slice takes every remaining row.
n_rows = df.shape[0]
train_size = int(n_rows * 0.6)
val_size = int(n_rows * 0.2)
test_size = int(n_rows * 0.2)

val_end = train_size + val_size
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:val_end]
test_df = df.iloc[val_end:]

# Sentences (Patterns) and integer labels (Target_class) as NumPy arrays.
train_sentences, train_labels = train_df["Patterns"].to_numpy(), train_df["Target_class"].to_numpy()
val_sentences, val_labels = val_df["Patterns"].to_numpy(), val_df["Target_class"].to_numpy()
test_sentences, test_labels = test_df["Patterns"].to_numpy(), test_df["Target_class"].to_numpy()

In [None]:
train_sentences.shape, val_sentences.shape, test_sentences.shape

((690,), (230,), (230,))

In [None]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(batch):
    """Tokenize the "text" entry of ``batch`` with padding and truncation enabled.

    NOTE(review): ``tokenize`` is defined again further down in this notebook,
    so this version is only in effect until that later cell runs.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
encoded_text = tokenizer(train_sentences[0], padding=True, truncation=True)
encoded_text

{'input_ids': [101, 1996, 2951, 15643, 2003, 19616, 2011, 8902, 2487, 1998, 8902, 2475, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
tokens

['[CLS]',
 'the',
 'data',
 '##frame',
 'is',
 'sorted',
 'by',
 'col',
 '##1',
 'and',
 'col',
 '##2',
 '.',
 '[SEP]']

In [None]:
text = tokenizer.convert_tokens_to_string(tokens)
text

'[CLS] the dataframe is sorted by col1 and col2. [SEP]'

In [None]:
def tokenize(batch):
    """Tokenize every element of ``batch`` on its own and return the encodings as a list.

    Each element goes through a separate tokenizer call (padding and truncation
    enabled), and the result list has the same length and order as ``batch``.
    """
    return [
        tokenizer(batch[idx], padding=True, truncation=True)
        for idx in range(len(batch))
    ]

In [None]:
# Tokenize the three splits in train / validation / test order.
train_encoded, val_encoded, test_encoded = map(
    tokenize, (train_sentences, val_sentences, test_sentences)
)

In [None]:
def sequences(tokenized_arr):
    """Return the ``input_ids`` of every encoding in ``tokenized_arr`` as a list."""
    return [tokenized_arr[pos].input_ids for pos in range(len(tokenized_arr))]

In [None]:
# Pull the token-id lists out of each split's encodings.
train_sequences, val_sequences, test_sequences = (
    sequences(encoded) for encoded in (train_encoded, val_encoded, test_encoded)
)

In [None]:
def sequencesToToken(sequences):
    """Map each list of token ids in ``sequences`` to its list of token strings."""
    return [
        tokenizer.convert_ids_to_tokens(sequences[pos])
        for pos in range(len(sequences))
    ]

In [None]:
train_tokens = sequencesToToken(train_sequences)

In [None]:
print(train_sentences[10:15])
print(train_sequences[10:15])
print(train_tokens[10:15])

['The dataframe should be sorted by col1 and col2.'
 'A line chart of col1 and col2.' 'Cut the col1 column in half.'
 'Drop the col1 column, please.' 'The dataframe can besorted by col1']
[[101, 1996, 2951, 15643, 2323, 2022, 19616, 2011, 8902, 2487, 1998, 8902, 2475, 1012, 102], [101, 1037, 2240, 3673, 1997, 8902, 2487, 1998, 8902, 2475, 1012, 102], [101, 3013, 1996, 8902, 2487, 5930, 1999, 2431, 1012, 102], [101, 4530, 1996, 8902, 2487, 5930, 1010, 3531, 1012, 102], [101, 1996, 2951, 15643, 2064, 2022, 21748, 3064, 2011, 8902, 2487, 102]]
[['[CLS]', 'the', 'data', '##frame', 'should', 'be', 'sorted', 'by', 'col', '##1', 'and', 'col', '##2', '.', '[SEP]'], ['[CLS]', 'a', 'line', 'chart', 'of', 'col', '##1', 'and', 'col', '##2', '.', '[SEP]'], ['[CLS]', 'cut', 'the', 'col', '##1', 'column', 'in', 'half', '.', '[SEP]'], ['[CLS]', 'drop', 'the', 'col', '##1', 'column', ',', 'please', '.', '[SEP]'], ['[CLS]', 'the', 'data', '##frame', 'can', 'be', '##sor', '##ted', 'by', 'col', '##1', '[S

In [None]:
max_length = 20

# Pad (or cut) every split at the end so each row has exactly max_length ids.
train_padded, val_padded, test_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding='post', truncating='post')
    for split_sequences in (train_sequences, val_sequences, test_sequences)
)
train_padded.shape, val_padded.shape, test_padded.shape

((690, 20), (230, 20), (230, 20))

In [None]:
train_padded[10:15]

array([[  101,  1996,  2951, 15643,  2323,  2022, 19616,  2011,  8902,
         2487,  1998,  8902,  2475,  1012,   102,     0,     0,     0,
            0,     0],
       [  101,  1037,  2240,  3673,  1997,  8902,  2487,  1998,  8902,
         2475,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,  3013,  1996,  8902,  2487,  5930,  1999,  2431,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,  4530,  1996,  8902,  2487,  5930,  1010,  3531,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,  1996,  2951, 15643,  2064,  2022, 21748,  3064,  2011,
         8902,  2487,   102,     0,     0,     0,     0,     0,     0,
            0,     0]])

In [None]:
# Embedding -> LSTM -> softmax classifier over the padded token-id sequences.
model = keras.models.Sequential()
# mask_zero=True: 0 is the value pad_sequences appended to short sentences, so
# the LSTM skips those trailing positions instead of reading them as tokens.
model.add(layers.Embedding(tokenizer.vocab_size, 32, input_length=max_length, mask_zero=True))

model.add(layers.LSTM(64, dropout=0.1))
# One output unit per class; softmax turns them into class probabilities.
model.add(layers.Dense(num_classes, activation="softmax"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            976704    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 10)                650       
                                                                 
Total params: 1,002,186
Trainable params: 1,002,186
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Integer class labels -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Save the full model to fyp.h5 each time validation accuracy reaches a new best.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='fyp.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=2,
)
callbacks_list = [checkpoint]
model.fit(
    train_padded,
    train_labels,
    validation_data=(val_padded, val_labels),
    epochs=20,
    callbacks=callbacks_list,
    verbose=2,
)

Epoch 1/20

Epoch 1: val_accuracy improved from -inf to 0.16957, saving model to fyp.h5
22/22 - 4s - loss: 2.2418 - accuracy: 0.1652 - val_loss: 2.0659 - val_accuracy: 0.1696 - 4s/epoch - 162ms/step
Epoch 2/20

Epoch 2: val_accuracy improved from 0.16957 to 0.19565, saving model to fyp.h5
22/22 - 1s - loss: 1.9655 - accuracy: 0.2043 - val_loss: 1.6998 - val_accuracy: 0.1957 - 904ms/epoch - 41ms/step
Epoch 3/20

Epoch 3: val_accuracy improved from 0.19565 to 0.60870, saving model to fyp.h5
22/22 - 1s - loss: 1.5507 - accuracy: 0.3333 - val_loss: 1.2532 - val_accuracy: 0.6087 - 744ms/epoch - 34ms/step
Epoch 4/20

Epoch 4: val_accuracy improved from 0.60870 to 0.64783, saving model to fyp.h5
22/22 - 1s - loss: 1.1280 - accuracy: 0.5406 - val_loss: 0.9901 - val_accuracy: 0.6478 - 848ms/epoch - 39ms/step
Epoch 5/20

Epoch 5: val_accuracy improved from 0.64783 to 0.77391, saving model to fyp.h5
22/22 - 1s - loss: 0.8240 - accuracy: 0.7420 - val_loss: 0.6685 - val_accuracy: 0.7739 - 832ms/epo

<keras.callbacks.History at 0x1b70ef85ac0>

In [None]:
# Serialize the model with to_json() and write the resulting string to fyp.json.
model_json = model.to_json()
with open("fyp.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
predictions = model.predict(test_padded)



In [None]:
predictions_int = np.argmax(predictions, axis=1)

In [None]:
print(test_sentences[10:15])
print(test_labels[10:15])
print(predictions_int[10:15])

['You can display all unique values in the column col1.'
 'Tell me how many values are in col1' 'Use col 1 to sort the dataframe.'
 'The pie chart has col1 and col2 in it.'
 'Look at a line chart of col1 and col2.']
[0 6 1 8 9]
[0 6 1 8 9]


In [None]:
test_loss, test_acc = model.evaluate(test_padded, test_labels)



In [None]:
# Code templates, indexed by predicted class id (list position == Target_class).
# col1/col2/col3 are placeholders that a later cell replaces with the column
# names found in the user's sentence.
# Indices 0-6 are pandas statements; indices 7-9 are Taipy chart markup. The
# dispatch cell further down relies on that boundary (`prediction_int[0] < 7`).
target_arr = ["df['col1'].nunique()",
             "df.sort_values(by=['col1'],inplace =True)",
             "df.sort_values(by=['col1', 'col2'],inplace =True)",
             "df.sort_values(by=['col1', 'col2', 'col3'],inplace =True)",
             "df.drop(columns = 'col1',inplace = True)",
             "new_df=df.loc[:, ['col1','col2']]",
             "df['col1'].value_counts()",
             "<|{dataset}|chart|type=bar|x=col1|y=col2|height=100%|>",
             "<|{dataset}|chart|type=pie|values=col2|labels=col1|height=100%|>",
             "<|{dataset}|chart|mode=lines|x=col1|y=col2|>"]

In [None]:
df = pd.read_csv("sample_dataframe.csv")
df

Unnamed: 0,ID,Name,Salary,Age,Position,Address
0,1,Tom,50000,28,Analyst,Something
1,2,Harry,80000,32,Director,Something
2,3,Mary,20000,24,Intern,Anything
3,4,Jim,100000,40,Senior Manager,Nothing
4,5,Lily,95000,38,Manager,Something


In [None]:
df = pd.read_csv("sample_dataframe.csv")
df

Unnamed: 0,ID,Name,Salary,Age,Position,Address
0,1,Tom,50000,28,Analyst,Something
1,2,Harry,80000,32,Director,Something
2,3,Mary,20000,24,Intern,Anything
3,4,Jim,100000,40,Senior Manager,Nothing
4,5,Lily,95000,38,Manager,Something


In [None]:
cols = df.columns
cols

Index(['ID', 'Name', 'Salary', 'Age', 'Position', 'Address'], dtype='object')

In [None]:
sentence = "Show me a pie chart of Salary for Name"
words = sentence.split()

# Compare case-insensitively. Each word is considered both as written and with
# surrounding punctuation removed, so "Name," or "Salary." still match.
words_normalized = {word.upper() for word in words} | {
    word.strip(".,;:!?\"'()").upper() for word in words
}

# One entry per mentioned column, in DataFrame column order. A membership test
# (instead of a nested loop over words) keeps a word repeated in the sentence
# from adding the same column twice and shifting the colN numbering.
# NOTE(review): numbering follows column order, not the order of mention in the
# sentence - confirm this is what the training patterns expect.
cols_requested = [item for item in cols if item.upper() in words_normalized]

cols_requested

['Name', 'Salary']

In [None]:
# Replace each requested column name with its placeholder (col1, col2, ...),
# the vocabulary the classifier was trained on.
general_sentence = sentence
for i in range(len(cols_requested)):
    # Whole-word, case-insensitive match: columns are detected case-insensitively,
    # so "salary" must be replaced as well, while a short name such as "Age" must
    # not be replaced inside a longer word such as "Agent".
    column_pattern = r"(?<!\w)" + re.escape(cols_requested[i]) + r"(?!\w)"
    general_sentence = re.sub(
        column_pattern, "col" + str(i+1), general_sentence, flags=re.IGNORECASE
    )
general_sentence

'Show me a pie chart of col2 for col1'

In [None]:
def tokenize(batch):
    """Tokenize ``batch`` in a single tokenizer call with padding and truncation on.

    This replaces the earlier ``tokenize`` definitions in the notebook from
    this cell onwards.
    """
    encoded = tokenizer(batch, padding=True, truncation=True)
    return encoded

In [None]:
# Wrap the generalized sentence in a one-element batch and keep only its token ids.
test_sentence = [general_sentence]
encoded_sentence = tokenize(test_sentence)
test_sequence = encoded_sentence.input_ids
test_sequence

[[101, 2265, 2033, 1037, 11345, 3673, 1997, 8902, 2475, 2005, 8902, 2487, 102]]

In [None]:
max_length = 20

# Same length and 'post' padding/truncation as used for the training data.
test_padded = pad_sequences(
    test_sequence,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
test_padded

array([[  101,  2265,  2033,  1037, 11345,  3673,  1997,  8902,  2475,
         2005,  8902,  2487,   102,     0,     0,     0,     0,     0,
            0,     0]])

In [None]:
# Class probabilities for the single sentence; take the most likely class id.
prediction = model.predict(test_padded)
prediction_int = prediction.argmax(axis=1)
prediction_int[0]



9

In [None]:
# Turn the predicted class into something runnable: fill the colN placeholders
# of the matching template with the requested column names, then either run the
# pandas statement or render the Taipy chart.
query_template = target_arr[prediction_int[0]]


def _fill_placeholder(match):
    """Return the requested column for a colN placeholder, or leave it as is."""
    position = int(match.group(1)) - 1
    if 0 <= position < len(cols_requested):
        return cols_requested[position]
    # Fewer columns were requested than the template needs: keep the placeholder.
    return match.group(0)


# Single pass over the template: inserted column names are never rescanned, so
# a column whose own name contains e.g. "col2" is not mangled by a later step.
filled_query = re.sub(r"col(\d+)", _fill_placeholder, query_template)

# Taipy chart templates start with "<|"; everything else is a pandas statement.
if not query_template.startswith("<|"):
    panda_query = filled_query
    print(panda_query)
    # NOTE(security): this runs generated code. The templates are fixed, but the
    # column names come from the loaded CSV header - only use trusted files.
    try:
        compiled_query = compile(panda_query, "<panda_query>", "eval")
    except SyntaxError:
        # Assignments such as "new_df=..." are statements, not expressions.
        exec(panda_query)
    else:
        # Expressions such as nunique()/value_counts() return a value that a
        # plain exec() would discard, so show it.
        query_result = eval(compiled_query)
        if query_result is not None:
            print(query_result)
else:
    taipy_query = filled_query
    # The chart markup names this variable ({dataset}); presumably Taipy looks
    # it up by name when rendering - TODO confirm.
    dataset = df

    page = taipy_query
    print(page)
    gui = Gui(page)
    gui.run()

<|{dataset}|chart|mode=lines|x=Name|y=Salary|>
[2023-04-28 14:22:49,084][Taipy][INFO] Running in 'single_client' mode in notebook environment
[2023-04-28 14:22:49,440][Taipy][INFO]  * Server starting on http://127.0.0.1:5000




In [None]:
df

Unnamed: 0,ID,Name,Salary,Age,Position,Address
0,1,Tom,50000,28,Analyst,Something
1,2,Harry,80000,32,Director,Something
2,3,Mary,20000,24,Intern,Anything
3,4,Jim,100000,40,Senior Manager,Nothing
4,5,Lily,95000,38,Manager,Something


