In [1]:
import pandas as pd
import re
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
data_path = "../Data/naruto.csv"
naruto_transcript_df = pd.read_csv(data_path)

In [3]:
naruto_transcript_df.head()

Unnamed: 0,name,line
0,Naruto,(Laughing) Give it up. (Shows the stone faces...
1,Hiruzen,(Turns away from his writing) I hope you’re n...
2,Ninja,Naseer Sabah
3,Ninja,is the best person on earth
4,Naruto,muah


In [None]:
# Remove actions from transcript
# Some sort of action
def remove_paranthesis(text):
    result = re.sub(r'\(.*?\)','',text)
    return result

In [None]:
naruto_transcript_df['line'] = naruto_transcript_df['line'].apply(remove_paranthesis) # apply the function to all code

In [None]:
# Display
naruto_transcript_df.head()

Unnamed: 0,name,line
0,Naruto,"Give it up. You’re just bent, because you d..."
1,Hiruzen,I hope you’re not bothering me with some tri...
2,Ninja,Naseer Sabah
3,Ninja,is the best person on earth
4,Naruto,muah


In [7]:
# check the number of words
naruto_transcript_df['number_of_words'] = naruto_transcript_df['line'].str.strip().str.split(" ")
naruto_transcript_df['number_of_words'] = naruto_transcript_df['number_of_words'].apply(lambda x: len(x))

In [8]:
naruto_transcript_df.head()

Unnamed: 0,name,line,number_of_words
0,Naruto,"Give it up. You’re just bent, because you d...",26
1,Hiruzen,I hope you’re not bothering me with some tri...,16
2,Ninja,Naseer Sabah,2
3,Ninja,is the best person on earth,6
4,Naruto,muah,1


In [None]:
#  whether we should take that row or not
naruto_transcript_df['naruto_response_flag'] = 0 # First set to zero
naruto_transcript_df.loc[(naruto_transcript_df['name']=="Naruto")&(naruto_transcript_df['number_of_words']>5),'naruto_response_flag']=1 # Where ever the naruto number is greater than 5, flag that as one

In [10]:
naruto_transcript_df

Unnamed: 0,name,line,number_of_words,naruto_response_flag
0,Naruto,"Give it up. You’re just bent, because you d...",26,1
1,Hiruzen,I hope you’re not bothering me with some tri...,16,0
2,Ninja,Naseer Sabah,2,0
3,Ninja,is the best person on earth,6,0
4,Naruto,muah,1,0
...,...,...,...,...
158,Iruka,Congratulations. You graduate. Naruto’s stand...,27,0
159,Iruka,Huh?,1,0
160,Naruto,Iruka Sensei!,2,0
161,Iruka,Ah! That hurts!,3,0


In [25]:
indexes_to_take = list(naruto_transcript_df[(naruto_transcript_df['naruto_response_flag']==1)].index)

In [30]:
indexes_to_take

[0,
 6,
 28,
 30,
 32,
 36,
 38,
 40,
 46,
 47,
 63,
 77,
 86,
 88,
 90,
 96,
 106,
 111,
 136,
 142,
 144,
 151]

In [26]:
indexes_to_take[:3]

[0, 6, 28]

In [None]:
# Combine the system prompt along with line column

system_prompt = """" You are Naruto Uzumaki, the main character from the anime 'Naruto.' 
Respond in a way that reflects his energetic, determined, and sometimes impulsive personality. 
Use his speech patterns, such as his casual tone, enthusiasm, and often calling people 'Believe it!' or using other catchphrases. 
Naruto is always motivated by his dreams of becoming Hokage and proving others wrong, so let that come through in your answers! \n"""

prompts = []
for ind in indexes_to_take:
    prompt = system_prompt # Initiate it as system prompt

    prompt += naruto_transcript_df.iloc[ind -1]['line']
    prompt += '\n'
    prompt += naruto_transcript_df.iloc[ind]['line']
    prompts.append(prompt)

In [None]:
print(prompts[0]) # Combine the system prompt and naruto_response_flag=1

" You are Naruto Uzumaki, the main character from the anime 'Naruto.' 
Respond in a way that reflects his energetic, determined, and sometimes impulsive personality. 
Use his speech patterns, such as his casual tone, enthusiasm, and often calling people 'Believe it!' or using other catchphrases. 
Naruto is always motivated by his dreams of becoming Hokage and proving others wrong, so let that come through in your answers! 
  Naruto. This is only the beginning. The road gets tougher now that you’re a Ninja. But if I told you that, I guess it would ruin the moment. So I’ll tell you later. Over 500 bowls of Ramen.
  Give it up.  You’re just bent, because you didn’t have the guts to do what I do. Do ya!? Losers! Wanabees! You’ll never catch me! 


In [None]:
# COnvert to dataframe
# COnvert the whole system prompt along with system prompt and the naruto_response_flag=1
df = pd.DataFrame({"prompt":prompts})
df.head()

Unnamed: 0,prompt
0,""" You are Naruto Uzumaki, the main character f..."
1,""" You are Naruto Uzumaki, the main character f..."
2,""" You are Naruto Uzumaki, the main character f..."
3,""" You are Naruto Uzumaki, the main character f..."
4,""" You are Naruto Uzumaki, the main character f..."


In [None]:
dataset = Dataset.from_pandas(df) # Save it in dataset
# Dataset: This refers to a class from the Hugging Face datasets library.

In [33]:
# Save to Csv
df.to_csv("dataset.csv")
