### Import libraries

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize

### Download the dataset and convert it to a DataFrame

In [2]:
# Fetch the "fka/awesome-chatgpt-prompts" dataset via the `datasets` library.
# The returned object is indexed by split name in the next cell (df["train"]).
df = load_dataset("fka/awesome-chatgpt-prompts")

In [3]:
# `df` initially holds the object returned by load_dataset; replace it with a
# pandas DataFrame built from its "train" split. The isinstance guard keeps
# this cell re-runnable: indexing an existing DataFrame with "train" would
# raise KeyError because there is no such column.
if not isinstance(df, pd.DataFrame):
    df = pd.DataFrame(df["train"])

df.head()

Unnamed: 0,act,prompt
0,An Ethereum Developer,Imagine you are an experienced Ethereum develo...
1,SEO Prompt,"Using WebPilot, create an outline for an artic..."
2,Linux Terminal,I want you to act as a linux terminal. I will ...
3,English Translator and Improver,"I want you to act as an English translator, sp..."
4,`position` Interviewer,I want you to act as an interviewer. I will be...


### Check DataFrame information

In [4]:
# Print the index range, column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   act     203 non-null    object
 1   prompt  203 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB


### Data Cleaning

In [5]:
def clean_text(text):
    """Return ``text`` lowercased when it is a string.

    Any non-string value (for example ``None`` or a number) is handed back
    unchanged so the function can be applied to a column with mixed content.
    """
    return text.lower() if isinstance(text, str) else text

def clean_punctuation(text):
    r"""Remove punctuation from ``text``, keeping letters, digits and whitespace.

    The character class ``[^\w\s]`` matches every character that is neither a
    word character nor whitespace, which already includes backticks. The regex
    class ``\w`` also matches the underscore, so ``_`` is listed as a separate
    alternative to make sure it is stripped like other punctuation.

    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s]|_', '', text)

# Normalise both text columns: lowercase first, then strip punctuation.
for column in ('act', 'prompt'):
    df[column] = df[column].apply(clean_text).apply(clean_punctuation)

### Data Preprocessing

In [6]:
# Tokenize only values that are still raw strings so this cell can be re-run
# safely: passing an already-tokenized list to word_tokenize would fail.
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed
# (see nltk.download); that step is not shown in this notebook — confirm.
for column in ('act', 'prompt'):
    df[column] = df[column].apply(
        lambda value: word_tokenize(value) if isinstance(value, str) else value
    )

In [7]:
# Preview the first rows; both columns now hold lists of lowercase tokens.
df.head()

Unnamed: 0,act,prompt
0,"[an, ethereum, developer]","[imagine, you, are, an, experienced, ethereum,..."
1,"[seo, prompt]","[using, webpilot, create, an, outline, for, an..."
2,"[linux, terminal]","[i, want, you, to, act, as, a, linux, terminal..."
3,"[english, translator, and, improver]","[i, want, you, to, act, as, an, english, trans..."
4,"[position, interviewer]","[i, want, you, to, act, as, an, interviewer, i..."
