In [1]:
# Basic libraries
import pandas as pd
import numpy as np

# For text vectorization and similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the two source files; the first may carry a leftover index column
# ("Unnamed: 0"), which is discarded when present
df1 = pd.read_csv("chatgpt_prompts.csv").drop(columns=["Unnamed: 0"], errors="ignore")
df2 = pd.read_csv("prompts.csv")

# Stack both frames, shuffle the rows with a fixed seed so the order is
# reproducible, and renumber the index from zero
df = (
    pd.concat([df1, df2], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Total prompts in dataset:", len(df))
df.head()

Total prompts in dataset: 356


Unnamed: 0,act,prompt
0,Lunatic,I want you to act as a lunatic. The lunatic's ...
1,Fill in the Blank Worksheets Generator,I want you to act as a fill in the blank works...
2,Personal Shopper,I want you to act as my personal shopper. I wi...
3,Math Teacher,I want you to act as a math teacher. I will pr...
4,Statistician,I want to act as a Statistician. I will provid...


In [3]:
# Normalise both text columns: replace missing entries with an empty string,
# then force every value to str
for column in ("act", "prompt"):
    df[column] = df[column].fillna("").astype(str)

print("Any missing values?\n", df.isnull().sum())

Any missing values?
 act       0
prompt    0
dtype: int64


In [4]:
# TF-IDF model that ignores common English stop words
vectorizer = TfidfVectorizer(stop_words="english")

# The corpus is the "prompt" text only; the "act" column is not indexed
prompts = df["prompt"].tolist()

# Learn the vocabulary and encode the corpus (one matrix row per prompt)
X = vectorizer.fit_transform(prompts)

print("Shape of TF-IDF Matrix:", X.shape)

Shape of TF-IDF Matrix: (356, 2669)


In [5]:
def suggest_prompt(user_input, top_n=1):
    """Return the stored prompts most similar to ``user_input``.

    The input is vectorized with the notebook-level ``vectorizer`` and compared
    by cosine similarity against every row of the notebook-level matrix ``X``;
    the matching texts are looked up in the notebook-level ``prompts`` list.

    Parameters
    ----------
    user_input : str
        Free text to match against the prompt corpus.
    top_n : int, default 1
        Maximum number of matches to return. Values less than or equal to
        zero yield an empty list. Values larger than the corpus return every
        prompt.

    Returns
    -------
    list of (str, float) tuples
        ``(prompt, similarity)`` pairs ordered from most to least similar.
    """
    # Guard first: the slice ``[-0:]`` is the same as ``[0:]``, so without this
    # check top_n=0 would return the whole corpus instead of nothing, and a
    # negative top_n would return an arbitrary tail of the ranking.
    if top_n <= 0:
        return []

    # Convert user input to a vector in the fitted vocabulary space
    user_vec = vectorizer.transform([user_input])

    # Similarity of the input against every prompt, as a flat 1-D array
    similarities = cosine_similarity(user_vec, X).flatten()

    # argsort is ascending: take the last top_n indices and reverse them so
    # the best match comes first
    top_indices = similarities.argsort()[-top_n:][::-1]

    return [(prompts[i], similarities[i]) for i in top_indices]

In [7]:
import pickle

# Persist the combined, cleaned dataset (without the index column)
df.to_csv("combined_prompts.csv", index=False)

# Persist the fitted vectorizer next to it
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)