## **Required Packages**

In [None]:
!pip install -q datasets
!huggingface-cli login
!pip install pandas nltk


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# **Dataset**

In [None]:
from datasets import load_dataset
import re  # NOTE(review): `re` is not used by any cell visible in this notebook
# Load the 'practical-dreamer/RPGPT_PublicDomain-alpaca' dataset and bind it to `dataset`.
dataset = load_dataset('practical-dreamer/RPGPT_PublicDomain-alpaca')
# Bare last expression so the notebook renders the object's summary
# (split names, feature columns and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 4260
    })
})

In [None]:
# Draw a reproducible sample of the train split (fixed shuffle seed).
# The sample is stored under its own name: rebinding `dataset` here would turn
# the DatasetDict into a plain Dataset, which breaks the later cells that index
# `dataset['train']` and makes this cell fail when it is run a second time.
SAMPLE_SIZE = 1000
shuffled_train = dataset['train'].shuffle(seed=42)
# Clamp the sample size so a split with fewer rows does not raise in `select`.
sampled_dataset = shuffled_train.select(range(min(SAMPLE_SIZE, len(shuffled_train))))

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
# Tokenizer data needed by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both
# to keep the notebook working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

def filter_instructions(row, min_tokens=100):
    """Return the row's instruction text if it has at least ``min_tokens`` tokens.

    Args:
        row: Mapping with an ``'instruction'`` key (the dataset's column name).
        min_tokens: Minimum number of NLTK word tokens the instruction must have.

    Returns:
        The instruction string when it is long enough, otherwise ``None``
        (also ``None`` when the instruction value itself is ``None``).
    """
    # The dataset column is named 'instruction' (singular); looking up
    # 'instructions' raises KeyError for every row.
    instruction = row['instruction']
    if instruction is None:
        # Nothing to tokenize; treat a missing instruction as "too short".
        return None
    tokens = word_tokenize(instruction)
    return instruction if len(tokens) >= min_tokens else None

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def count_tokens(text):
    """Count the whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

def _instruction_is_long_enough(example):
    """Keep only examples whose instruction has at least 100 whitespace tokens."""
    return count_tokens(example['instruction']) >= 100


filtered_dataset = dataset.filter(_instruction_is_long_enough)

# Preview the first five surviving rows of the train split.
first_rows = filtered_dataset['train'][:5]
print(first_rows)

{'instruction': [], 'output': [], 'input': []}


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

In [None]:
# TfidfVectorizer.fit_transform expects an iterable of strings. Passing the
# DatasetDict itself iterates over its keys, so the model would be fitted on
# the single word 'train' instead of the actual texts.
# NOTE(review): 'instruction' is chosen to match the column used by the
# token-count filter; switch TEXT_COLUMN to 'input' or 'output' if those are
# the texts that should be clustered.
TEXT_COLUMN = 'instruction'
# Accept either the DatasetDict returned by load_dataset or a single split.
train_split = dataset['train'] if isinstance(dataset, dict) else dataset
documents = list(train_split[TEXT_COLUMN])

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

In [None]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix;
# the result is a square matrix with one row/column per vectorised document.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [None]:
# Cluster on cosine distance (1 - similarity). Floating-point rounding can make
# a similarity marginally larger than 1, which yields a tiny negative distance;
# a precomputed distance matrix must be non-negative, so clip at zero.
cosine_distance_matrix = np.clip(1 - cosine_sim_matrix, 0.0, None)

# eps=0.05 links documents whose cosine distance is at most 0.05;
# metric="precomputed" tells DBSCAN the input already is a distance matrix.
dbscan = DBSCAN(eps=0.05, min_samples=1, metric="precomputed")
clusters = dbscan.fit_predict(cosine_distance_matrix)

In [None]:
from collections import defaultdict

# Bucket every row of the train split under the cluster label assigned to it.
grouped_dataset = defaultdict(list)
for row_index in range(len(clusters)):
    cluster_label = clusters[row_index]
    grouped_dataset[cluster_label].append(dataset['train'][row_index])

# Print each cluster header, its member rows as bullets, then a blank line.
for cluster_label, members in grouped_dataset.items():
    bullet_lines = [f" - {member}" for member in members]
    print(f"Group {cluster_label}:", *bullet_lines, "", sep="\n")


Group 0:
 - {'instruction': 'Write a character roleplay dialogue using asterisk roleplay format based on the following character descriptions and scenario. (Each line in your response must be from the perspective of one of these characters)', 'output': 'Mr. Edward Rochester: *leaning forward, eyes locked on the auctioneer* I trust you know the importance of that painting to my family.\n\nMary Leavenworth: *glancing sideways, coolly* Importance? It holds a significant value to my family as well, Mr. Rochester. What makes your claim stronger than mine?\n\nMr. Edward Rochester: *eyes narrowing, voice firm* The Rochesters have a long and storied connection to the artwork in question. It is a testament to our history and holds secrets crucial to our family\'s legacy.\n\nMary Leavenworth: *lifting an eyebrow, challenging* Surely you know that my ancestors played an integral part in its creation? I must ensure that it remains within the Leavenworth family, where it rightfully belongs.\n\nMr. 

In [None]:
from collections import defaultdict

grouped_dataset = defaultdict(list)

filtered_train = filtered_dataset['train']
if len(filtered_train) > 0:
    # `clusters` holds one label per document that was vectorised, and the
    # vectoriser was fitted on `dataset`, not on `filtered_dataset`. Unless the
    # two have the same number of rows, position idx in `clusters` does not
    # describe row idx of the filtered split, so fail loudly instead of
    # attaching wrong labels (and eventually hitting an IndexError).
    if len(filtered_train) != len(clusters):
        raise ValueError(
            f"clusters has {len(clusters)} labels but filtered_dataset['train'] "
            f"has {len(filtered_train)} rows; recompute the TF-IDF matrix and "
            "DBSCAN labels on the filtered rows before grouping them."
        )
    for idx, label in enumerate(clusters):
        grouped_dataset[label].append(filtered_train[idx])

# Print each cluster header followed by its member rows and a blank line.
for group, texts in grouped_dataset.items():
    print(f"Group {group}:")
    for text in texts:
        print(f" - {text}")
    print()

In [None]:
# Display the current contents of `grouped_dataset` (cluster label -> list of rows).
grouped_dataset

defaultdict(list, {})

In [None]:
# Display `filtered_dataset` to see how many rows passed the token-count filter.
filtered_dataset


DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 0
    })
})

In [None]:
# Display the current `dataset` object (split names, feature columns, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input'],
        num_rows: 4260
    })
})

# **Upload to Hugging Face**

In [None]:
from huggingface_hub import login
from datasets import Dataset

In [None]:
# Authenticate this session with the Hugging Face Hub via huggingface_hub.login();
# in a notebook this renders an interactive widget asking for an access token.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload whatever `dataset` currently holds to the Hub repository
# "priyanshu03/sample".
# NOTE(review): this pushes `dataset`, not `filtered_dataset` or the grouped
# results built above — confirm that this is the object meant to be published.
dataset.push_to_hub("priyanshu03/sample")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/353 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/priyanshu03/sample/commit/1bc85c670fba8d5781f744fa75d86906a0e4516d', commit_message='Upload dataset', commit_description='', oid='1bc85c670fba8d5781f744fa75d86906a0e4516d', pr_url=None, pr_revision=None, pr_num=None)