# Nyarch Linux Smart Prompts trainer

## Settings

In [47]:
# Version and name of the trained model; together they form the name of the
# exported pickle file.
VERSION = 0.2
# Embedding dimension requested from WordLlama.
EMBEDDINGS_SIZE = 1024
NAME = "NyaMedium"
FILENAME = f"{NAME}_{VERSION}.pkl"
# CSV file with the training prompts and their labels.
DATASET_URL = 'https://raw.githubusercontent.com/NyarchLinux/Smart-Prompts/refs/heads/main/dataset.csv'

## Dataset Management

In [48]:
import csv

def save_dataset_to_csv(dataset, filename):
    """Write a label -> prompts mapping to a two-column CSV file.

    The file starts with a ``Prompt,Label`` header row, followed by one row
    per prompt, grouped by label in the dictionary's iteration order.

    Args:
      dataset: A dictionary where keys are labels and values are lists of prompts.
      filename: The name of the CSV file to save.
    """
    # newline='' lets the csv module control the line terminators itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(['Prompt', 'Label'])  # Header row
        csv_writer.writerows(
            [text, category]
            for category, texts in dataset.items()
            for text in texts
        )

In [49]:
import csv

def reconstruct_dataset_from_csv(filename):
    """Reconstructs a dataset dictionary from a CSV file.

    The first row is treated as a header and discarded. Every following
    non-blank row must have exactly two columns: the prompt and its label.
    Blank lines (for example a trailing empty line) are ignored.

    Args:
      filename: The name of the CSV file to load.

    Returns:
      A dictionary where keys are labels and values are lists of prompts,
      in the order they appear in the file. An empty file yields ``{}``.

    Raises:
      ValueError: If a non-blank row does not have exactly two columns.
    """
    dataset = {}
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for blank lines.
            if not row:
                continue
            if len(row) != 2:
                raise ValueError(
                    f"{filename}, line {reader.line_num}: expected 2 columns "
                    f"(Prompt, Label), got {len(row)}"
                )
            prompt, label = row
            dataset.setdefault(label, []).append(prompt)

    return dataset

## Additional downloads

In [50]:
!pip install -U wordllama



## Load the dataset

In [51]:
import subprocess

# Download the dataset CSV into the working directory as dataset.csv.
# check_output raises CalledProcessError if wget exits with a non-zero status.
wget_command = ["wget", "-O", "dataset.csv", DATASET_URL]
subprocess.check_output(wget_command)

b''

In [52]:
# Rebuild the {label: [prompts]} mapping from the downloaded CSV file.
DATASET = reconstruct_dataset_from_csv(filename='dataset.csv')

## Dataset preparation and split

In [53]:
from wordllama import WordLlama
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Flatten the {label: [prompts]} mapping into two parallel lists:
# texts[i] is a prompt and labels[i] is the label it belongs to.
texts = [prompt for prompts in DATASET.values() for prompt in prompts]
labels = [label for label, prompts in DATASET.items() for _ in prompts]

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the label encoder on the training labels and reuse it for the test labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Load the WordLlama embedding model with the configured dimension
wl = WordLlama.load(dim=EMBEDDINGS_SIZE)

# Embed all the prompts
X_train_embeddings = wl.embed(X_train)
X_test_embeddings = wl.embed(X_test)


## Model Training

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create a logistic regression classifier and train it on the prompt embeddings
classifier = LogisticRegression(random_state=22).fit(
    X_train_embeddings,
    y_train_encoded,
)

# Predict the held-out test set
y_pred = classifier.predict(X_test_embeddings)

# Evaluate the model: per-class precision/recall/F1, shown with the original
# label names instead of the encoded integers
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)


              precision    recall  f1-score   support

      codecs       1.00      1.00      1.00        19
  colloquial       0.97      0.97      0.97        35
     console       0.97      0.97      0.97        30
      docker       0.96      1.00      0.98        23
      nvidia       1.00      1.00      1.00        47
      ollama       1.00      0.88      0.93        16
       table       0.97      1.00      0.98        28
    voicevox       1.00      1.00      1.00        11

    accuracy                           0.98       209
   macro avg       0.98      0.98      0.98       209
weighted avg       0.98      0.98      0.98       209



## Export the model and use it

In [55]:
# Save the model

import pickle

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
# NOTE: only the classifier is stored; the label names live in
# label_encoder.classes_ and are not part of the exported file.
with open(FILENAME, 'wb') as model_file:
    pickle.dump(classifier, model_file)


Load and test the model

In [56]:
import pickle
from wordllama import WordLlama

# Load the model
# NOTE: unpickling can execute arbitrary code; only load model files that
# come from a trusted source (here, the file written by this notebook).
with open(FILENAME, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Load wordllama
wl = WordLlama.load(dim=EMBEDDINGS_SIZE)

In [58]:
# Sentences to classify
new_texts = ["How do I install proprietary nvidia drivers"]

# Embed new sentences
new_embeddings = wl.embed(new_texts)

# Get the probabilities from the model reloaded from disk, so this cell
# actually checks the exported file rather than the in-memory classifier
new_probabilities = loaded_model.predict_proba(new_embeddings)

# The model was trained on labels encoded by label_encoder, so probability
# column j belongs to label_encoder.classes_[j]; take the names from the
# encoder itself to keep names and columns aligned.
class_names = label_encoder.classes_

# Print the probabilities
for text, probabilities in zip(new_texts, new_probabilities):
    print(f"Text: '{text}'")
    for category, probability in zip(class_names, probabilities):
        print(f"  {category}: {probability:.4f}")
    print("Category:" + class_names[np.argmax(probabilities)])
    print("-" * 30)


Text: 'How do I install proprietary nvidia drivers'
  codecs: 0.0005
  colloquial: 0.0000
  console: 0.0006
  docker: 0.0001
  nvidia: 0.9983
  ollama: 0.0001
  table: 0.0000
  voicevox: 0.0004
Category:nvidia
------------------------------
