In [None]:
!pip -Qu install datasets transformers

# Loading the IMDB dataset

In [None]:
# Load the "imdb" dataset with Hugging Face `datasets`; no split is requested here.
# NOTE(review): `imdb_dataset` is reassigned in the tokenization cell with a 100-row slice.
from datasets import load_dataset
imdb_dataset = load_dataset("imdb")

# Tokenize the IMDB dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Keep the demo quick: only the first 100 training rows are loaded.
imdb_dataset = load_dataset("imdb", split="train[:100]")

# Tokenizer that belongs to the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _tokenize_review(example):
    """Tokenize one row's "text" field with truncation and max-length padding."""
    return tokenizer(example["text"], truncation=True, padding="max_length")


# Apply the tokenizer row by row; the result keeps the original columns
# and gains the tokenizer outputs.
tokenized_imdb_dataset = imdb_dataset.map(_tokenize_review)

print(tokenized_imdb_dataset)

# Token ids of the first review
first_row_tokens = tokenized_imdb_dataset[0]["input_ids"]

# Show the first 10 ids next to the text each one decodes to
for token in first_row_tokens[:10]:
  word = tokenizer.decode([token])
  print(f"Token: {token}, Word: {word}")

# Spaces example code
Set up Gradio interface

In [None]:
!pip install gradio transformers

In [None]:
import gradio as gr
from transformers import pipeline

# Sentiment pipeline created without naming a model, so the library default is used.
sentiment_pipeline = pipeline("sentiment-analysis")


def sentiment_analysis(text):
    """Run the sentiment pipeline on `text` and return the first prediction's label."""
    predictions = sentiment_pipeline(text)
    top_prediction = predictions[0]
    return top_prediction["label"]


# Text box in, text box out: expose the function through a Gradio UI and start it.
iface = gr.Interface(fn=sentiment_analysis, inputs="text", outputs="text")
iface.launch()

# Elastic Eland

In [None]:
!pip install eland

# TODO ADD INDEX CREATION EXAMPLE

## Connect to Elasticsearch and create sample index

In [None]:
import getpass
from datetime import datetime

# This is the first cell that uses the client class, so it is imported here;
# otherwise a fresh "Restart & Run All" stops with NameError on `Elasticsearch`.
from elasticsearch import Elasticsearch

# Prompt for credentials instead of hardcoding them in the notebook.
es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')
es_api_key = getpass.getpass('Enter cluster API key:  ')

es = Elasticsearch(cloud_id=es_cloud_id,
                   api_key=es_api_key
                   )
es.info() # should return cluster info (not displayed here: it is not the cell's last expression)

# Index definition: a single shard without replicas, four float fields and one keyword field.
mapping = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "some_field": {"type": "float"},
            "column_a": {"type": "float"},
            "column_b": {"type": "float"},
            "category": {"type": "keyword"},
            "value": {"type": "float"}
        }
    }
}

# Small dataset used by the Eland examples
documents = [
    {"some_field": 95.0, "column_a": 5.0, "column_b": 10.0, "category": "A", "value": 50.0},
    {"some_field": 150.0, "column_a": 7.0, "column_b": 20.0, "category": "B", "value": 140.0},
    {"some_field": 200.0, "column_a": 8.0, "column_b": 25.0, "category": "A", "value": 200.0},
    {"some_field": 50.0, "column_a": 4.0, "column_b": 12.5, "category": "C", "value": 50.0}
]

# Create and populate the index only when it does not exist yet, so re-running
# this cell (or the whole notebook) neither fails on an already-existing index
# nor indexes the sample rows a second time.
if not es.indices.exists(index="sample_eland_index"):
    es.indices.create(
        index="sample_eland_index",
        settings=mapping["settings"],
        mappings=mapping["mappings"],
    )

    for doc in documents:
        es.index(index="sample_eland_index", document=doc)

    # Make the new documents searchable right away for the cells that query the index.
    es.indices.refresh(index="sample_eland_index")



## Eland Examples

In [None]:
import eland as ed

# Eland DataFrame over the "sample_eland_index" index, using the `es` client created earlier.
df = ed.DataFrame(es_client=es, es_index_pattern="sample_eland_index")
# Keep only the rows whose `some_field` is greater than 100.
filtered_df = df[df['some_field'] > 100]
# Last expression of the cell: shown as the cell output.
filtered_df

In [None]:
# Mean of `some_field` over the unfiltered Eland DataFrame `df`; shown as the cell output.
average_value = df['some_field'].mean()
average_value

In [None]:
import seaborn as sns
import pandas as pd  # NOTE(review): `pd` is not referenced in this cell

# Same filter as in the earlier example: rows with `some_field` above 100.
filtered_df = df[df['some_field'] > 100]
# Convert the filtered Eland frame to pandas before handing it to seaborn.
pandas_df = filtered_df.to_pandas()
# Box plot of `value` grouped by `category`.
sns.boxplot(x='category', y='value', data=pandas_df)

# Loading a Sentence Transformer from Hugging Face into Elasticsearch

In [None]:
pip -q install eland elasticsearch transformers sentence_transformers torch==1.13

In [None]:
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient

In [None]:
import getpass

In [None]:
# Prompt for the Elastic Cloud ID and the cluster API key; both prompts go through getpass.
es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')
es_api_key = getpass.getpass('Enter cluster API key:  ')

In [None]:
# Build the Elasticsearch client from the Cloud ID and API key collected by the prompts.
es = Elasticsearch(cloud_id=es_cloud_id,
                   api_key=es_api_key
                   )
# Last expression of the cell, so the response is displayed as the cell output.
es.info() # should return cluster info

In [None]:
# Hugging Face model to bring into Elasticsearch, wrapped for the "text_embedding" task.
hf_model_id='sentence-transformers/msmarco-MiniLM-L-12-v3'
# Pass the arguments by keyword: eland releases that declare `model_id` and
# `task_type` as keyword-only reject the positional form, while the keyword
# form is accepted either way.
tm = TransformerModel(model_id=hf_model_id, task_type="text_embedding")

In [None]:
# Ask the TransformerModel wrapper for the model ID to use in Elasticsearch; shown as the cell output.
es_model_id = tm.elasticsearch_model_id()
es_model_id

In [None]:
# Local directory that tm.save() is pointed at.
tmp_path = "models"
# Create the directory (and any parents); no error if it already exists.
Path(tmp_path).mkdir(parents=True, exist_ok=True)
# The three returned values are unpacked as model path, config, and vocabulary path;
# all three are passed on to the import step.
model_path, config, vocab_path = tm.save(tmp_path)

In [None]:
# Wrap the client and target model ID, then import the saved model files.
ptm = PyTorchModel(es, es_model_id)
# config_path is None; the config value returned by tm.save() is passed via `config` instead.
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)

In [None]:
# Look up the imported model in Elasticsearch.
# `es.ml` is the client's ML namespace; it replaces calling the unbound
# MlClient method with the Elasticsearch instance passed in as `self`.
m = es.ml.get_trained_models(model_id=es_model_id)
m.body

In [None]:
# Start a deployment of the imported model through the client's ML namespace
# (instead of calling the unbound MlClient method with `es` as `self`).
s = es.ml.start_trained_model_deployment(model_id=es_model_id)
s.body

In [None]:
# Fetch the model's stats through the client's ML namespace
# (instead of calling the unbound MlClient method with `es` as `self`).
stats = es.ml.get_trained_models_stats(model_id=es_model_id)
# Routing state reported for the first node of the first model in the response.
stats.body['trained_model_stats'][0]['deployment_stats']['nodes'][0]['routing_state']

In [None]:
# Inference payload: a single document whose text sits under the "text_field" key.
docs = [
    {"text_field": "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app."},
]

In [None]:
# Run inference on `docs` with the deployed model through the client's ML namespace
# (instead of calling the unbound MlClient method with `es` as `self`).
z = es.ml.infer_trained_model(model_id=es_model_id, docs=docs)

In [None]:
# Predicted value of the first inference result (presumably the embedding vector — confirm);
# shown as the cell output.
doc_0_vector = z['inference_results'][0]['predicted_value']
doc_0_vector

# Reducing Dimensionality

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA

In [None]:
# Iris dataset: feature matrix X and class labels y
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Reduce the features to two principal components
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

In [None]:
# Scatter of the first two raw feature columns, coloured by class label
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('Original Iris dataset')
plt.show()

In [None]:
# Scatter of the two PCA components, coloured by class label
fig, ax = plt.subplots()
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
ax.set_title('Iris dataset after PCA')
plt.show()


# Quantization

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer


In [None]:
# Load the digits dataset and keep its feature matrix.
# NOTE(review): `X` is also the name used for the Iris features in the previous section.
digits = datasets.load_digits()
X = digits.data

# Print the first example from the original dataset
print("Original dataset (first example):\n", X[0])

In [None]:
# Apply PCA for dimensionality reduction: keep 10 components per example.
# NOTE(review): `pca` and `X_reduced` are also the names used in the Iris section.
pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X)

# Print the first example after PCA
print("\nReduced dataset after PCA (first example):\n", X_reduced[0])

In [None]:
# Rescale the reduced vectors so every value falls in the range [0, 255]
scaler = MinMaxScaler(feature_range=(0, 255))
X_scaled = scaler.fit_transform(X_reduced)

# Show the first vector after scaling
print("\nScaled dataset after normalization (first example):\n", X_scaled[0])

In [None]:
# Quantize the scaled vectors to 8-bit unsigned integers (np.uint8):
# round to the nearest whole number, then cast.
X_quantized = np.round(X_scaled).astype(np.uint8)

# Print the first example after quantization
print("\nQuantized dataset (first example):\n", X_quantized[0])