In [1]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

In [2]:
files = pd.read_csv("course_descriptions.csv", encoding= "ANSI")

In [3]:
files.head()

Unnamed: 0,course_name,course_slug,course_technology,course_description,course_topic,course_description_short
0,Introduction to Tableau,tableau,tableau,Tableau is now one of the most popular busines...,data visualization,Teaching you how to tell compelling stories wi...
1,The Complete Data Visualization Course with Py...,data-visualization,python,The Data Visualization course is designed for ...,data visualization,Teaching you how to master the art of creating...
2,Introduction to R Programming,introduction-to-r-programming,r,R is one of the best programming languages spe...,programming,"Providing you with the skills to manipulate, a..."
3,Data Preprocessing with NumPy,data-preprocessing-numpy,python,This course is designed to show you how to wor...,data processing,This course will guide you through one of Pyth...
4,Introduction to Data and Data Science,intro-to-data-and-data-science,theory,Working with data is an essential part of main...,machine learning,Introducing you to the field of data science a...


In [4]:
files.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   course_name               106 non-null    object
 1   course_slug               106 non-null    object
 2   course_technology         106 non-null    object
 3   course_description        106 non-null    object
 4   course_topic              106 non-null    object
 5   course_description_short  106 non-null    object
dtypes: object(6)
memory usage: 5.1+ KB


In [5]:
def create_course_description(row):
    return f'''The course name is {row["course_name"]}, the slug is {row["course_slug"]},
            the technology is {row["course_technology"]} and the course topic is {row["course_topic"]}'''

In [6]:
files['course_description_new'] = files.apply(create_course_description, axis = 1)
print(files["course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
                             ...                        
101    The course name is Intro to NLP for AI, the sl...
102    The course name is Data Analysis with ChatGPT,...
103    The course name is ChatGPT for Data Science, t...
104    The course name is Intro to LLMs, the slug is ...
105    The course name is Growth Analysis with SQL, P...
Name: course_description_new, Length: 106, dtype: object


In [7]:
files['course_description_new'][0]

'The course name is Introduction to Tableau, the slug is tableau,\n            the technology is tableau and the course topic is data visualization'

In [8]:
%load_ext dotenv
%dotenv

In [9]:
load_dotenv(find_dotenv(), override= True)

True

In [10]:
pc = Pinecone(api_key= os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

In [11]:
index_name = "my-index"
dimension = 384
metric = "cosine"

In [12]:
if index_name in [index.name for index in pc.list_indexes()]:
    pc.delete_index(index_name)
    print(f"{index_name} successfully delete")
else :
    print(f"{index_name} not in index list")

my-index successfully delete


In [13]:
pc.create_index(
    name= index_name,
    dimension= dimension,
    metric= metric,
    spec= ServerlessSpec(
        cloud= "aws",
        region= "us-east-1"
    )
)

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-03idc8k.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [14]:
index = pc.Index(index_name)

In [15]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [16]:
def create_embedding(row):
    combined_text = ' '.join(str(row[field]) for field in ['course_description', 'course_description_new', 'course_description_short'])
    embedding = model.encode(combined_text, show_progress_bar= False)
    return embedding

In [17]:
files["embedding"] = files.apply(create_embedding, axis = 1)

In [18]:
vectors_to_upsert = [(str(row["course_name"]), row["embedding"].tolist()) for _, row in files.iterrows()]
index.upsert(vectors = vectors_to_upsert)
print("Data upsert to Pinecone index")

Data upsert to Pinecone index


In [19]:
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [20]:
query_results = index.query(
    vector= [query_embedding],
    top_k= 12,
    include_metadata= True
)

In [21]:
query_results

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [22]:
score_threshold = 0.3
for match in query_results["matches"]:
    if match['score'] >= score_threshold:
        print(f"Matched item ID: {match['id']}, score: {match['score']}")