In [1]:
# Load the dotenv IPython extension and invoke its %dotenv line magic.
# NOTE(review): presumably this reads a local .env file into the environment — confirm;
# a later cell also calls load_dotenv() explicitly.
%load_ext dotenv
%dotenv

In [2]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

In [3]:
# Load the course/section descriptions into a DataFrame.
# The codec is named explicitly as "cp1252" (Windows-1252): the "ANSI" alias only
# resolves on Windows (it maps to the mbcs codec) and raises LookupError on
# Linux/macOS, which makes the notebook non-portable.
# NOTE(review): assumes the CSV was saved with the Western-European Windows code page — confirm.
files = pd.read_csv("course_section_descriptions.csv", encoding="cp1252")

In [4]:
# Show column names, non-null counts and dtypes of the loaded frame.
files.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   course_id                 680 non-null    int64 
 1   course_name               680 non-null    object
 2   course_slug               680 non-null    object
 3   course_description        680 non-null    object
 4   course_description_short  680 non-null    object
 5   course_technology         680 non-null    object
 6   course_topic              680 non-null    object
 7   course_instructor_quote   660 non-null    object
 8   section_id                680 non-null    int64 
 9   section_name              680 non-null    object
 10  section_description       680 non-null    object
dtypes: int64(2), object(9)
memory usage: 58.6+ KB


In [5]:
# Preview the first rows of the loaded frame.
files.head()

Unnamed: 0,course_id,course_name,course_slug,course_description,course_description_short,course_technology,course_topic,course_instructor_quote,section_id,section_name,section_description
0,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,9,Introduction to Tableau,While Tableau is an indispensable tool in the ...
1,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,10,Tableau Functionalities,"In this section, you will create your first Ta..."
2,2,Introduction to Tableau,tableau,Tableau is now one of the most popular busines...,Teaching you how to tell compelling stories wi...,tableau,data visualization,Data scientists don’t just need to deal with d...,11,The Tableau Exercise,This section is a practical example that will ...
3,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,12,Introduction,"In this section, you will learn about the impo..."
4,3,The Complete Data Visualization Course with Py...,data-visualization,The Data Visualization course is designed for ...,Teaching you how to master the art of creating...,python,data visualization,Data visualization is the face of data. Many p...,13,Setting Up the Environments,"Here, we set up different environments for the..."


In [6]:
# Build the vector ID for each row as "<course_id> - <section_id>".
files["unique_id"] = (
    files["course_id"]
    .astype(str)
    .str.cat(files["section_id"].astype(str), sep=" - ")
)

In [7]:
# Attach one metadata dict per row holding the course name, section name and
# section description; these dicts are stored alongside each vector later on.
files["metadata"] = files[
    ["course_name", "section_name", "section_description"]
].to_dict(orient="records")

In [8]:
# Preview the per-row metadata dicts built in the previous cell.
files["metadata"].head()

0    {'course_name': 'Introduction to Tableau', 'se...
1    {'course_name': 'Introduction to Tableau', 'se...
2    {'course_name': 'Introduction to Tableau', 'se...
3    {'course_name': 'The Complete Data Visualizati...
4    {'course_name': 'The Complete Data Visualizati...
Name: metadata, dtype: object

In [10]:
def create_embedding(row):
    """Encode one course-section record into an embedding.

    The course name, course technology, course description, section name and
    section description are written on consecutive lines of a single text
    passage, which is then passed to the module-level ``model``.

    Args:
        row: Record supporting ``row[key]`` lookups for "course_name",
            "course_technology", "course_description", "section_name" and
            "section_description" (e.g. a DataFrame row).

    Returns:
        The value returned by ``model.encode`` for the passage, with the
        progress bar disabled.
    """
    # The layout of this literal (including its indentation) is part of the
    # text that gets encoded, so it is kept exactly as is.
    text_to_embed = f'''{row["course_name"]}
                        {row["course_technology"]}
                        {row["course_description"]}
                        {row["section_name"]}
                        {row["section_description"]}
                     '''
    embedding = model.encode(text_to_embed, show_progress_bar=False)
    return embedding

In [11]:
# Sentence-embedding model used by create_embedding() and for encoding queries.
# NOTE(review): the Pinecone index dimension configured further down (384) has to
# match this model's output size — confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [12]:
# Encode every row with create_embedding and keep the result in an "embedding" column
# (each entry is converted with .tolist() before being upserted).
files["embedding"] = files.apply(create_embedding, axis= 1)

In [14]:
# Locate a .env file and load it; override=True lets its values replace
# variables that are already set in the environment.
load_dotenv(find_dotenv(), override= True)

True

In [15]:
# Create the Pinecone client from credentials held in environment variables.
# The variable name must be spelled exactly ("PINECONE_API_KEY", no stray
# whitespace); a name with a trailing space is a different key and the lookup
# would yield None instead of the API key.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

In [19]:
# Display the indexes that currently exist for this Pinecone client.
pc.list_indexes()

[
    {
        "name": "my-index",
        "metric": "cosine",
        "host": "my-index-03idc8k.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "text",
        "metric": "cosine",
        "host": "text-03idc8k.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "my-index-3"

In [17]:
# Settings for the index that stores the section embeddings:
# its name, the vector dimension and the similarity metric.
index_name, dimension, metric = "my-index", 384, "cosine"

In [20]:
# Start from a clean slate: drop the index if one with this name already exists.
if any(existing.name == index_name for existing in pc.list_indexes()):
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted")
else:
    print(f"{index_name}  not in index list")

my-index successfully deleted


In [21]:
# Create a serverless index (AWS, us-east-1) using the name, dimension and
# metric configured above; the call's return value is displayed as the cell output.
pc.create_index(
    name=index_name,
    dimension=dimension,
    metric=metric,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

{
    "name": "my-index",
    "metric": "cosine",
    "host": "my-index-03idc8k.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [22]:
# Handle to the index named by index_name; used below for upserts and queries.
index = pc.Index(index_name)

In [23]:
# One (id, values, metadata) tuple per row, with the embedding converted to a plain list.
vectors_to_upsert = [
    (vector_id, embedding.tolist(), metadata)
    for vector_id, embedding, metadata in zip(
        files["unique_id"], files["embedding"], files["metadata"]
    )
]

In [24]:
# Send the vectors to the index in batches rather than as one large request,
# so that each request stays small regardless of how many rows the dataset has.
# NOTE(review): 100 is a conservative batch size — tune against Pinecone's documented request limits.
index.upsert(vectors=vectors_to_upsert, batch_size=100)
print("Data successfully upserted to Pinecone index")

Data successfully upserted to Pinecone index


In [25]:
# Encode the search phrase with the same model used for the stored vectors
# and convert the result to a plain list.
query = "clustering"
query_embedding = model.encode(query,  show_progress_bar= False).tolist()

In [27]:
# Retrieve the 12 stored sections closest to the query, including their metadata.
# `vector` receives the embedding itself: query_embedding is already a flat
# list of floats (from .tolist()), so it must not be wrapped in another list.
query_results = index.query(
    vector=query_embedding,
    top_k=12,
    include_metadata=True
)

In [28]:
# Minimum similarity score a match needs in order to be printed below.
score_threshold = 0.3

In [29]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 51 - 469, Score: 0.545678616
Course: Machine Learning in Excel 
Section: Cluster Analysis 
Description: Cluster analysis is the most intuitive and important example of unsupervised learning. However, to be able to understand cluster analysis, you must first become familiar with the mathematics behind it. Here we will explore the fundamentals of cluster analysis and have a look at the differences between clustering and classification.
Matched item ID: 37 - 374, Score: 0.542693496
Course: Machine Learning in Python 
Section: Other Types of Clustering 
Description: In previous sections, we focus extensively on k-means clustering, as it is the fastest and most efficient method for clustering. In this section, we explore other approaches that are less common.
Matched item ID: 51 - 470, Score: 0.506335199
Course: Machine Learning in Excel 
Section: K-means Clustering 
Description: Master K-means clustering in Excel by learning how to choose the number of clusters in your ana

In [30]:
# Settings for a second index: name, vector dimension and similarity metric.
index_name, dimension, metric = "bert", 768, "cosine"

In [31]:
# Remove any existing index with this name before it is (re)created.
if index_name not in {existing.name for existing in pc.list_indexes()}:
    print(f"{index_name}  not in index list")
else:
    pc.delete_index(index_name)
    print(f"{index_name} successfully deleted")

bert  not in index list


In [32]:
# Create the serverless index (AWS, us-east-1) described by the current
# index_name / dimension / metric settings; the result is shown as cell output.
pc.create_index(
    name=index_name, dimension=dimension, metric=metric,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

{
    "name": "bert",
    "metric": "cosine",
    "host": "bert-03idc8k.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [33]:
# Display the available indexes again, after the create call above.
pc.list_indexes()

[
    {
        "name": "text",
        "metric": "cosine",
        "host": "text-03idc8k.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "my-index",
        "metric": "cosine",
        "host": "my-index-03idc8k.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "bert",
    

In [34]:
# Encode a new search phrase with `model` and convert the result to a plain list.
query = "regression in Python"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [35]:
# Retrieve the 12 closest stored sections for the new query, with metadata.
# `vector` receives the embedding itself: query_embedding is already a flat
# list of floats (from .tolist()), so it must not be wrapped in another list.
# NOTE(review): when cells run top to bottom, `index` is still the handle opened
# earlier for "my-index"; it is only re-pointed at the current index_name in a
# later cell — confirm that querying the earlier index here is intended.
query_results = index.query(
    vector=query_embedding,
    top_k=12,
    include_metadata=True
)

In [36]:
# Minimum similarity score a match needs in order to be printed below.
score_threshold = 0.4

In [37]:
# Assuming query_results are fetched and include metadata
for match in query_results['matches']:
    if match['score'] >= score_threshold:
        course_details = match.get('metadata', {})
        course_name = course_details.get('course_name', 'N/A')
        section_name = course_details.get('section_name', 'N/A')
        section_description = course_details.get('section_description', 'No description available')
        
        print(f"Matched item ID: {match['id']}, Score: {match['score']}")
        print(f"Course: {course_name} \nSection: {section_name} \nDescription: {section_description}")

Matched item ID: 37 - 369, Score: 0.743426919
Course: Machine Learning in Python 
Section: Linear Regression with sklearn 
Description: While there are many libraries that can compute a regression model, the most numerically stable one is sklearn. It is also the preferred choice of many machine learning professionals. In this section, we implement all we know about regressions in this amazing library.
Matched item ID: 36 - 363, Score: 0.668548405
Course: Python for Finance 
Section: Using Regressions for Financial Analysis 
Description: Understanding rates of return and risk is not all there is about finance. Working with regression analysis is a must, and you will see that Python only helps you to be quicker and more precise when doing such estimations.
Matched item ID: 37 - 368, Score: 0.645793319
Course: Machine Learning in Python 
Section: Linear Regression 
Description: In this part of the course, we will discuss what the course covers, why you need to learn advanced statistics, w

In [38]:
# Re-point `index` at the index named by the current index_name
# ("bert" if the cells above ran in order).
index = pc.Index(index_name)