In [1]:
import pandas as pd
import nltk
import spacy

# Getting data ready

## Stripping white space

In [None]:
# Load the combined articles file, then trim leading/trailing whitespace
# from both text columns.
df = pd.read_csv("articles_combined_all_1.csv")
for text_column in ("Title", "Content"):
    df[text_column] = df[text_column].str.strip()

## Lowercase text

In [36]:
# Lowercase every article body, write it back, and preview the first rows.
content_lower = df["Content"].str.lower()
df["Content"] = content_lower
df.head()

Unnamed: 0,Title,Content
0,How cold affects muscles,"when the mercury drops, it threatens your body..."
1,Two types of stretches,dr. eby recommends two kinds of stretches to w...
2,What else you can do,while stretching might be enough to avoid stif...
3,If pain doesn't go away,"if muscle pain doesn't go away, you might have..."
4,Why is exercise so important for seniors?,whether you were once much more physically act...


## Remove punctuation

In [None]:
import string

# Built once instead of on every call: maps each character in
# string.punctuation to None, i.e. deletes it during translate().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Parameters:
        text: The string to clean. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged, which
            matches how the ``.str`` accessor propagates missing values.

    Returns:
        The cleaned string, or the original object if it was not a string.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every article body and write the result back.
content_no_punct = df['Content'].apply(remove_punctuation)
df['Content'] = content_no_punct

df

Unnamed: 0,Title,Content
0,How cold affects muscles,when the mercury drops it threatens your bodys...
1,Two types of stretches,dr eby recommends two kinds of stretches to wa...
2,What else you can do,while stretching might be enough to avoid stif...
3,If pain doesn't go away,if muscle pain doesnt go away you might have a...
4,Why is exercise so important for seniors?,whether you were once much more physically act...
5,What are the best types of exercise?,while there are endless forms ofexercise exper...
6,How much exercise do I need?,how much exercise you should be getting depend...
7,What are the benefits of exercise?,a smartly designed exercise program will benef...
8,What if my exercise ability is limited?,everyone can and should do some form of exerci...
9,What exercises are best for heart health?,the best exercise program will incorporate bot...


## Remove special characters

In [8]:
import re

# Compiled once: matches any single character that is not an ASCII letter,
# an ASCII digit, or whitespace.
_SPECIAL_CHARACTERS = re.compile(r'[^a-zA-Z0-9\s]')


def remove_special_characters(text):
    """Return ``text`` with everything except ASCII letters, digits and
    whitespace removed.

    Parameters:
        text: The string to clean. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged instead
            of raising ``TypeError`` inside ``re``.

    Returns:
        The cleaned string, or the original object if it was not a string.
    """
    if not isinstance(text, str):
        return text
    return _SPECIAL_CHARACTERS.sub('', text)

# Drop the remaining special characters from every article body.
content_alnum = df['Content'].apply(remove_special_characters)
df['Content'] = content_alnum


In [9]:
# Display the frame as it stands after the special-character cleanup cell.
df

Unnamed: 0,Title,Content
0,How cold affects muscles,mercury drops threatens bodys core temperature...
1,Two types of stretches,Dr Eby recommends two kinds stretches ward col...
2,What else you can do,stretching might enough avoid stiff winter mus...
3,If pain doesn't go away,muscle pain go away might injury muscle strain...
4,Why is exercise so important for seniors?,Whether much physically active never one exerc...
...,...,...
89,What are the side effects?,antiobesity diabetes formulations potential si...
90,Who's a candidate for the drugs?,drugs approved weight loss people diagnosed ob...
91,How long do you take the drugs?,Taking one new GLP1s shortterm solution go dru...
92,Costs and coverage,Spectacular results come cheaply new drugs ran...


## Remove stopwords

In [5]:
from nltk.corpus import stopwords

# Make sure the stopword corpus is available locally, then keep the English
# list as a set so membership checks in remove_stopwords are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Remove stopwords
def remove_stopwords(text):
    """Drop every word of ``text`` that appears in the ``stop_words`` set.

    The text is split on whitespace, each word is lowercased for the
    membership check against the module-level ``stop_words`` set, and the
    surviving words (in their original casing) are joined with single spaces.

    Parameters:
        text: The string to filter. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged instead
            of raising ``AttributeError``.

    Returns:
        The filtered string, or the original object if it was not a string.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Filter stopwords out of every article body and write the result back.
content_no_stopwords = df['Content'].apply(remove_stopwords)
df['Content'] = content_no_stopwords

df


[nltk_data] Downloading package stopwords to /Users/sb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Title,Content
0,How cold affects muscles,"mercury drops, threatens body's core temperatu..."
1,Two types of stretches,Dr. Eby recommends two kinds stretches ward co...
2,What else you can do,stretching might enough avoid stiff winter mus...
3,If pain doesn't go away,"muscle pain go away, might injury, muscle stra..."
4,Why is exercise so important for seniors?,Whether much physically active never one exerc...
...,...,...
89,What are the side effects?,anti-obesity diabetes formulations potential s...
90,Who's a candidate for the drugs?,drugs approved weight loss people diagnosed ob...
91,How long do you take the drugs?,Taking one new GLP-1s short-term solution. go ...
92,Costs and coverage,"Spectacular results come cheaply, new drugs ra..."


## Stemming

In [None]:
# Download the NLTK resources listed below into the local nltk_data directory.
# NOTE(review): the cells visible in this notebook only use the stopword list
# and PorterStemmer; nothing visible calls a lemmatizer, tokenizer or POS
# tagger — confirm these four downloads are still needed.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /Users/sb/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/sb/nltk_data...
[nltk_data] Downloading package punkt to /Users/sb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/sb/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Create a single PorterStemmer at module level; apply_stemming looks this
# name up on every call, so the same instance is reused for all rows.
stemmer = PorterStemmer()

def apply_stemming(text):
    """Stem every whitespace-separated word of ``text``.

    Each word is passed through the module-level ``stemmer`` and the results
    are joined with single spaces.

    Parameters:
        text: The string to stem. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged instead
            of raising ``AttributeError``.

    Returns:
        The stemmed string, or the original object if it was not a string.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(stemmer.stem(word) for word in text.split())

# Stem every article body, write the result back, and preview the first rows
content_stemmed = df['Content'].apply(apply_stemming)
df['Content'] = content_stemmed

df.head()

Unnamed: 0,Title,Content
0,How cold affects muscles,mercuri drop threaten bodi core temperatur fun...
1,Two types of stretches,dr ebi recommend two kind stretch ward cold we...
2,What else you can do,stretch might enough avoid stiff winter muscl ...
3,If pain doesn't go away,muscl pain doesnt go away might injuri muscl s...
4,Why is exercise so important for seniors?,whether much physic activ never one exercis re...


# Generating embeddings

In [7]:
import os
import pandas as pd
import openai
import pinecone
from tqdm.notebook import tqdm

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Generate an embedding for the given text using OpenAI's embedding model.

    Parameters:
        text: Input text to embed.
        model: Embedding model name (default is "text-embedding-3-small").

    Returns:
        A list representing the embedding vector, taken from the first item
        of the API response (``response.data[0].embedding``).
    """
    response = openai.embeddings.create(input=text, model=model)
    # A single input string is sent, so only the first result is read.
    return response.data[0].embedding

In [45]:
# Register progress_apply on pandas objects, then embed every article body
# (one get_embedding call per row).
tqdm.pandas()

df['embedding'] = df['Content'].progress_apply(get_embedding)

  0%|          | 0/10 [00:00<?, ?it/s]

In [48]:
# Display the frame, which now carries the `embedding` column added above.
df

Unnamed: 0,Title,Content,embedding
0,How cold affects muscles,mercuri drop threaten bodi core temperatur fun...,"[0.042666640132665634, 0.03674071654677391, -0..."
1,Two types of stretches,dr ebi recommend two kind stretch ward cold we...,"[0.01926954835653305, 0.013544243760406971, -0..."
2,What else you can do,stretch might enough avoid stiff winter muscl ...,"[0.015518976375460625, 0.016931038349866867, -..."
3,If pain doesn't go away,muscl pain doesnt go away might injuri muscl s...,"[0.02554585598409176, 0.030388735234737396, -0..."
4,Why is exercise so important for seniors?,whether much physic activ never one exercis re...,"[0.018998412415385246, 0.044500019401311874, -..."


## Init Pinecone

In [49]:
# Read the Pinecone API key from the environment. No visible cell defines
# `pinecone_api_key`, so on a fresh kernel the original line raised NameError;
# this keeps the cell runnable under Restart & Run All and keeps the secret
# out of the notebook. A missing variable fails loudly with KeyError.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pc = pinecone.Pinecone(api_key=pinecone_api_key)

In [50]:
index_name = 'gen-fit'

# Create the index only when it is not already listed; otherwise reuse it.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    serverless_spec = pinecone.ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=serverless_spec,
    )

# Handle used by the upsert cell below.
index = pc.Index(index_name)

In [52]:
# Build one (id, vector, metadata) tuple per row. The id is the row's
# position as a string; the metadata carries the title and the processed
# content.
pinecone_data = [
    (str(position), vector, {'Topic': title, 'Content': content})
    for position, (vector, title, content) in enumerate(
        zip(df['embedding'], df['Title'], df['Content'])
    )
]

In [54]:
# Number of vectors sent per upsert request
batch_size = 100

# Walk the prepared tuples in consecutive slices and upsert each slice
for start in tqdm(range(0, len(pinecone_data), batch_size)):
    index.upsert(vectors=pinecone_data[start:start + batch_size])

  0%|          | 0/1 [00:00<?, ?it/s]