In [6]:
import pandas as pd
import google.generativeai as genai
import json
from secret_key import GOOGLE_API_KEY
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Configure the google.generativeai client with the API key imported from
# the local secret_key module (keeps the key itself out of the notebook).
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Input CSV that the loading cell reads, and the JSON file that the save cell
# writes the embedded chunks to.
csv_file = "cleaned_programs.csv"
output_filename = "../dataset/embedded_programs.json"

In [4]:
# Load the programs CSV and flatten it into a single newline-separated string,
# one "College / Program Name / Duration" line per row.
# If the file is missing, full_text is set to None so the chunking cell's
# `if full_text:` guard skips the rest of the pipeline.
print(f"Reading and chunking data from '{csv_file}'")
try:
    df = pd.read_csv(csv_file)
except FileNotFoundError:
    print(f"error, {csv_file} not found")
    full_text = None
else:
    # A plain join over the rows is used instead of
    # df.apply(..., axis=1).str.cat(...): on a frame with no rows, apply(axis=1)
    # returns a DataFrame, which has no .str accessor. Here an empty frame
    # simply yields "" (falsy), so downstream cells do nothing.
    full_text = "\n".join(
        f"College: {row['college_name']}. Program Name: {row['program_name']}. Duration: {row['duration']}"
        for _, row in df.iterrows()
    )


Reading and chunking data from 'cleaned_programs.csv


In [17]:
# Split the flattened text into overlapping chunks for embedding.
# `chunks` stays an empty list when there is no text to split.
chunks = []
if full_text:
    # Separators are tried in order: paragraph break, line break, sentence end.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "separators": ["\n\n", "\n", ".", "!"],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = text_splitter.split_text(full_text)
    print(f"Done, created {len(chunks)} chunks")

Done, created 77 chunks


In [18]:
# Generating embeddings

# Request one embedding per chunk from "models/embedding-001" and store each
# vector next to its source text, so every saved record is self-describing.
# NOTE(review): task_type is "SEMANTIC_SIMILARITY"; if these vectors back a
# retrieval index, a retrieval-oriented task type may fit better - confirm.
embedded_chunks = []
if chunks:
    print("Starting Embeding process for all chunks")
    for chunk in chunks:
        response = genai.embed_content(
            model="models/embedding-001",
            content=chunk,
            task_type="SEMANTIC_SIMILARITY",
        )

        # Storing the chunk text and the embedding
        embedded_chunks.append({
            "chunk_text": chunk,
            "embedding": response["embedding"],
        })

    print(f"Embedding done, {len(embedded_chunks)} chunks embedded")

Starting Embeding process for all chunks
Embedding done, 77 chunks embedded


In [19]:
# Persist the embedded records as indented JSON. An empty list means an
# earlier cell produced nothing, so report that instead of writing a file.
if not embedded_chunks:
    print("OH NO")
else:
    with open(output_filename, 'w') as out_file:
        json.dump(embedded_chunks, out_file, indent=4)
    print(f"\n Successfully saved chunks to {output_filename}")


 Successfully saved chunks to ../dataset/embedded_programs.json


In [20]:
# Rebind the input/output paths for the second run: the colleges CSV is read
# next and its embedded chunks are written to a separate JSON file.
csv_file = "../dataset/colleges.csv"
output_filename = "../dataset/embedded_colleges.json"

In [22]:
# Load the colleges CSV and flatten it into a single newline-separated string,
# one "College / University / Location / url" line per row.
# If the file is missing, full_text is set to None so the chunking cell's
# `if full_text:` guard skips the rest of the pipeline.
print(f"Reading and chunking data from '{csv_file}'")
try:
    df = pd.read_csv(csv_file)
except FileNotFoundError:
    print(f"error, {csv_file} not found")
    full_text = None
else:
    # A plain join over the rows is used instead of
    # df.apply(..., axis=1).str.cat(...): on a frame with no rows, apply(axis=1)
    # returns a DataFrame, which has no .str accessor. Here an empty frame
    # simply yields "" (falsy), so downstream cells do nothing.
    # A space now follows every field-ending period; the original was missing
    # the one before "Location".
    full_text = "\n".join(
        f"College: {row['name']}. University: {row['university']}. Location: {row['location']}. url: {row['url']}"
        for _, row in df.iterrows()
    )


Reading and chunking data from '../dataset/colleges.csv


In [23]:
# Split the flattened text into overlapping chunks for embedding.
# `chunks` stays an empty list when there is no text to split.
chunks = []
if full_text:
    # Separators are tried in order: paragraph break, line break, sentence end.
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 100,
        "separators": ["\n\n", "\n", ".", "!"],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = text_splitter.split_text(full_text)
    print(f"Done, created {len(chunks)} chunks")

Done, created 25 chunks


In [24]:
# Generating embeddings

# Request one embedding per chunk from "models/embedding-001" and store each
# vector next to its source text, so every saved record is self-describing.
# NOTE(review): task_type is "SEMANTIC_SIMILARITY"; if these vectors back a
# retrieval index, a retrieval-oriented task type may fit better - confirm.
embedded_chunks = []
if chunks:
    print("Starting Embeding process for all chunks")
    for chunk in chunks:
        response = genai.embed_content(
            model="models/embedding-001",
            content=chunk,
            task_type="SEMANTIC_SIMILARITY",
        )

        # Storing the chunk text and the embedding
        embedded_chunks.append({
            "chunk_text": chunk,
            "embedding": response["embedding"],
        })

    print(f"Embedding done, {len(embedded_chunks)} chunks embedded")

Starting Embeding process for all chunks
Embedding done, 25 chunks embedded


In [25]:
# Persist the embedded records as indented JSON. An empty list means an
# earlier cell produced nothing, so report that instead of writing a file.
if not embedded_chunks:
    print("OH NO")
else:
    with open(output_filename, 'w') as out_file:
        json.dump(embedded_chunks, out_file, indent=4)
    print(f"\n Successfully saved chunks to {output_filename}")


 Successfully saved chunks to ../dataset/embedded_colleges.json


In [27]:
# Sanity check: number of keys in the first stored record. Each record is
# built with exactly two, "chunk_text" and "embedding".
len(embedded_chunks[0])

2