In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("watts2/glove6b50dtxt")

print("Path to dataset files:", path)

In [18]:
import numpy as np

def load_glove_vectors(filename):
    word_to_vec_map = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_to_vec_map[word] = coefs
    return word_to_vec_map

# Use the path to your unzipped file
word_to_vec_map = load_glove_vectors("/content/1/glove.6B.50d.txt")

print(f"Loaded {len(word_to_vec_map)} words.")
# Quick check:
# print(word_to_vec_map["mental"])

Loaded 400000 words.


In [23]:
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: sentence_to_avg

def sentence_to_avg(sentence, word_to_vec_map):
    """
    Converts a sentence (string) into a list of words (strings). Extracts the GloVe representation of each word
    and averages its value into a single vector encoding the meaning of the sentence.

    Arguments:
    sentence -- string, one training example from X
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation

    Returns:
    avg -- average vector encoding information about the sentence, numpy-array of shape (J,), where J can be any number
    """
    # Get a valid word contained in the word_to_vec_map.
    any_word = next(iter(word_to_vec_map.keys()))

    ### START CODE HERE ###
    # Step 1: Split sentence into list of lower case words (≈ 1 line)
    words = sentence.lower().split()

    # Initialize the average word vector, should have the same shape as your word vectors.
    # Use `np.zeros` and pass in the argument of any word's word 2 vec's shape
    avg = np.zeros_like(word_to_vec_map[any_word])

    # Initialize count to 0
    count = 0

    # Step 2: average the word vectors. You can loop over the words in the list "words".
    for w in words:
        # Check that word exists in word_to_vec_map
        if w in word_to_vec_map:
            avg += word_to_vec_map[w]
            # Increment count
            count +=1

    if count > 0:
        # Get the average. But only if count > 0
        avg = avg / count

    ### END CODE HERE ###

    return avg

def cosine_similarity(u, v):
    """
    u and v are 50D vectors
    Returns a scalar between -1 and 1
    """
    dot = np.dot(u, v)
    norm_u = np.sqrt(np.sum(u**2))
    norm_v = np.sqrt(np.sum(v**2))

    return dot / (norm_u * norm_v)

In [14]:
import pandas as pd
import numpy as np

df = pd.read_csv("train.csv")

def process_text(text):
    return [word.lower() for word in str(text).split()]

x_list = df["Context"].apply(process_text).tolist()
y_list = df["Response"].apply(process_text).tolist()

x = np.array(x_list, dtype=object)
y = np.array(y_list, dtype=object)

print("Shape of x:", x.shape)
print("Shape of y:", y.shape)

print("\nSample from x:", x[0])
print("Sample from y:", y[0])

Shape of x: (3512,)
Shape of y: (3512,)

Sample from x: ["i'm", 'going', 'through', 'some', 'things', 'with', 'my', 'feelings', 'and', 'myself.', 'i', 'barely', 'sleep', 'and', 'i', 'do', 'nothing', 'but', 'think', 'about', 'how', "i'm", 'worthless', 'and', 'how', 'i', "shouldn't", 'be', 'here.', "i've", 'never', 'tried', 'or', 'contemplated', 'suicide.', "i've", 'always', 'wanted', 'to', 'fix', 'my', 'issues,', 'but', 'i', 'never', 'get', 'around', 'to', 'it.', 'how', 'can', 'i', 'change', 'my', 'feeling', 'of', 'being', 'worthless', 'to', 'everyone?']
Sample from y: ['if', 'everyone', 'thinks', "you're", 'worthless,', 'then', 'maybe', 'you', 'need', 'to', 'find', 'new', 'people', 'to', 'hang', 'out', 'with.seriously,', 'the', 'social', 'context', 'in', 'which', 'a', 'person', 'lives', 'is', 'a', 'big', 'influence', 'in', 'self-esteem.otherwise,', 'you', 'can', 'go', 'round', 'and', 'round', 'trying', 'to', 'understand', 'why', "you're", 'not', 'worthless,', 'then', 'go', 'back', 'to'

In [20]:
X_avg = np.zeros((len(x), 50)) #  50dimensional GloVe vectors
for i, sentence in enumerate(x):
    # Join the list of words back into a sentence string before passing to sentence_to_avg
    X_avg[i, :] = sentence_to_avg(" ".join(sentence), word_to_vec_map)

Y_avg = np.zeros((len(y), 50)) # Assuming 50-dimensional GloVe vectors
for i, sentence in enumerate(y):
    # Join the list of words back into a sentence string before passing to sentence_to_avg
    Y_avg[i, :] = sentence_to_avg(" ".join(sentence), word_to_vec_map)

print("Shape of X_avg:", X_avg.shape)
print("Sample from X_avg[0]:", X_avg[0])
print("Shape of Y_avg:", Y_avg.shape)
print("Sample from Y_avg[0]:", Y_avg[0])

Shape of X_avg: (3512, 50)
Sample from X_avg[0]: [ 3.20663095e-01 -1.34526091e-02  5.09688482e-02 -4.20556724e-01
  4.55177903e-01  3.61130424e-02 -3.64430010e-01  1.48210585e-01
 -4.07317728e-01  1.12481192e-01 -6.54325560e-02  3.89136791e-01
 -4.57271159e-01 -1.58018991e-01  7.27602482e-01  4.51524943e-01
  1.13516107e-01  1.49551658e-02  4.31046542e-03 -6.17001295e-01
 -1.40114322e-01  4.68794078e-01  4.92342889e-01  6.54025599e-02
  5.70560873e-01 -1.78138566e+00 -5.94556510e-01  1.72539726e-01
  7.04690576e-01 -5.59863269e-01  3.22232842e+00  4.27696347e-01
 -3.43404233e-01 -3.23904604e-01 -7.69872591e-02 -6.29245043e-02
  4.63481285e-02  2.46352062e-01  1.92933530e-01 -2.58632720e-01
 -2.07722038e-01  1.72711723e-03  5.48051782e-02  4.87265021e-01
  1.78592149e-02  2.27993485e-02 -1.04879856e-01 -1.39985636e-01
 -1.06599674e-01  1.87141627e-01]
Shape of Y_avg: (3512, 50)
Sample from Y_avg[0]: [ 0.35460755  0.13424733  0.01826309 -0.17842031  0.5244537   0.1702563
 -0.27376515 -0.

In [19]:
sample_sentence = "i'm going through some things with my feelings and myself. i barely sleep and i do nothing but think about how i'm worthless and how i shouldn't be here."
sentence_avg_vector = sentence_to_avg(sample_sentence, word_to_vec_map)
print("Average vector for the sample sentence:")
print(sentence_avg_vector)
print("Shape of the average vector:", sentence_avg_vector.shape)

Average vector for the sample sentence:
[ 3.1738600e-01  2.9219756e-02 -3.6562482e-04 -3.9140096e-01
  4.6632504e-01  3.8936365e-02 -3.6475170e-01  7.1111210e-02
 -4.2038286e-01  1.5579800e-01 -1.2603195e-01  3.6152682e-01
 -4.8985612e-01 -1.7449398e-01  7.4724251e-01  3.9937147e-01
  1.3942277e-01  2.9140307e-02  8.2564922e-03 -6.5623337e-01
 -1.7651133e-01  5.6996685e-01  6.0269123e-01  7.2351195e-02
  6.3015360e-01 -1.7017046e+00 -6.3810748e-01  2.2088563e-01
  7.2822809e-01 -5.8090907e-01  3.3109086e+00  4.7035488e-01
 -2.1970063e-01 -3.6597267e-01 -1.1519405e-01 -5.6312710e-02
 -2.8139876e-02  2.6753119e-01  2.5225717e-01 -1.9168071e-01
 -2.2650503e-01  4.7692880e-02  7.6816879e-02  5.0755048e-01
  8.2175702e-02  6.7064174e-02 -8.6732380e-02 -1.3369381e-01
 -1.5067759e-01  2.4627985e-01]
Shape of the average vector: (50,)


In [30]:
def get_answer(user_query, question_matrix, df, word_to_vec_map):
    # 1. Vectorize the user's input
    query_vec = sentence_to_avg(user_query, word_to_vec_map)

    # 2. Compare against every question in your database
    # We'll store all similarities in a list
    similarities = []
    for q_vec in question_matrix:
        sim = cosine_similarity(query_vec, q_vec)
        similarities.append(sim)

    # 3. Find the index of the highest similarity
    best_idx = np.argmax(similarities)

    # 4. Return the corresponding answer
    return df.iloc[best_idx]['Response'], similarities[best_idx]

In [31]:
class Model:
  def __init__(self, question_matrix, df, word_to_vec_map):
    self.question_matrix = question_matrix
    self.df = df
    self.word_to_vec_map = word_to_vec_map

  def __call__(self, user_query):
    return get_answer(user_query, self.question_matrix, self.df, self.word_to_vec_map)

model = Model(X_avg, df, word_to_vec_map)

In [50]:
# use vector search
context =  "I am feeling very overwhelmed with my work." # df["Context"][0]
answer = model(context)

In [51]:
context

'I am feeling very overwhelmed with my work.'

In [52]:
answer

('It is hard to make a definite diagnosis however I would say \xa0that it could be and or a combination of depression, stress, PTSD, etc. \xa0More background information would have to be needed. \xa0One thing I would say is that you really need to seek guidance from a professional to work through these symptoms. \xa0If what you described as a caged animal is released, all the crying, sadness, and controllable emotions will come out. \xa0That would not be good when they do. \xa0Get help before they do..',
 np.float64(0.9687353701168416))

# **this is the end of the lab for vector search, the rest are jus experiments**

# now I am gonna use RAG with gpt 2 to pick the best answer from the vector search.



In [58]:
from transformers import pipeline, set_seed

# this will download the model weights
generator = pipeline('text-generation', model='gpt2')

def generate_assistant_response(user_query, question_matrix, df, word_to_vec_map):
    # --- Part 1: same previous logicc from the rag
    query_vec = sentence_to_avg(user_query, word_to_vec_map)
    similarities = [cosine_similarity(query_vec, q_vec) for q_vec in question_matrix]
    best_idx = np.argmax(similarities)

    # vector search using cos sim
    retrieved_answer = df.iloc[best_idx]['Response']

    # --- Part 2: The Generation Logic using gpt2
    # i provide "Context" so the model doesn't just make things up
    prompt = f"Context: {retrieved_answer}\n\nQuestion: {user_query}\n\nAssistant's helpful response:"

    # Generate the response
    response = generator("hi", max_length=150, num_return_sequences=1, truncation=True)

    return response[0]['generated_text']

# Test it out!
print(generate_assistant_response("I feel very anxious", X_avg, df, word_to_vec_map))

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


hi, I think he's a good shooter. He's got good instincts. He's got some great instincts. He's got good hands. He's got a good understanding of where he's going and what he's going to do."

J.J. Barea, a point guard who averaged 11.3 points and 6.8 assists in 30 games with Dallas last season, said he's "never been a big threat or a big shooter."

"He's got to be able to score fast," Barea said. "He's got to be able to get open. He's a good shooter. He's got to be able to do a good job. He's got to be able to get into the paint."

Barea's offensive game has been a problem for the Mavericks, who have struggled to score in the paint.

The Mavericks (9-4) have won four straight and have lost four straight road games, including a five-game trip to Portland on Wednesday.

Copyright 2013 by STATS LLC and Associated Press. Any commercial use or distribution without the express written consent of STATS LLC and Associated Press is strictly prohibited


OFC, Halllucination, lets try flan-t5

In [60]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# 1. Load a model that is actually trained to follow instructions
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


def generate_clean_response(user_query, retrieved_answer):

    prompt = f"Answer the following question using the provided context.\nContext: {retrieved_answer}\nQuestion: {user_query}"

    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt")


    # T5 is much more concise and less likely to hallucinate
    output_tokens = model.generate(
        **inputs,
        max_length=100,
        do_sample=False,
        num_return_sequences=1
    )
    # Decode the generated tokens
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return generated_text

Loading weights:   0%|          | 0/190 [00:00<?, ?it/s]



In [63]:
def generate_assistant_response(user_query, question_matrix, df, word_to_vec_map):
    # --- Part 1: same previous logicc from the rag
    query_vec = sentence_to_avg(user_query, word_to_vec_map)
    similarities = [cosine_similarity(query_vec, q_vec) for q_vec in question_matrix]
    best_idx = np.argmax(similarities)

    # vector search using cos sim
    retrieved_answer = df.iloc[best_idx]['Response']

    # --- Part 2: The Generation Logic using gpt2
    # i provide "Context" so the model doesn't just make things up
    prompt = f"Context: {retrieved_answer}\n\nQuestion: {user_query}\n\nAssistant's helpful response:"

    # Generate the response
    response =  generate_clean_response("hi", "how are you")   #generator("hi", max_length=150, num_return_sequences=1, truncation=True)

    return response

# Test it out!
print(generate_assistant_response("I feel very anxious", X_avg, df, word_to_vec_map))

i am a student


# **both are small LMs so no wonder why they suck at understanding context. the next lab will be a better soln than these LMs**