In [25]:
# Seed records for the QA model: each entry pairs a resource category and a
# representative question with the published Google Doc that answers it.
# Every URL uses the published-document form
#   https://docs.google.com/document/d/e/<id>/pub
# NOTE(review): five links (Final Year Projects, English Communication,
# Internships, Scholarships, Women) were normalised to this form -- four were
# missing the "/d/" path segment and one carried a "/u/8/" account selector.
# The document ids are unchanged; confirm the links resolve.
dataset = [
    {"category": "Resources for an MBA abroad", "question": "What are resources for MBA abroad?", "url": "https://docs.google.com/document/d/e/2PACX-1vSUlhS67W5ZMDPcyOuQk-NQSinjd2K8-mLM_3LDJvjKhywGQ8rLZ1SnefvMqgQ7I4v4WgeBzehvAxS0/pub"},
    {"category": "Resources for an MBA in India", "question": "Tell me about MBA in India", "url": "https://docs.google.com/document/d/e/2PACX-1vQp9CzRr21BPH0rvHaOCGSI18qgNB6ys1ANIPfijPF4BIr4WZJL-EqtXWScylo5mhQ3c81fpNDttiMb/pub"},
    {"category": "Resources for an ME/MTech/PhD in India", "question": "What resources are available for ME/MTech/PhD in India?", "url": "https://docs.google.com/document/d/e/2PACX-1vS1KFz-1zhajew8l4wXsSWktvA9VoMqhLlegm74oVtls4Lo_w7G5KY48YGu6ZSneprkotJ9DHuiBfwd/pub"},
    {"category": "Resources for an MS/PhD Abroad", "question": "What are the resources for MS/PhD abroad?", "url": "https://docs.google.com/document/d/e/2PACX-1vT3NCHqZuWD8qtXMXmsPTH4MCtDxcwghq99QrIbBckQuLZY6ze-Zoyz9c9JHc0n1rlKjRLvCUjtOCnm/pub"},
    {"category": "Resources for Campus Placement", "question": "How to prepare for campus placement?", "url": "https://docs.google.com/document/d/e/2PACX-1vQlLBNig8Ke8a8mKC5lkamJRS_T-vK2AAgm9ocUJf-BnsgawQTWYu5K3g_g5wjp4ICW2noW4Y-agt7w/pub"},
    {"category": "Resources for Careers in the Indian Government", "question": "How to start a career in the Indian Government?", "url": "https://docs.google.com/document/d/e/2PACX-1vQl57OfH4cdm5WKuDc5wvQ3ir3VmpEv0NiBOYN1VG-DGKAv-1mM-16RGnTA69Gpz11yMQL8sySFNmtp/pub"},
    {"category": "Resources for Competitions", "question": "What resources are available for competitions?", "url": "https://docs.google.com/document/d/e/2PACX-1vRC-g5k4emTEAIpzDbrOWuNe-SlO8ZqFSuesGSvIVQS69JDq0pKp6p-nsx9LekkUkjnbf-hkpEcRkCN/pub"},
    {"category": "Resources for Entrepreneurship and Startups", "question": "How to start a startup or entrepreneurship journey?", "url": "https://docs.google.com/document/d/e/2PACX-1vQ2HTQJDC90ElSmFzwTdk7_IZGJWA4IBhnuuXVeDtZ-5uU5ifgQph79fBCEoNpdYVwYOezGqSJjZRah/pub"},
    {"category": "Resources for Final Year Projects", "question": "What are some resources for final year projects?", "url": "https://docs.google.com/document/d/e/2PACX-1vT_VX0_uoHMmPpeIY7M8Kwx8Bk_EZW8xY1P5M78WXxnWmsguXe_9hrlOcTkicTb4dR52CbPgKqxxJJs/pub"},
    {"category": "Resources for Improving English Communication", "question": "How to improve English communication skills?", "url": "https://docs.google.com/document/d/e/2PACX-1vTYcFVL-01ypnI0WYW_7fTCqTNV7rGLNS9aPXjub3QCMdiyLs5GHg7f2hg3Z9vCXwt2lQJCK9FoRA-2/pub"},
    {"category": "Resources for Internships", "question": "What are resources for finding internships?", "url": "https://docs.google.com/document/d/e/2PACX-1vQz7Shgl5wus3uaI31j1RYuMhighQ6qDrANsnAMvaeP59pn3QmOdI7dO0PiouMnMb5NH1v6KNEyRCxE/pub"},
    {"category": "Resources for Scholarships", "question": "Where to find scholarships and funding opportunities?", "url": "https://docs.google.com/document/d/e/2PACX-1vS2h_j2y9qeuRvXolGpZ9QNcmkHGenXH-B_0wlKY1ERL9U1Tpa8LQg89TQ9Y9CwH93UKoeiSb89yi-X/pub"},
    {"category": "Resources for Women", "question": "What resources are available for women?", "url": "https://docs.google.com/document/d/e/2PACX-1vQ0Mr5p-NgoKLg_RPAnKbt3Ig-aYInNFSGjWFLIzRStFm-bK_zBawOMCULWKUh6DY4FC040_gUHxaB-/pub"}
]


In [26]:
pip install sentence_transformers



In [27]:
from sentence_transformers import SentenceTransformer

# The rest of the notebook refers to the records as `expanded_dataset`; this is
# the very same list object as `dataset` (an alias, not a copy).
expanded_dataset = dataset

# Sentence-embedding encoder used by generate_embeddings() below.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embeddings(texts):
    """Embed ``texts`` with the notebook-level ``model`` encoder.

    Args:
        texts: The strings to embed; handed unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(texts)`` returns (presumably one embedding
        vector per input text -- confirm against the encoder's documentation).
    """
    embeddings = model.encode(texts)
    return embeddings

# Embed each record's question text and its category label separately.
# Both input lists are built in dataset order.
question_embeddings = generate_embeddings(
    [record['question'] for record in expanded_dataset]
)
category_embeddings = generate_embeddings(
    [record['category'] for record in expanded_dataset]
)

In [28]:
import numpy as np
import json
from scipy.spatial.distance import cosine
# Persist the dataset records as JSON; ChittiQAModel reads this file back.
with open('expanded_dataset.json', 'w') as f:
    f.write(json.dumps(expanded_dataset))

# Persist both embedding arrays in NumPy's .npy format.
np.save('category_embeddings.npy', category_embeddings)
np.save('question_embeddings.npy', question_embeddings)

In [29]:
class ChittiQAModel:
    """Retrieval-style QA model that maps a free-text query to a resource URL.

    A query is embedded with a sentence-transformer and compared, by cosine
    similarity, against pre-computed embeddings of every dataset entry's
    question and category. The entry with the highest weighted similarity is
    returned, provided its score reaches ``similarity_threshold``.

    The class-level attributes below hold the default scoring parameters. An
    instance that was created without running ``__init__`` (for example one
    unpickled from a file written before these settings existed as instance
    attributes) falls back to them.
    """

    similarity_threshold = 0.5  # minimum combined score for a match to count
    question_weight = 0.7       # weight of the query-vs-question similarity
    category_weight = 0.3       # weight of the query-vs-category similarity

    def __init__(
        self,
        model_name='all-MiniLM-L6-v2',
        question_embeddings_path='question_embeddings.npy',
        category_embeddings_path='category_embeddings.npy',
        dataset_path='expanded_dataset.json',
        similarity_threshold=0.5,
        question_weight=0.7,
        category_weight=0.3,
    ):
        """Load the encoder, the saved embeddings and the dataset records.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            question_embeddings_path: ``.npy`` file with one embedding row per
                dataset question.
            category_embeddings_path: ``.npy`` file with one embedding row per
                dataset category.
            dataset_path: JSON file holding the dataset records; each record
                must provide a ``'url'`` key.
            similarity_threshold: Minimum combined score a match must reach.
            question_weight: Weight of the question similarity in the score.
            category_weight: Weight of the category similarity in the score.

        Raises:
            ValueError: If the embedding files do not contain exactly one row
                per dataset record.
        """
        self.model = SentenceTransformer(model_name)
        self.question_embeddings = np.load(question_embeddings_path)
        self.category_embeddings = np.load(category_embeddings_path)
        with open(dataset_path, 'r') as f:
            self.dataset = json.load(f)

        # Stale or mismatched artefacts would otherwise be silently truncated
        # during scoring, or produce an index past the end of the dataset.
        expected_rows = (len(self.dataset),)
        if (self.question_embeddings.shape[:1] != expected_rows
                or self.category_embeddings.shape[:1] != expected_rows):
            raise ValueError(
                "Embedding files and dataset are out of sync: expected "
                f"{len(self.dataset)} rows, got "
                f"{self.question_embeddings.shape} question embeddings and "
                f"{self.category_embeddings.shape} category embeddings."
            )

        self.similarity_threshold = similarity_threshold
        self.question_weight = question_weight
        self.category_weight = category_weight

    @staticmethod
    def _cosine_similarities(query_embedding, embeddings):
        """Cosine similarity of one query vector against each row of ``embeddings``.

        A zero-norm row (or query) has no defined cosine similarity and yields
        NaN here; the caller decides how to treat it.
        """
        query = np.asarray(query_embedding, dtype=float)
        matrix = np.asarray(embeddings, dtype=float)
        denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        with np.errstate(divide='ignore', invalid='ignore'):
            return (matrix @ query) / denominators

    def find_best_match(self, query):
        """Return the URL of the best-matching dataset entry, or ``None``.

        Args:
            query: The user's question as a string.

        Returns:
            The ``'url'`` of the entry with the highest weighted similarity if
            that score is at least ``similarity_threshold``; otherwise
            ``None``. Also ``None`` when there are no entries to compare with.
        """
        if len(self.dataset) == 0 or len(self.question_embeddings) == 0:
            return None

        query_embedding = self.model.encode([query])[0]
        question_scores = self._cosine_similarities(query_embedding, self.question_embeddings)
        category_scores = self._cosine_similarities(query_embedding, self.category_embeddings)

        combined_scores = (self.question_weight * question_scores
                           + self.category_weight * category_scores)
        # np.argmax treats NaN as the maximum, so an undefined score would
        # otherwise win and then fail the threshold test for every query.
        combined_scores = np.where(np.isnan(combined_scores), -np.inf, combined_scores)

        best_match_index = int(np.argmax(combined_scores))
        if combined_scores[best_match_index] >= self.similarity_threshold:
            return self.dataset[best_match_index]['url']
        return None

    def predict(self, query):
        """Answer ``query`` with a resource URL, or an apology when nothing matches."""
        best_match_url = self.find_best_match(query)
        if best_match_url is not None:
            return best_match_url
        return "I'm sorry, I couldn't find a specific resource for that question."


In [30]:
import pickle

# Build the QA model and serialise it to disk.
# It is bound to `qa_model` rather than `model` so that the SentenceTransformer
# encoder named `model`, which generate_embeddings() reads as a global, is not
# shadowed by an object that has no `encode` method.
qa_model = ChittiQAModel()
with open('chitti_gold_qa_model.pkl', 'wb') as f:
    pickle.dump(qa_model, f)

In [31]:
# Restore the pickled QA model from disk.
# NOTE: unpickling can execute code embedded in the file, so only load a .pkl
# that this notebook (or another trusted source) produced.
with open('chitti_gold_qa_model.pkl', 'rb') as f:
    loaded_model = pickle.loads(f.read())

# Query the restored model and show its answer.
query = "What do you like?"
result = loaded_model.predict(query)

print(result)

I'm sorry, I couldn't find a specific resource for that question.


In [35]:
# A placement-related query for the restored model.
query = "How to prepare for placements"
result = loaded_model.predict(query)

print(result)

https://docs.google.com/document/d/e/2PACX-1vQlLBNig8Ke8a8mKC5lkamJRS_T-vK2AAgm9ocUJf-BnsgawQTWYu5K3g_g5wjp4ICW2noW4Y-agt7w/pub
