In [None]:
# Sentence classification: match a student query to the most similar FAQ using BERT embeddings

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
!pip install transformers torch scikit-learn




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Step 1: Load the pretrained tokenizer and encoder.
# Both are created from the same checkpoint name.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [5]:
# Step 2: The FAQ "database" -- a small, fixed list of known questions.
# List order matters: later steps report matches by index into this list.
faq_sentences = [
    "How can I reset my password?",
    "Where is the library located?",
    "What is Artificial Intelligence?",
    "How to apply for a scholarship?",
    "What are the cafeteria opening hours?",
]

In [8]:
# Step 3: Function to get sentence embedding
def get_embedding(sentence):
    """Embed text with the loaded model using attention-mask-aware mean pooling.

    Token vectors from the model's last hidden state are averaged over the
    sequence dimension, counting only positions the tokenizer marked as real
    tokens (attention_mask == 1). For a single string nothing is padded, so
    this equals a plain mean; for a list of strings of different lengths it
    keeps padding positions out of the average.

    Args:
        sentence: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one row per input text (a single row for a
        single string).
    """
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions add 0.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

In [9]:
# Step 4: Embed every FAQ sentence once, up front.
# The result is a list aligned index-for-index with faq_sentences.
faq_embeddings = list(map(get_embedding, faq_sentences))

In [14]:
# Step 5: Student enters a query
# The query is embedded with the same get_embedding function used for the FAQ
# sentences, so the two can be compared with cosine similarity in the next step.
query = "Apply scholarship"
query_embedding = get_embedding(query)

In [15]:
# Step 6: Score the query against each FAQ embedding with cosine similarity.
# cosine_similarity returns a 2-D array; [0][0] extracts the single score
# for this query/FAQ pair.
similarities = []
for faq_embedding in faq_embeddings:
    pairwise = cosine_similarity(query_embedding, faq_embedding)
    similarities.append(pairwise[0][0])

In [16]:
# Step 7: Report the FAQ with the highest similarity score.
# On ties the earliest index wins.
best_match_index = max(range(len(similarities)), key=similarities.__getitem__)
print(f"Student Question: {query}")
print(f"Most Similar FAQ: {faq_sentences[best_match_index]}")

Student Question: Apply scholarship
Most Similar FAQ: How to apply for a scholarship?


In [None]:
# Text classification: check cricketer-country pairs via BERT embedding similarity

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Step 1: Load the BERT tokenizer and encoder for this section.
# Both are created from the same checkpoint name.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [19]:
# Step 2: Reference mapping of cricketer name -> country.
cricketers = {
    "MS Dhoni": "India",
    "Virat Kohli": "India",
    "Ricky Ponting": "Australia",
    "Steve Smith": "Australia",
    "Kane Williamson": "New Zealand",
    "Joe Root": "England",
}

In [22]:
# Step 3: Function to get BERT embedding
def get_embedding(text):
    """Embed text with the loaded model using attention-mask-aware mean pooling.

    Token vectors from the model's last hidden state are averaged over the
    sequence dimension, counting only positions the tokenizer marked as real
    tokens (attention_mask == 1). For a single string nothing is padded, so
    this equals a plain mean; for a list of strings of different lengths it
    keeps padding positions out of the average.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one row per input text (a single row for a
        single string).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions add 0.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

In [23]:
# Step 4: Function to check cricketer-country similarity
def check_country_similarity(player, country):
    """Return the cosine similarity between the embeddings of two strings.

    Args:
        player: Text embedded first (a cricketer's name in this notebook).
        country: Text embedded second (a country name in this notebook).

    Returns:
        The [0][0] entry of the cosine_similarity result for the two
        embeddings.
    """
    pairwise = cosine_similarity(get_embedding(player), get_embedding(country))
    return pairwise[0][0]

In [24]:
# Step 5: (player, country) pairs to score.
# Some pairs deliberately disagree with the `cricketers` mapping
# (e.g. "MS Dhoni"/"Australia", "Joe Root"/"India").
test_cases = [
    ("MS Dhoni", "India"),
    ("MS Dhoni", "Australia"),
    ("Ricky Ponting", "Australia"),
    ("Virat Kohli", "India"),
    ("Kane Williamson", "New Zealand"),
    ("Joe Root", "India"),
]

In [26]:
# Step 6: Label each (player, country) pair by comparing its score to a cutoff.
# NOTE(review): in the recorded output "Joe Root - India" scores above the
# cutoff while "Virat Kohli - India" scores below it, so this score does not
# appear to track real player-country facts -- treat the cutoff as illustrative.
threshold = 0.63  # tunable cutoff; scores at or above it count as a match
for player, country in test_cases:
    score = check_country_similarity(player, country)
    if score >= threshold:
        match = "MATCH "
    else:
        match = "NO MATCH "
    print(f"{player} - {country} | Similarity: {score:.2f} | {match}")

MS Dhoni - India | Similarity: 0.64 | MATCH 
MS Dhoni - Australia | Similarity: 0.58 | NO MATCH 
Ricky Ponting - Australia | Similarity: 0.55 | NO MATCH 
Virat Kohli - India | Similarity: 0.53 | NO MATCH 
Kane Williamson - New Zealand | Similarity: 0.55 | NO MATCH 
Joe Root - India | Similarity: 0.68 | MATCH 
