In [4]:
import functools

import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy pipeline "en_core_web_md" once at module level; get_embedding()
# reads document vectors from this shared `nlp` object.
# NOTE(review): the model package presumably has to be installed separately
# (e.g. `python -m spacy download en_core_web_md`) - confirm for your environment.
nlp = spacy.load("en_core_web_md")

# Small sample dataset; extend each list with more records (up to ~500) as needed.
# Every profile carries 'name', 'skills', 'interests' and 'location'.
students = [
    {'name': 'Student A', 'skills': ['python', 'machine learning'], 'interests': ['AI', 'data science'], 'location': 'NY'},
    {'name': 'Student B', 'skills': ['javascript', 'web development'], 'interests': ['frontend', 'UX'], 'location': 'CA'},
    # Add more students here, up to 500
]

# Alumni additionally carry a boolean 'mentorship' flag.
alumni = [
    {'name': 'Alumni X', 'skills': ['python', 'data science'], 'interests': ['AI', 'deep learning'], 'location': 'NY', 'mentorship': True},
    {'name': 'Alumni Y', 'skills': ['javascript', 'React'], 'interests': ['web development', 'frontend'], 'location': 'CA', 'mentorship': True},
    {'name': 'Alumni Z', 'skills': ['machine learning', 'big data'], 'interests': ['AI', 'cloud'], 'location': 'TX', 'mentorship': False},
    # Add more alumni here, up to 500
]

# 'index' is used to look up an alumnus's row in the embedding arrays, so it
# must equal the record's position in `alumni`. Derive it instead of typing it
# by hand so it cannot drift when records are added, removed or reordered.
for _position, _alum in enumerate(alumni):
    _alum['index'] = _position

# Helper functions to get word vectors for the skills and interests
@functools.lru_cache(maxsize=4096)
def _cached_embedding(text):
    """Run the spaCy pipeline on *text* and memoize the resulting document vector."""
    return nlp(text).vector


def get_embedding(text):
    """Return the spaCy document vector (``Doc.vector``) for *text*.

    The matching loop asks for the same student text once per alumnus, so
    results are memoized per distinct string to avoid re-running the whole
    pipeline each time.

    Args:
        text: The string to embed (must be hashable, e.g. ``str``).

    Returns:
        A fresh copy of the vector, so callers may modify it without
        corrupting the cached value.
    """
    return _cached_embedding(text).copy()

# Build embedding arrays for the skills and interests of a list of profiles
def vectorize_profiles(profiles):
    """Embed the skills and interests of every profile.

    For each profile, the ``'skills'`` list and the ``'interests'`` list are
    each joined into one space-separated string and passed to
    ``get_embedding``.

    Args:
        profiles: Iterable of dicts with ``'skills'`` and ``'interests'``
            lists of strings.

    Returns:
        A ``(skill_embeddings, interest_embeddings)`` tuple of NumPy arrays,
        each holding one entry per profile in input order.
    """
    vector_pairs = []

    for entry in profiles:
        joined_skills = " ".join(entry['skills'])
        joined_interests = " ".join(entry['interests'])
        vector_pairs.append((get_embedding(joined_skills), get_embedding(joined_interests)))

    skill_matrix = np.array([pair[0] for pair in vector_pairs])
    interest_matrix = np.array([pair[1] for pair in vector_pairs])
    return skill_matrix, interest_matrix

# Calculate weighted similarity score using NLP embeddings
def calculate_weighted_similarity(student, alumni, skill_vectors, interest_vectors,
                                  skill_weight=0.7, interest_weight=0.3,
                                  location_boost=0.1, mentorship_boost=0.1):
    """Score how well one alumnus matches one student.

    The score is the weighted sum of the cosine similarities between the
    student's and the alumnus's skill embeddings and interest embeddings,
    plus flat bonuses for a shared location and for mentorship availability.

    Args:
        student: Dict with ``'skills'``, ``'interests'`` (lists of strings)
            and ``'location'``.
        alumni: Dict for a single alumnus with ``'index'``, ``'location'``
            and ``'mentorship'``.
        skill_vectors: Sequence of alumni skill embeddings, indexed by
            ``alumni['index']``.
        interest_vectors: Sequence of alumni interest embeddings, indexed by
            ``alumni['index']``.
        skill_weight: Multiplier applied to the skill similarity.
        interest_weight: Multiplier applied to the interest similarity.
        location_boost: Amount added when both locations are equal.
        mentorship_boost: Amount added when ``alumni['mentorship']`` is truthy.

    Returns:
        The score multiplied by 100 and rounded, as a built-in ``int``.
        The value is not clamped: with the default weights and boosts it can
        reach 120, and it can be negative when the similarities are negative.
    """
    alumni_row = alumni['index']

    # Embed the student's skills and interests as space-joined strings, the
    # same way the alumni vectors are expected to have been built.
    student_skill_vector = get_embedding(" ".join(student['skills']))
    student_interest_vector = get_embedding(" ".join(student['interests']))

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    skill_similarity = cosine_similarity([student_skill_vector], [skill_vectors[alumni_row]])[0][0]
    interest_similarity = cosine_similarity([student_interest_vector], [interest_vectors[alumni_row]])[0][0]

    weighted_similarity = (skill_similarity * skill_weight) + (interest_similarity * interest_weight)

    # Boost for location match
    if student['location'] == alumni['location']:
        weighted_similarity += location_boost

    # Boost for mentorship availability
    if alumni['mentorship']:
        weighted_similarity += mentorship_boost

    # int() guarantees a built-in int even if round() hands back a NumPy scalar.
    return int(round(weighted_similarity * 100))

# Match students with alumni based on similarity scores
def match_students_to_alumni(students, alumni, skill_vectors, interest_vectors, top_n=5):
    """Rank alumni for every student and keep the best matches.

    Args:
        students: Iterable of student dicts (each needs a ``'name'``).
        alumni: Iterable of alumni dicts (each needs a ``'name'``).
        skill_vectors: Alumni skill embeddings, forwarded to
            ``calculate_weighted_similarity``.
        interest_vectors: Alumni interest embeddings, forwarded to
            ``calculate_weighted_similarity``.
        top_n: Maximum number of matches kept per student. ``None`` keeps
            every alumnus. Defaults to 5.

    Returns:
        Dict mapping each student's name to a list of
        ``{'alumni': <name>, 'score': <score>}`` dicts sorted by score in
        descending order; alumni with equal scores keep their input order.
        Students sharing a name overwrite each other's entry.

    Raises:
        ValueError: If ``top_n`` is negative (a negative slice bound would
            silently drop matches from the end instead of limiting them).
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be a non-negative integer or None")

    matches = {}

    for student in students:
        scored = [
            {
                'alumni': alum['name'],
                'score': calculate_weighted_similarity(student, alum, skill_vectors, interest_vectors),
            }
            for alum in alumni
        ]

        # Stable descending sort: ties stay in the order alumni were given.
        scored.sort(key=lambda entry: entry['score'], reverse=True)
        matches[student['name']] = scored[:top_n]

    return matches

# Entry point: embed the alumni, rank them for each student, print the report
def process_dataset():
    """Run the matcher on the module-level ``students`` and ``alumni`` lists.

    Alumni profiles are vectorized once, every student is matched against
    them, and each student's ranked matches are printed to stdout.
    """
    alumni_skill_vecs, alumni_interest_vecs = vectorize_profiles(alumni)
    results = match_students_to_alumni(
        students, alumni, alumni_skill_vecs, alumni_interest_vecs
    )

    # Display the matching results, one header line per student
    for student in results:
        print(f"Matches for {student}:")
        for match in results[student]:
            print(f"  {match['alumni']} with a match score of {match['score']}%")

# Run the matching demo only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_dataset()


Matches for Student A:
  Alumni X with a match score of 94%
  Alumni Z with a match score of 77%
  Alumni Y with a match score of 67%
Matches for Student B:
  Alumni Y with a match score of 84%
  Alumni X with a match score of 65%
  Alumni Z with a match score of 52%
