In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.models import Word2Vec

In [2]:
# Load Dataset
file_path = "Main_engineering_students_dataset.csv"  # Replace with your dataset path
data = pd.read_csv(file_path)

In [3]:
# Preview Dataset
print(data.head())

        Name                  Branch  Pass-out Year  Gender Skills_1  \
0  Student_1  Information Technology           2027    Male      C++   
1  Student_2                   Civil           2027  Female     Java   
2  Student_3             Electronics           2025    Male     Java   
3  Student_4                   Civil           2024  Female    React   
4  Student_5             Electronics           2024    Male     DBMS   

     Skills_2 Skills_3       Interests_1   Interests_2       Interests_3  ...  \
0     Node.js     Java            Coding       Drawing      Data Science  ...   
1         CSS      SQL            Sports        Coding            Gaming  ...   
2     Node.js   Python  Machine Learning  UI/UX Design            Sports  ...   
3  JavaScript     Java            Coding  UI/UX Design  Machine Learning  ...   
4       React      C++           Drawing         Music            Gaming  ...   

  Hobbies_2 Hobbies_3 10th Marks  12th Marks/Diploma Marks  \
0   Reading   Sing

In [4]:
# Retain the Name column separately for later use
names = data['Name']

In [5]:
# Preprocessing Categorical Features
categorical_features = ['Branch', 'Gender', 'Dream Companies']
categorical_transformer = OneHotEncoder()

In [6]:
# Preprocessing Numerical Features
numeric_features = ['10th Marks', '12th Marks/Diploma Marks', 'Pass-out Year']
numeric_transformer = StandardScaler()

In [7]:
# Concatenate Textual Features for Word2Vec
textual_columns = [
    'Skills_1', 'Skills_2', 'Skills_3',
    'Interests_1', 'Interests_2', 'Interests_3',
    'Hobbies_1', 'Hobbies_2', 'Hobbies_3',
    'Spoken Languages_1', 'Spoken Languages_2', 'Spoken Languages_3',
    'Currently Working On', 'Study Goals'
]
data['Combined_Text'] = data[textual_columns].fillna('').agg(' '.join, axis=1)


In [8]:
# Preprocess text: Tokenize sentences into words
tokenized_sentences = [sentence.split() for sentence in data['Combined_Text'].tolist()]

In [9]:
# Train Word2Vec Model
print("Training Word2Vec model on textual data...")
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec.model")  # Optionally, save the model for future use


Training Word2Vec model on textual data...


In [10]:
# Generate Word2Vec Embeddings for each sentence
def get_sentence_embedding(sentence, model):
    tokens = sentence.split()
    word_embeddings = [model.wv[word] for word in tokens if word in model.wv]
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

In [11]:
# Generate embeddings for all sentences
text_embeddings = [get_sentence_embedding(sentence, word2vec_model) for sentence in data['Combined_Text'].tolist()]


In [12]:
# Preprocess Numerical and Categorical Features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
    ],
    remainder='drop'
)


In [13]:
# Apply Preprocessing
print("Preprocessing categorical and numerical features...")
processed_features = preprocessor.fit_transform(data)


Preprocessing categorical and numerical features...


In [14]:
# Combine Word2Vec Textual Embeddings with Other Features(categorical and numerical features)
print("Combining all feature vectors...")
final_features = np.hstack([text_embeddings, processed_features])


Combining all feature vectors...


In [15]:
# Calculate Cosine Similarity based on final_features
print("Calculating cosine similarity matrix...")
similarity_matrix = cosine_similarity(final_features)


Calculating cosine similarity matrix...


In [29]:
# Weighting for similarity calculation
weights = {
    'textual': 0.4,  # Textual features (Word2Vec embeddings)
    'numerical': 0.3,  # Numerical features
    'categorical': 0.3  # Categorical features
}


In [30]:
# Custom weighted similarity calculation
def compute_weighted_similarity(student_idx, data, similarity_matrix, weights):
    text_similarity = similarity_matrix[student_idx]
    numerical_similarity = similarity_matrix[student_idx]  # Placeholder (reuse similarity_matrix for now)
    categorical_similarity = similarity_matrix[student_idx]  # Placeholder (reuse similarity_matrix for now)

    # Apply weights
    weighted_similarity = (
        weights['textual'] * text_similarity +
        weights['numerical'] * numerical_similarity +
        weights['categorical'] * categorical_similarity
    )
    return weighted_similarity

In [31]:
# Calculate Weighted Similarity Matrix
print("Calculating weighted similarity matrix...")
weighted_similarity_matrix = np.zeros((len(data), len(data)))
for i in range(len(data)):
    weighted_similarity_matrix[i] = compute_weighted_similarity(i, data, similarity_matrix, weights)


Calculating weighted similarity matrix...


In [45]:
# Recommendation Function
def recommend_study_mates(student_name, data, similarity_matrix, top_n=5):
    # Find the index of the student
    if student_name not in data['Name'].values:
        print(f"Student '{student_name}' not found in the dataset!")
        return

    student_idx = data[data['Name'] == student_name].index[0]

    # Retrieve similarity scores for the student
    similarity_scores = similarity_matrix[student_idx]

    # Sort by similarity scores in descending order
    similar_students_idx = np.argsort(similarity_scores)[::-1]

    # Exclude the student themself from recommendations
    similar_students_idx = [idx for idx in similar_students_idx if idx != student_idx]

    # Get top N recommendations
    top_similar_students = similar_students_idx[:top_n]

    # Prepare the output for recommendations
    recommendations = data.iloc[top_similar_students][[
        'Name', 'Branch', 'Pass-out Year', 'Gender', 'Skills_1', 'Skills_2', 'Skills_3',
        'Interests_1', 'Interests_2', 'Interests_3', 'Hobbies_1', 'Hobbies_2', 'Hobbies_3',
        '10th Marks', '12th Marks/Diploma Marks', 'Study Goals', 'Dream Companies',
        'Currently Working On', 'Spoken Languages_1', 'Spoken Languages_2', 'Spoken Languages_3'
    ]]
    recommendations['Similarity'] = similarity_scores[top_similar_students] * 100  # Convert to percentage

    # Print the features of the input student
    student_features = data.iloc[student_idx][[
        'Name', 'Branch', 'Pass-out Year', 'Gender', 'Skills_1', 'Skills_2', 'Skills_3',
        'Interests_1', 'Interests_2', 'Interests_3', 'Hobbies_1', 'Hobbies_2', 'Hobbies_3',
        '10th Marks', '12th Marks/Diploma Marks', 'Study Goals', 'Dream Companies',
        'Currently Working On', 'Spoken Languages_1', 'Spoken Languages_2', 'Spoken Languages_3'
    ]]

    print(f"\nFeatures of {student_name}:\n")
    print(f"Name: {student_features['Name']}")
    print(f"Branch: {student_features['Branch']}")
    print(f"Pass-out Year: {student_features['Pass-out Year']}")
    print(f"Gender: {student_features['Gender']}")
    print(f"Skills: {student_features['Skills_1']}, {student_features['Skills_2']}, {student_features['Skills_3']}")
    print(f"Interests: {student_features['Interests_1']}, {student_features['Interests_2']}, {student_features['Interests_3']}")
    print(f"Hobbies: {student_features['Hobbies_1']}, {student_features['Hobbies_2']}, {student_features['Hobbies_3']}")
    print(f"10th Marks: {student_features['10th Marks']}")
    print(f"12th Marks/Diploma Marks: {student_features['12th Marks/Diploma Marks']}")
    print(f"Study Goals: {student_features['Study Goals']}")
    print(f"Dream Companies: {student_features['Dream Companies']}")
    print(f"Currently Working On: {student_features['Currently Working On']}")
    print(f"Spoken Languages: {student_features['Spoken Languages_1']}, {student_features['Spoken Languages_2']}, {student_features['Spoken Languages_3']}\n")

    # Print the top N recommendations in the same structured format
    print(f"Top {top_n} Study Mate Recommendations for {student_name}:\n")
    
    for index, row in recommendations.iterrows():
        print(f"Name: {row['Name']}")
        print(f"Branch: {row['Branch']}")
        print(f"Pass-out Year: {row['Pass-out Year']}")
        print(f"Gender: {row['Gender']}")
        print(f"Skills: {row['Skills_1']}, {row['Skills_2']}, {row['Skills_3']}")
        print(f"Interests: {row['Interests_1']}, {row['Interests_2']}, {row['Interests_3']}")
        print(f"Hobbies: {row['Hobbies_1']}, {row['Hobbies_2']}, {row['Hobbies_3']}")
        print(f"10th Marks: {row['10th Marks']}")
        print(f"12th Marks/Diploma Marks: {row['12th Marks/Diploma Marks']}")
        print(f"Study Goals: {row['Study Goals']}")
        print(f"Dream Companies: {row['Dream Companies']}")
        print(f"Currently Working On: {row['Currently Working On']}")
        print(f"Spoken Languages: {row['Spoken Languages_1']}, {row['Spoken Languages_2']}, {row['Spoken Languages_3']}")
        print(f"Similarity: {row['Similarity']:.6f}%\n")


In [51]:
# Example Usage
recommend_study_mates('Student_3', data, weighted_similarity_matrix, top_n=10)



Features of Student_3:

Name: Student_3
Branch: Electronics
Pass-out Year: 2025
Gender: Male
Skills: Java, Node.js, Python
Interests: Machine Learning, UI/UX Design, Sports
Hobbies: Gardening, Cricket, Football
10th Marks: 78
12th Marks/Diploma Marks: 90
Study Goals: Entrepreneurship
Dream Companies: Wipro
Currently Working On: Personal Project
Spoken Languages: Hindi, Marathi, Bengali

Top 10 Study Mate Recommendations for Student_3:

Name: Student_434
Branch: Electronics
Pass-out Year: 2026
Gender: Male
Skills: SQL, Python, Node.js
Interests: Sports, Drawing, UI/UX Design
Hobbies: Dancing, Yoga, Gardening
10th Marks: 75
12th Marks/Diploma Marks: 89
Study Goals: Good Job Placement
Dream Companies: Wipro
Currently Working On: Personal Project
Spoken Languages: Bengali, Marathi, Telugu
Similarity: 95.606533%

Name: Student_563
Branch: Electronics
Pass-out Year: 2024
Gender: Male
Skills: Python, React, Java
Interests: Machine Learning, Coding, Music
Hobbies: Gardening, Cricket, Reading
