In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the two raw tables; the paths are relative to the notebook's working directory.
tutors, students = (pd.read_csv(csv_path) for csv_path in ("tutors.csv", "students.csv"))

# Quick sanity check on the (rows, columns) of each table.
for label, frame in (("Tutors shape:", tutors), ("Students shape:", students)):
    print(label, frame.shape)



Tutors shape: (200, 9)
Students shape: (300, 6)


#### Merging both datasets

In [3]:
# Pair every tutor with every student whose required subject equals the
# tutor's subject. Columns present in both tables get a _tutor / _student suffix.
combined = pd.merge(
    tutors,
    students,
    how="inner",
    left_on="subject",
    right_on="required_subject",
    suffixes=("_tutor", "_student"),
)

print("Combined shape:", combined.shape)
# Each row now represents a tutor-student pair with the same subject interest.
combined.head()


Combined shape: (6634, 15)


Unnamed: 0,tutor_id,tutor_name,subject,experience_years,rating,hourly_rate,location,location_lat_tutor,location_long_tutor,student_id,student_name,required_subject,grade,location_lat_student,location_long_student
0,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S006,Harinakshi Kapadia,Science,8,27.137237,77.978734
1,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S011,Sudiksha Sangha,Science,9,24.7908,85.053012
2,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S012,Vedant Lalla,Science,6,25.623021,85.158342
3,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S015,Watika Sarma,Science,6,28.574179,77.413194
4,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S017,Jagdish Saxena,Science,9,28.628533,77.423353


In [4]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The computation is element-wise, so each argument may be a scalar, a
    NumPy array or a pandas Series (normal NumPy broadcasting applies).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    Distance in kilometres on a sphere of radius 6371 km, with the same
    shape as the broadcast inputs.
    """
    R = 6371  # Earth's radius in km
    # The trigonometric functions below expect radians, not degrees.
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair above 1 for (near-)antipodal points, which would make arcsin
    # return NaN. Clamp before taking the root.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Distance for every tutor-student pair, computed column-wise in one call.
# Order matters: (tutor lat, tutor long, student lat, student long).
coordinate_columns = [
    "location_lat_tutor",
    "location_long_tutor",
    "location_lat_student",
    "location_long_student",
]
combined["distance_km"] = haversine(*(combined[column] for column in coordinate_columns))


###### Now we know how far each tutor is from each student, which is crucial for matching students with nearby tutors.

In [7]:
# 1 when the tutor's subject equals the student's required subject, else 0.
# Because `combined` comes from an inner join on exactly these two columns,
# the flag is 1 on every row.
same_subject = combined["subject"] == combined["required_subject"]
combined["subject_match"] = same_subject.astype(int)


In [8]:
# Preview the first five pairs, now including distance_km and subject_match.
combined.head(n=5)


Unnamed: 0,tutor_id,tutor_name,subject,experience_years,rating,hourly_rate,location,location_lat_tutor,location_long_tutor,student_id,student_name,required_subject,grade,location_lat_student,location_long_student,distance_km,subject_match
0,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S006,Harinakshi Kapadia,Science,8,27.137237,77.978734,162.327636,1
1,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S011,Sudiksha Sangha,Science,9,24.7908,85.053012,865.628991,1
2,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S012,Vedant Lalla,Science,6,25.623021,85.158342,832.647901,1
3,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S015,Watika Sarma,Science,6,28.574179,77.413194,8.244761,1
4,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S017,Jagdish Saxena,Science,9,28.628533,77.423353,14.359122,1


In [9]:
# Feature scaling: bring each tutor attribute onto a common 0-1 range so the
# attributes can be combined with weights later.
scaler = MinMaxScaler()
scaled_from_raw = {
    "rating_scaled": "rating",
    "experience_scaled": "experience_years",
    "rate_scaled": "hourly_rate",
}
for scaled_column, raw_column in scaled_from_raw.items():
    # fit_transform refits the scaler each time, so every column is scaled
    # with its own min/max; afterwards the scaler holds only the last fit.
    combined[scaled_column] = scaler.fit_transform(combined[[raw_column]])
combined.head()

Unnamed: 0,tutor_id,tutor_name,subject,experience_years,rating,hourly_rate,location,location_lat_tutor,location_long_tutor,student_id,student_name,required_subject,grade,location_lat_student,location_long_student,distance_km,subject_match,rating_scaled,experience_scaled,rate_scaled
0,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S006,Harinakshi Kapadia,Science,8,27.137237,77.978734,162.327636,1,0.090452,0.785714,0.115578
1,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S011,Sudiksha Sangha,Science,9,24.7908,85.053012,865.628991,1,0.090452,0.785714,0.115578
2,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S012,Vedant Lalla,Science,6,25.623021,85.158342,832.647901,1,0.090452,0.785714,0.115578
3,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S015,Watika Sarma,Science,6,28.574179,77.413194,8.244761,1,0.090452,0.785714,0.115578
4,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178,S017,Jagdish Saxena,Science,9,28.628533,77.423353,14.359122,1,0.090452,0.785714,0.115578


In [14]:
# Weighted tutor quality score built from the min-max scaled features.
rating_part = 0.5 * combined["rating_scaled"]
experience_part = 0.3 * combined["experience_scaled"]
# Lower hourly rate is better, so score the complement of the scaled rate.
affordability_part = 0.2 * (1 - combined["rate_scaled"])

combined["tutor_score"] = rating_part + experience_part + affordability_part


Explanation:

Weighted average formula where:

Rating = 50% weight

Experience = 30% weight

Hourly rate = 20% weight (inverted as 1 − scaled rate, so cheaper tutors score higher)

The final tutor_score lies between 0 and 1 (higher = better).

In [None]:
# Proximity flag: 1 when the tutor is at most 3 km from the student, else 0.
# Handy later for quickly filtering down to local tutors.
NEARBY_RADIUS_KM = 3
within_radius = combined["distance_km"].le(NEARBY_RADIUS_KM)
combined["nearby_flag"] = within_radius.astype(int)

In [21]:
# Show the key matching columns for the last ten tutor-student pairs.
preview_columns = [
    "tutor_name",
    "student_name",
    "subject",
    "rating",
    "experience_years",
    "hourly_rate",
    "distance_km",
    "tutor_score",
    "nearby_flag",
]
combined[preview_columns].tail(10)


Unnamed: 0,tutor_name,student_name,subject,rating,experience_years,hourly_rate,distance_km,tutor_score,nearby_flag
6624,Ekantika Krishnamurthy,Varenya Pillai,Economics,4.72,15,381,726.914557,0.884422,0
6625,Ekantika Krishnamurthy,Luke Mander,Economics,4.72,15,381,4.001537,0.884422,0
6626,Ekantika Krishnamurthy,Faqid Krishnamurthy,Economics,4.72,15,381,915.519378,0.884422,0
6627,Ekantika Krishnamurthy,Varenya Vasa,Economics,4.72,15,381,733.622953,0.884422,0
6628,Ekantika Krishnamurthy,Ati Kohli,Economics,4.72,15,381,534.101442,0.884422,0
6629,Ekantika Krishnamurthy,Tamanna Sagar,Economics,4.72,15,381,536.33975,0.884422,0
6630,Ekantika Krishnamurthy,Warda Kunda,Economics,4.72,15,381,793.025424,0.884422,0
6631,Ekantika Krishnamurthy,Noah Sinha,Economics,4.72,15,381,736.853903,0.884422,0
6632,Ekantika Krishnamurthy,Jeet Prakash,Economics,4.72,15,381,247.622553,0.884422,0
6633,Ekantika Krishnamurthy,Vedhika Garde,Economics,4.72,15,381,795.082104,0.884422,0


In [22]:
# Persist the engineered pair table; index=False keeps the row index out of the file.
output_path = "tutor_student_combined.csv"
combined.to_csv(output_path, index=False)
print("Feature-engineered dataset saved as tutor_student_combined.csv")


Feature-engineered dataset saved as tutor_student_combined.csv


##### Conclusion: In this combined dataset, we have mapped tutors to students based on subject interest, calculated distances between them, and engineered features like tutor scores and proximity flags to facilitate better matching.