In [None]:
!pip install faker #library to generate synthetic data

Collecting faker
  Downloading faker-37.12.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.12.0-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ------------------------------------- -- 1.8/2.0 MB 15.8 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 14.5 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.12.0


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Faker instance configured with the "en_IN" locale
fake = Faker("en_IN")

# Seed every random source this notebook draws from (Faker, the stdlib
# `random` module and NumPy's global generator) so that a fresh
# Restart Kernel -> Run All regenerates exactly the same datasets.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Defining some sample subjects
subjects = ["Math", "Science", "English", "Physics", "Chemistry", "Biology", "Economics", "History", "Computer Science"]

# Define a dictionary of major cities in UP and Bihar with their coordinates,
# stored as city name -> (latitude, longitude)
city_locations = {
    # In Uttar Pradesh
    "Lucknow": (26.8467, 80.9462),
    "Kanpur": (26.4499, 80.3319),
    "Agra": (27.1767, 78.0081),
    "Varanasi": (25.3176, 82.9739),
    "Prayagraj": (25.4358, 81.8463),
    "Meerut": (28.9845, 77.7064),
    "Ghaziabad": (28.6692, 77.4538),
    "Noida": (28.5355, 77.3910),

    # In Bihar
    "Patna": (25.5941, 85.1376),
    "Gaya": (24.7960, 85.0036),
    "Muzaffarpur": (26.1205, 85.3647),
    "Bhagalpur": (25.2424, 86.9844),
    "Darbhanga": (26.1550, 85.8988)
}

def get_random_city_location(city_map, jitter=0.05):
    """Pick a random city and return a point scattered around its base coordinates.

    Args:
        city_map: Mapping of city name -> (latitude, longitude) base coordinates.
        jitter: Maximum absolute offset added independently to the latitude and
            to the longitude, in the same units as the coordinates. Defaults to
            0.05, the value that was previously hard-coded. Pass 0 to get the
            base coordinates back unchanged.

    Returns:
        A tuple ``(city_name, lat, long)`` where ``city_name`` is one of the
        keys of ``city_map`` and ``lat`` / ``long`` are its base coordinates
        plus a uniformly drawn offset from ``[-jitter, jitter)``.

    Raises:
        IndexError: If ``city_map`` is empty (raised by ``random.choice``).
    """
    # Pick a random city name; iterating a dict yields its keys
    city_name = random.choice(list(city_map))

    # Base coordinates for that city
    base_lat, base_long = city_map[city_name]

    # Add small random distances so records in one city do not share a point
    lat = base_lat + np.random.uniform(-jitter, jitter)
    long = base_long + np.random.uniform(-jitter, jitter)

    return city_name, lat, long

# Generate Tutor Data
def _make_tutor(i):
    """Build one synthetic tutor record; `i` is the zero-based row number."""
    # The location is drawn first, then the remaining fields in key order
    place, latitude, longitude = get_random_city_location(city_locations)
    return {
        "tutor_id": f"T{i+1:03d}",
        "tutor_name": fake.name(),
        "subject": random.choice(subjects),
        "experience_years": random.randint(1, 15),
        "rating": round(random.uniform(3.0, 5.0), 2),
        "hourly_rate": random.randint(200, 1000),
        "location": place,
        "location_lat": latitude,
        "location_long": longitude,
    }


tutor_data = [_make_tutor(i) for i in range(200)]  # 200 tutors
tutor_df = pd.DataFrame(tutor_data)


# Generate Student Data
def _make_student(i):
    """Build one synthetic student record; `i` is the zero-based row number."""
    # NOTE(review): the sampled city name is discarded here, so students (unlike
    # tutors) carry no "location" column - confirm this is intended.
    _, latitude, longitude = get_random_city_location(city_locations)
    return {
        "student_id": f"S{i+1:03d}",
        "student_name": fake.name(),
        "required_subject": random.choice(subjects),
        "grade": random.randint(6, 12),
        "location_lat": latitude,
        "location_long": longitude,
    }


student_data = [_make_student(i) for i in range(300)]  # 300 students
student_df = pd.DataFrame(student_data)

# Plain-text preview of the first rows of each table
print("Tutor Data")
print(tutor_df.head())
print("\n")
print("Student Data")
print(student_df.head())

--- Tutor DataFrame ---
  tutor_id      tutor_name           subject  experience_years  rating  \
0     T001      Ekaja Dyal           Science                12    3.19   
1     T002  Mohammed Jaggi           Science                 4    4.96   
2     T003      Bhavna Lad           Science                 7    3.61   
3     T004      Leena Behl  Computer Science                15    3.27   
4     T005  Zinal Raghavan         Economics                15    4.65   

   hourly_rate     location  location_lat  location_long  
0          293        Noida     28.502147      77.393178  
1          847  Muzaffarpur     26.151310      85.380346  
2          721        Patna     25.590571      85.186048  
3          504         Agra     27.188354      77.984473  
4          894    Bhagalpur     25.199162      86.982720  


--- Student DataFrame ---
  student_id   student_name  required_subject  grade  location_lat  \
0       S001  Mekhala Divan  Computer Science      7     24.827308   
1       S

In [None]:
# Write each generated table to its own CSV file in the working directory,
# leaving out the DataFrame index column
for csv_name, frame in (("tutors.csv", tutor_df), ("students.csv", student_df)):
    frame.to_csv(csv_name, index=False)

print("Datasets generated successfully: tutors.csv & students.csv")



✅ Datasets generated successfully: tutors.csv & students.csv


In [12]:
# Preview the first 10 rows of the tutor table
tutor_df.head(10)

Unnamed: 0,tutor_id,tutor_name,subject,experience_years,rating,hourly_rate,location,location_lat,location_long
0,T001,Ekaja Dyal,Science,12,3.19,293,Noida,28.502147,77.393178
1,T002,Mohammed Jaggi,Science,4,4.96,847,Muzaffarpur,26.15131,85.380346
2,T003,Bhavna Lad,Science,7,3.61,721,Patna,25.590571,85.186048
3,T004,Leena Behl,Computer Science,15,3.27,504,Agra,27.188354,77.984473
4,T005,Zinal Raghavan,Economics,15,4.65,894,Bhagalpur,25.199162,86.98272
5,T006,Chatura Jain,Economics,9,3.73,926,Noida,28.54708,77.378654
6,T007,Wahab Wable,Computer Science,11,4.44,962,Gaya,24.763356,84.972396
7,T008,Zayan Sarin,Science,7,3.8,320,Prayagraj,25.43146,81.827566
8,T009,Shaurya Dhillon,Physics,7,4.24,384,Lucknow,26.844964,80.929672
9,T010,Gabriel Vaidya,Biology,1,4.09,452,Patna,25.643455,85.177179


In [None]:
# Preview the first 10 rows of the student table
student_df.head(10)