In [17]:
import pandas as pd
import numpy as np

In [18]:
# Categorical vocabularies that the synthetic candidates are sampled from.
degree_types = [
    'B.Tech',
    'M.Tech',
    'BCA',
    'MCA',
    'B.Sc',
    'M.Sc',
    'PhD',
]
# The last entry is the literal string 'None' (not Python's None); it marks
# a candidate without a specialization.
specializations = [
    'Data Science',
    'Web Development',
    'Cloud Computing',
    'AI/ML',
    'Cybersecurity',
    'App Development',
    'None',
]

# Rating scales: the integers 1 through 5 (np.arange excludes the stop value).
communication_level = np.arange(start=1, stop=6)
technical_skill_level = np.arange(start=1, stop=6)

In [19]:
# Number of synthetic candidate rows to generate.
num_samples = 10_000
# Seed NumPy's global random state so every later np.random draw is repeatable.
np.random.seed(42)

In [20]:
# Draw every feature column independently, num_samples values each.
# All draws consume NumPy's global random state, so the order of the entries
# below determines the generated values - keep it stable.
data = dict(
    degree_type=np.random.choice(degree_types, size=num_samples),
    technical_skills=np.random.randint(1, 21, size=num_samples),  # integers 1..20
    soft_skills=np.random.randint(1, 11, size=num_samples),  # integers 1..10
    cgpa=np.random.uniform(5.0, 10.0, size=num_samples).round(2),  # two decimals
    projects=np.random.randint(0, 11, size=num_samples),  # integers 0..10
    internships=np.random.randint(0, 6, size=num_samples),  # integers 0..5
    communication_level=np.random.choice(communication_level, size=num_samples),
    technical_skill_level=np.random.choice(technical_skill_level, size=num_samples),
    years_of_experience=np.random.randint(0, 6, size=num_samples),  # integers 0..5
    specialization=np.random.choice(specializations, size=num_samples),
)

In [21]:
# Assemble the sampled columns into a single table; each key of `data`
# becomes one column.
df = pd.DataFrame(data=data)

In [22]:
def calculate_package(row):
    """Compute a synthetic salary package (in LPA) for a single candidate.

    The package is a random base amount plus weighted contributions from
    the candidate's CGPA, technical skill level, internships, projects,
    communication level and specialization.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'cgpa', 'technical_skill_level',
        'internships', 'projects', 'communication_level' and
        'specialization'.

    Returns
    -------
    The total package, rounded to two decimals with ``np.round``.

    Notes
    -----
    Each call draws one value from NumPy's global random state, so the
    result depends on the current seed and on how many draws came before.
    """
    # Random starting point: between 3 and 8 LPA.
    starting_pay = np.random.uniform(3, 8)

    # Weighted contributions; CGPA carries the largest per-unit weight.
    cgpa_part = row['cgpa'] * 0.6
    tech_part = row['technical_skill_level'] * 0.4
    internship_part = row['internships'] * 0.5  # per internship
    project_part = row['projects'] * 0.2  # per project, less than an internship
    communication_part = row['communication_level'] * 0.3

    # Flat +1 for any specialization other than the literal string 'None'.
    specialization_part = 0 if row['specialization'] == 'None' else 1

    # Terms are added in a fixed left-to-right order.
    package = (
        starting_pay
        + cgpa_part
        + tech_part
        + internship_part
        + project_part
        + communication_part
        + specialization_part
    )
    return np.round(package, 2)

In [23]:
# Score every candidate: calculate_package is called once per row
# (axis='columns' is the named form of axis=1) and the results are stored
# in a new 'package' column.
df['package'] = df.apply(calculate_package, axis='columns')

In [24]:
# Sanity check: print the first five rows as plain text.
print(df.head(5))

  degree_type  technical_skills  soft_skills  cgpa  projects  internships  \
0         PhD                 2            8  7.53         4            0   
1         MCA                 7           10  8.22         7            4   
2        B.Sc                 9            9  9.55         0            1   
3         PhD                10            1  8.54         1            2   
4         BCA                18            1  5.23         6            2   

   communication_level  technical_skill_level  years_of_experience  \
0                    5                      3                    2   
1                    2                      4                    2   
2                    5                      3                    1   
3                    2                      5                    3   
4                    5                      5                    2   

    specialization  package  
0  App Development    13.82  
1  App Development    19.18  
2    Cybersecurity    13.1

In [25]:
# Persist the generated dataset as CSV in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
# NOTE(review): the specialization label 'None' is written as the text
# "None", which pandas.read_csv may parse as NaN by default in recent
# pandas versions - confirm, and load with keep_default_na=False if the
# label must survive a round trip.
df.to_csv(path_or_buf='candidate_salary_data.csv', index=False)