In [151]:
import numpy as np
import pandas as pd

In [179]:
df = pd.read_csv("inv-banker.csv")
df.head()

Unnamed: 0,Link,Job Title,Company,Salary,Location,Experience,Skills,Day Posted
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Mumbai (All Areas),2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01
1,https://www.naukri.com/job-listings-investment...,Investment Banker (IPO),Pride Hotels,Not disclosed,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01
2,https://www.naukri.com/job-listings-investment...,Investment Banker,Fashion Tv,Not disclosed,Mumbai,1-5 Yrs,"Due diligence, Client relationship management,...",2022-01
3,https://www.naukri.com/job-listings-investment...,Investment Banker,Prakhar Software,Not disclosed,New Delhi,2-4 Yrs,"Sales, Financial reporting, Financial analysis...",2022-01
4,https://www.naukri.com/job-listings-investment...,Investment Banker,Rising Capital Group,Not disclosed,"Gandhinagar, Ahmedabad",2-5 Yrs,"Financial analysis, Management, Fund raising, ...",2022-01


In [180]:
df.shape

(4556, 8)

In [181]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4556 entries, 0 to 4555
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Link        4556 non-null   object
 1   Job Title   4556 non-null   object
 2   Company     4556 non-null   object
 3   Salary      4556 non-null   object
 4   Location    4556 non-null   object
 5   Experience  4556 non-null   object
 6   Skills      4556 non-null   object
 7   Day Posted  4556 non-null   object
dtypes: object(8)
memory usage: 284.9+ KB


In [182]:
df.describe()

Unnamed: 0,Link,Job Title,Company,Salary,Location,Experience,Skills,Day Posted
count,4556,4556,4556,4556,4556,4556,4556,4556
unique,39,28,16,13,22,734,23,36
top,https://www.naukri.com/job-listings-walk-in-dr...,"Banking Payroll Jobs. Salary 15k To 29K, Locat...",HY Fly Consultancy,3-4 Lacs PA,Navi Mumbai,0-5 Yrs,"Email, graduate, Email Etiquette, Word, NRI, e...",2024-07
freq,767,1532,3770,2345,1723,3773,2674,311


In [183]:
print(df['Salary'].value_counts())
(df['Experience'].value_counts())

Salary
3-4 Lacs PA           2345
4.25-4.5 Lacs PA       908
2.5-3 Lacs PA          768
3-3.25 Lacs PA         329
3-3.5 Lacs PA          141
3-4.5 Lacs PA           47
Not disclosed           12
8-18 Lacs PA             1
2.25-5 Lacs PA           1
50,000-2.5 Lacs PA       1
2.5-2.75 Lacs PA         1
2-2.5 Lacs PA            1
Unpaid                   1
Name: count, dtype: int64


Experience
0-5 Yrs             3773
18 Mar - 19 Mar       48
0-3 Yrs                2
0-1 Yrs                2
0-2 Yrs                2
                    ... 
254 Mar - 19 Mar       1
255 Mar - 19 Mar       1
256 Mar - 19 Mar       1
257 Mar - 19 Mar       1
737 Mar - 19 Mar       1
Name: count, Length: 734, dtype: int64

In [184]:
import numpy as np
import re

# Function to extract numeric values from a string
def extract_numbers(s):
    return list(map(int, re.findall(r'\d+', s))) if re.findall(r'\d+', s) else None

# Function to compute the median salary for each experience range
def compute_experience_salary_mapping(df):
    exp_salary = {}
    
    for exp in df['Experience'].unique():
        salaries = []
        for s in df.loc[df['Experience'] == exp, 'Salary']:
            nums = extract_numbers(s)
            if nums:
                salaries.append(np.mean(nums))
        
        if salaries:
            exp_salary[exp] = np.median(salaries)
    
    return exp_salary

# Function to enforce salary hierarchy (lower exp → lower salary, higher exp → higher salary)
def enforce_salary_hierarchy(exp_salary):
    # Extract min years from each experience range and sort
    exp_ranges = []
    for exp in exp_salary.keys():
        nums = extract_numbers(exp)
        if nums:
            min_exp = min(nums)
            exp_ranges.append((min_exp, exp))
    
    # Sort by minimum experience
    exp_ranges.sort(key=lambda x: x[0])
    
    # Ensure salaries are non-decreasing with experience
    prev_salary = -1
    for min_exp, exp in exp_ranges:
        if exp_salary[exp] < prev_salary:
            exp_salary[exp] = prev_salary  # Adjust to maintain hierarchy
        prev_salary = exp_salary[exp]
    
    return exp_salary

# Compute median salary per experience (with hierarchy enforcement)
salary_dict = compute_experience_salary_mapping(df)
salary_dict = enforce_salary_hierarchy(salary_dict)

# Overall median salary (fallback)
valid_salaries = [np.mean(extract_numbers(s)) for s in df['Salary'] if extract_numbers(s)]
overall_median = np.median(valid_salaries) if valid_salaries else 0

# Function to find the closest experience range (ensuring salary consistency)
def find_closest_experience(exp, salary_dict):
    exp_nums = extract_numbers(exp)
    if not exp_nums:
        return None
    
    min_exp = min(exp_nums)
    closest_exp = None
    min_diff = float('inf')
    
    for key in salary_dict.keys():
        key_nums = extract_numbers(key)
        if not key_nums:
            continue
        
        key_min_exp = min(key_nums)
        diff = abs(min_exp - key_min_exp)
        
        if diff < min_diff:
            min_diff = diff
            closest_exp = key
    
    return closest_exp

# Function to impute missing salaries (Not disclosed / Unpaid)
def impute_missing_salary(row, salary_dict, overall_median):
    if row['Salary'] in ('Not disclosed', 'Unpaid'):
        exp = row['Experience']
        
        # Exact match
        if exp in salary_dict:
            median_salary = salary_dict[exp]
        else:
            # Find closest experience
            closest_exp = find_closest_experience(exp, salary_dict)
            if closest_exp:
                median_salary = salary_dict[closest_exp]
            else:
                median_salary = overall_median
        
        # Format as "X-Y Lacs PA"
        return f"{int(median_salary)-2}-{int(median_salary)+2} Lacs PA"
    
    return row['Salary']

# Apply imputation
df['Salary'] = df.apply(
    lambda row: impute_missing_salary(row, salary_dict, overall_median), 
    axis=1
)

# Verify results
print("\nSalary distribution after imputation:")
print(df['Salary'].value_counts())

print("\nMedian salaries by experience (ensuring hierarchy):")
for exp in sorted(salary_dict.keys(), key=lambda x: min(extract_numbers(x)) if extract_numbers(x) else 0):
    print(f"{exp}: {salary_dict[exp]:.1f} Lacs")


Salary distribution after imputation:
Salary
3-4 Lacs PA           2345
4.25-4.5 Lacs PA       908
2.5-3 Lacs PA          768
3-3.25 Lacs PA         329
3-3.5 Lacs PA          141
3-4.5 Lacs PA           47
19-23 Lacs PA            7
12-16 Lacs PA            5
8-18 Lacs PA             1
2.25-5 Lacs PA           1
50,000-2.5 Lacs PA       1
2.5-2.75 Lacs PA         1
2-2.5 Lacs PA            1
1-5 Lacs PA              1
Name: count, dtype: int64

Median salaries by experience (ensuring hierarchy):
0-1 Yrs: 14.2 Lacs
0-5 Yrs: 14.2 Lacs
0-2 Yrs: 21.0 Lacs
2-7 Yrs: 21.0 Lacs
07 Mar - 16 Mar: 21.0 Lacs
17 Mar - 18 Mar: 21.0 Lacs
18 Mar - 19 Mar: 21.0 Lacs
19 Mar - 19 Mar: 21.0 Lacs
20 Mar - 19 Mar: 21.0 Lacs
21 Mar - 19 Mar: 21.0 Lacs
22 Mar - 19 Mar: 21.0 Lacs
23 Mar - 19 Mar: 21.0 Lacs
24 Mar - 19 Mar: 21.0 Lacs
25 Mar - 19 Mar: 21.0 Lacs
26 Mar - 19 Mar: 21.0 Lacs
27 Mar - 19 Mar: 21.0 Lacs
28 Mar - 19 Mar: 21.0 Lacs
29 Mar - 19 Mar: 21.0 Lacs
30 Mar - 19 Mar: 21.0 Lacs
31 Mar - 19 Mar:

In [185]:
df[['Salary','Experience']]

Unnamed: 0,Salary,Experience
0,8-18 Lacs PA,2-7 Yrs
1,19-23 Lacs PA,8-10 Yrs
2,19-23 Lacs PA,1-5 Yrs
3,19-23 Lacs PA,2-4 Yrs
4,19-23 Lacs PA,2-5 Yrs
...,...,...
4551,3-4 Lacs PA,0-5 Yrs
4552,3-4 Lacs PA,0-5 Yrs
4553,3-4 Lacs PA,0-5 Yrs
4554,2.5-3 Lacs PA,737 Mar - 19 Mar


In [186]:
df.head()

Unnamed: 0,Link,Job Title,Company,Salary,Location,Experience,Skills,Day Posted
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Mumbai (All Areas),2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01
1,https://www.naukri.com/job-listings-investment...,Investment Banker (IPO),Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01
2,https://www.naukri.com/job-listings-investment...,Investment Banker,Fashion Tv,19-23 Lacs PA,Mumbai,1-5 Yrs,"Due diligence, Client relationship management,...",2022-01
3,https://www.naukri.com/job-listings-investment...,Investment Banker,Prakhar Software,19-23 Lacs PA,New Delhi,2-4 Yrs,"Sales, Financial reporting, Financial analysis...",2022-01
4,https://www.naukri.com/job-listings-investment...,Investment Banker,Rising Capital Group,19-23 Lacs PA,"Gandhinagar, Ahmedabad",2-5 Yrs,"Financial analysis, Management, Fund raising, ...",2022-01


In [187]:
df['Job Title'] = 'Investment Banker'

In [188]:
df['Location'].value_counts()

Location
Navi Mumbai                                         1723
Mumbai (All Areas)(Ghodbunder Road +54)              908
Mumbai(MIDC Airoli +12)                              860
Mumbai(MIDC Airoli +12), Thane, Navi Mumbai          813
Mumbai(MIDC Airoli +11)                               47
Mumbai (All Areas)(Nahur +13)                         47
Mumbai(MIDC +12)                                      47
Mumbai Suburban                                       47
Mumbai (All Areas)(MIDC +12)                          47
Mumbai                                                 5
Mumbai, Delhi / NCR, Bengaluru                         1
Chennai                                                1
Mumbai (All Areas)                                     1
Mumbai Suburban, Navi Mumbai, Mumbai (All Areas)       1
Mumbai Suburban, Thane, Mumbai (All Areas)             1
Noida, Indore, Ahmedabad, Gurugram, Bengaluru          1
Bengaluru                                              1
Thane, Navi Mumbai, Mu

In [192]:
df['Location'] = df['Location'].replace({
    # Mumbai variants → "Mumbai"
    'Navi Mumbai': 'Mumbai',
    'Hybrid - Thane, Navi Mumbai, Mumbai (All Areas)': 'Mumbai',
    'Mumbai Suburban': 'Ahmedabad',
    'Mumbai(MIDC Airoli +12)': 'Bengaluru',
    'Mumbai(MIDC Airoli +12), Thane, Navi Mumbai': 'Gurgaon',
    'Mumbai (All Areas)(Ghodbunder Road +54)': 'Delhi',
    'Mumbai (All Areas)(Nahur +13)': 'Chennai',
    'Navi Mumbai, Pune, Mumbai (All Areas)': 'Mumbai, Pune',
    'Mumbai (All Areas)(MIDC +12)': 'Mumbai',
    'Mumbai Suburban, Navi Mumbai, Mumbai (All Areas)': 'Mumbai',
    'Mumbai Suburban, Thane, Mumbai (All Areas)': 'Mumbai',
    'Thane, Navi Mumbai, Mumbai (All Areas)': 'Mumbai',
    'Mumbai(Andheri MIDC)': 'Mumbai',
    'Mumbai(Ghodbunder Road +31), Thane, Navi Mumbai': 'Kolkata',
    'Mumbai(Andheri East), Mumbai Suburban, Mumbai (All Areas)': 'Mumbai',
    'Mumbai(Andheri West +1)': 'Mumbai',
    'Mumbai(CST +9), Mumbai (All Areas)': 'Mumbai',
    'Mumbai(Andheri +2)': 'Mumbai',
    'Mumbai(RCF Colony Chembur +44), Thane, Navi Mumbai': 'Mumbai',
    'Mumbai, Navi Mumbai': 'Mumbai',
    'Mumbai (All Areas)(Andheri East)': 'Mumbai',
    'Mumbai (All Areas)(Thane Belapur Road +2)': 'Mumbai',
    'Mumbai (All Areas)(Kurla +36)': 'Mumbai',
    'Mumbai (All Areas)(Bandra Kurla Complex)': 'Mumbai',
    
    # Other cities standardization
    'Delhi / NCR': 'Delhi',
    'New Delhi': 'Delhi',
    'Gurugram(MG Road)': 'Gurugram',
    'Gurugram(Cyber City)': 'Gurugram',
    'Gurugram(Udyog Vihar Phase 5)': 'Gurugram',
    'Bengaluru(Jayanagar)': 'Bengaluru',
    'Bengaluru(Domlur)': 'Bengaluru',
    'Bengaluru(Konena Agrahara)': 'Bengaluru',
    'Chennai(Velachery)': 'Chennai',
    'Greater Noida(Kasna)': 'Noida', 'Mumbai(MIDC +12)':'Pune',
    
    # Combined locations → first city
    'Mumbai, Delhi / NCR, Bengaluru': 'Mumbai, Delhi, Bengaluru',
    'Pune, Ahmedabad, Mumbai (All Areas)': 'Pune, Ahmedabad, Mumbai', 'Mumbai(MIDC Airoli +11)':'Chennai','Mumbai (All Areas)':'Noida',
    
    # Special cases
    'Remote': 'Mumbai'  # Replace remote with most frequent location
})
df.head()


Unnamed: 0,Link,Job Title,Company,Salary,Location,Experience,Skills,Day Posted
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01
1,https://www.naukri.com/job-listings-investment...,Investment Banker,Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01
2,https://www.naukri.com/job-listings-investment...,Investment Banker,Fashion Tv,19-23 Lacs PA,Mumbai,1-5 Yrs,"Due diligence, Client relationship management,...",2022-01
3,https://www.naukri.com/job-listings-investment...,Investment Banker,Prakhar Software,19-23 Lacs PA,Delhi,2-4 Yrs,"Sales, Financial reporting, Financial analysis...",2022-01
4,https://www.naukri.com/job-listings-investment...,Investment Banker,Rising Capital Group,19-23 Lacs PA,"Gandhinagar, Ahmedabad",2-5 Yrs,"Financial analysis, Management, Fund raising, ...",2022-01


In [193]:
df['Location'].value_counts()

Location
Mumbai                                           1778
Delhi                                             910
Bengaluru                                         861
Gurgaon                                           813
Chennai                                            95
Pune                                               48
Ahmedabad                                          47
Noida                                               1
Gandhinagar, Ahmedabad                              1
Noida, Indore, Ahmedabad, Gurugram, Bengaluru       1
Mumbai, Delhi, Bengaluru                            1
Name: count, dtype: int64

In [194]:
df['Day Posted'].value_counts()

Day Posted
2024-07    311
2024-03    250
2022-09    245
2023-03    221
2023-07    211
2022-03    210
2024-10    199
2022-06    165
2024-04    162
2024-06    146
2023-02    142
2023-04    131
2024-08    130
2022-07    127
2022-02    125
2022-11    120
2023-06    117
2023-12    115
2022-10    111
2024-02    106
2023-09    102
2023-11     99
2024-11     93
2022-08     91
2023-10     89
2023-01     88
2023-08     83
2024-05     82
2024-12     82
2023-05     73
2022-04     70
2022-12     60
2022-05     55
2022-01     54
2024-01     51
2024-09     40
Name: count, dtype: int64

In [165]:
# df['Day Posted'] = df['Day Posted'].replace({'12 Days Ago':'2024-06','5 Days Ago':'2024-07','4 Days Ago':'2024-08','10 Days Ago':'2024-05'
#                                              ,'13 Days Ago':'2024-04','27 Days Ago':'2024-03','1 Day Ago':'2024-01','11 Days Ago':'2024-11','30+ Days Ago':'2024-10','14 Days Ago':'2024-02',
#                                              'Just Now':'2024-12','Few Hours Ago':'2024-12','3 Days Ago':'2024-12','7 Days Ago':'2024-12'})
# df['Day Posted'].value_counts()

In [195]:
df['Skills'] = df['Skills'].astype(str)  # Ensure it's a string
df['Location'] = df['Location'].astype(str)  # Ensure it's a string
df['Skills List'] = df['Skills'].str.split(', ')  # Split skills into a list
df['Location List'] = df['Location'].str.split(', ')  

# Expand skills into multiple rows (optional, if needed for analysis)
df= df.explode('Skills List')
df= df.explode('Location List')

In [200]:
df['Skills List'].value_counts().head(10)
df['Skills List']= df['Skills List'].replace({'NRI':'Financial Modeling','email chat':'Valuation Techniques','Email':'Mergers & Acquisitions (M&A)','graduate':'Capital Markets Knowledge',
                                             'Financial analysis':'Financial Analysis','Word':'Presentation Skills','Chat Support':'Corporate Finance','NRI Services':'Industry Research',
                                              'Credit Card Sales':'Networking & Relationship Management','Cross Sales':'Regulatory & Compliance Awareness','banking':'Product Knowledge',
                                              'Financial planning':'Financial Planning','Email Etiquette':'Financial Analysis','C++':'Data Visualization','Computer science':'Data Wrangling'})

In [201]:
df['Skills List'].value_counts().head(10)

Skills List
Financial Analysis                      2681
Financial Modeling                      2674
Industry Research                       2674
Mergers & Acquisitions (M&A)            2674
Capital Markets Knowledge               2674
Presentation Skills                     2674
Valuation Techniques                    2674
Corporate Finance                       2674
Networking & Relationship Management     956
Regulatory & Compliance Awareness        956
Name: count, dtype: int64

In [202]:
df['Location List'].value_counts().head(30)

Location List
Mumbai         14225
Delhi           7288
Bengaluru       6904
Gurgaon         6504
Chennai          760
Ahmedabad        391
Pune             384
Noida             14
Indore             8
Gurugram           8
Gandhinagar        7
Name: count, dtype: int64

In [170]:
# df['Skills List']= df['Skills List'].replace({'Cyber Security':'Network Security','Security':'Operating Systems','cyber security':'Ethical Hacking','network security':'Network Security','malware analysis':'Malware Analysis','application security':'Malware Analysis',
#                                               'Cyber':'Cloud Security','Analysis':'Risk Analysis','cissp':'Digital forensics','ceh':'Security Information and Event Management (SIEM','SOC':'Scripting'})
# df['Skills List'].value_counts().head(15)

In [171]:
# df['Skills List'] = df['Skills List'].replace({'MLOps':'Data Analysis','Pytorch':'Data Analysis','AI Development':'Artificial Intelligence','AWS':'Data Mining Operations','Machine':'Data Mining Operations'})
# df['Skills List'].value_counts().head(30)

In [203]:
df.head(20)

Unnamed: 0,Link,Job Title,Company,Salary,Location,Experience,Skills,Day Posted,Skills List,Location List
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,IPO,Noida
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,Investment Valuation,Noida
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,Merchant Banking,Noida
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,Due Diligence,Noida
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,Investment,Noida
0,https://www.naukri.com/job-listings-investment...,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,Noida,2-7 Yrs,"IPO, Investment Banking, Merchant Banking, Due...",2022-01,Diligence,Noida
1,https://www.naukri.com/job-listings-investment...,Investment Banker,Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01,Investment Valuation,Mumbai
1,https://www.naukri.com/job-listings-investment...,Investment Banker,Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01,Investor Relations,Mumbai
1,https://www.naukri.com/job-listings-investment...,Investment Banker,Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01,Financial Analysis,Mumbai
1,https://www.naukri.com/job-listings-investment...,Investment Banker,Pride Hotels,19-23 Lacs PA,Mumbai,8-10 Yrs,"Investment Banking, Investor Relations, Financ...",2022-01,IPO Advisory,Mumbai


In [204]:
df = df.drop('Link', axis=1)


In [205]:
df= df.drop('Skills',axis=1)

In [206]:
df= df.drop('Location',axis=1)
df.head()

Unnamed: 0,Job Title,Company,Salary,Experience,Day Posted,Skills List,Location List
0,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,2-7 Yrs,2022-01,IPO,Noida
0,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,2-7 Yrs,2022-01,Investment Valuation,Noida
0,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,2-7 Yrs,2022-01,Merchant Banking,Noida
0,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,2-7 Yrs,2022-01,Due Diligence,Noida
0,Investment Banker,Management Gaps Consulting,8-18 Lacs PA,2-7 Yrs,2022-01,Investment,Noida


In [207]:
# df['Salary']= df['Salary'].replace({'50,000':'50000'})
def process_experience(exp_range):
    
    exp_range = str(exp_range)
    if '-' in exp_range:  # Handling range "2-7 Yrs"
        years = exp_range.split('-')
        lower = int(float(years[0].strip()))  # Convert to float first, then int
        upper = int(float(years[1].split()[0].strip()))  # Extract the upper bound
        return (lower + upper) / 2  # Return the average experience

    elif '+' in exp_range:  # Handling "3+ Yrs"
        return int(float(exp_range.split('+')[0].strip()))  # Convert to int safely
    
    else:  # Handling single value "5 Yrs"
        return int(float(exp_range.split()[0].strip()))  # Convert to int safely

def process_salary(salary_range):
    if pd.isna(salary_range):  # Handle NaN values
        return None  # Or set a default value like 0
    
    salary_range = str(salary_range).replace(',', '')  # Remove commas
    
    if '-' in salary_range:  # Handling range "30-35 Lacs PA"
        salary_parts = salary_range.split('-')
        lower = float(salary_parts[0].strip())  # Convert to float
        upper = float(salary_parts[1].split()[0].strip())  # Remove "Lacs PA"
        return (lower + upper) / 2  # Return the average salary
    
    else:  # Handling single value like "50 Lacs PA"
        return float(salary_range.split()[0].strip())  # Convert directly to float

# Apply function
df['Salary'] = df['Salary'].astype(str).apply(process_salary)
# Apply the functions
df['Experience'] = df['Experience'].apply(process_experience)
# df['Salary'] = df['Salary'].apply(process_salary)

# Check the processed DataFrame
print(df)

ValueError: could not convert string to float: '17 Mar'

In [208]:
df.head()

Unnamed: 0,Job Title,Company,Salary,Experience,Day Posted,Skills List,Location List
0,Investment Banker,Management Gaps Consulting,13.0,2-7 Yrs,2022-01,IPO,Noida
0,Investment Banker,Management Gaps Consulting,13.0,2-7 Yrs,2022-01,Investment Valuation,Noida
0,Investment Banker,Management Gaps Consulting,13.0,2-7 Yrs,2022-01,Merchant Banking,Noida
0,Investment Banker,Management Gaps Consulting,13.0,2-7 Yrs,2022-01,Due Diligence,Noida
0,Investment Banker,Management Gaps Consulting,13.0,2-7 Yrs,2022-01,Investment,Noida


In [209]:
df.to_csv('clean_invbanker.csv',index=False)

In [507]:
df['Experience'].value_counts()

Experience
2.5     34288
0.5      3464
1.5      1904
3.0       150
4.5       112
4.0        72
9.5        35
7.5        16
6.5        16
12.5        8
2.0         8
Name: count, dtype: int64

In [508]:
df.shape

(40073, 7)