In [1]:
import pandas as pd

In [2]:
# Load the `numpy_html` IPython extension into this kernel session.
# NOTE(review): presumably this gives NumPy arrays an HTML table repr in the
# notebook — confirm; the `numpy_html` package must be installed for this to run.
%load_ext numpy_html

In [3]:
%%html
<style>
/* Notebook-wide table styling: the selectors below are unscoped, so they apply
   to every table rendered on the page. */
/* Add vertical space above each table. */
table {margin-top: 16pt;}
/* Semi-bold sans-serif cell text; allow long cell values to wrap. */
td {font-family: Arial, Helvetica, sans-serif !important; font-weight: 600; white-space: normal !important;}
/* Zebra striping: two shades of blue with black text on alternating rows. */
tr:nth-child(even) {background: #b0d9eb !important; color: black}
tr:nth-child(odd) {background: #8fb1bf !important; color: black}
</style>

In [4]:
# Load the raw salary records from the CSV file (path is relative to the
# notebook's working directory) and preview the first rows.
salaries_csv_path = "data_salaries_india.csv"
df = pd.read_csv(salaries_csv_path)
df.head(5)

Unnamed: 0,Company Name,Job Title,Role,Salaries Reported,Location,Salary
0,Axis Bank,Data Scientist,Specialist,3,Bangalore,"₹11,71,687/yr"
1,Globex Digital Solutions,Machine Learning Engineer,Manager,1,Hyderabad,"₹32,57,305/yr"
2,Tempworks Services,Data Scientist,Manager,1,Bangalore,"₹58,11,205/yr"
3,BlueOptima,Machine Learning Engineer,Specialist,2,Bangalore,"₹11,80,965/yr"
4,Dell Technologies,Data Analyst,Specialist,5,New Delhi,"₹6,02,930/yr"


In [5]:
# Strip the currency symbol, the "/yr" suffix and the thousands separators so
# that only digits remain in Salary (e.g. "₹11,71,687/yr" -> "1171687").
# regex=False is passed explicitly: every pattern here is a plain literal, and
# the default value of `regex` differs between pandas versions.
# The column is still a string column after this cell.
df['Salary'] = (
    df['Salary']
    .str.replace('₹', '', regex=False)
    .str.replace('/yr', '', regex=False)
    .str.replace(',', '', regex=False)
)
df.head()

Unnamed: 0,Company Name,Job Title,Role,Salaries Reported,Location,Salary
0,Axis Bank,Data Scientist,Specialist,3,Bangalore,1171687
1,Globex Digital Solutions,Machine Learning Engineer,Manager,1,Hyderabad,3257305
2,Tempworks Services,Data Scientist,Manager,1,Bangalore,5811205
3,BlueOptima,Machine Learning Engineer,Specialist,2,Bangalore,1180965
4,Dell Technologies,Data Analyst,Specialist,5,New Delhi,602930


In [6]:
# Turn the cleaned digit strings into integers so they can be used in arithmetic.
annual_salary = df['Salary'].astype(int)
df['Salary'] = annual_salary

# Derived pay rates from the annual figure.
df['Monthly Salary'] = annual_salary / 12
# NOTE(review): 2080 presumably stands for 40 h/week x 52 weeks — confirm.
df['Hourly Salary'] = annual_salary / 2080
df.head()

Unnamed: 0,Company Name,Job Title,Role,Salaries Reported,Location,Salary,Monthly Salary,Hourly Salary
0,Axis Bank,Data Scientist,Specialist,3,Bangalore,1171687,97640.583333,563.311058
1,Globex Digital Solutions,Machine Learning Engineer,Manager,1,Hyderabad,3257305,271442.083333,1566.012019
2,Tempworks Services,Data Scientist,Manager,1,Bangalore,5811205,484267.083333,2793.848558
3,BlueOptima,Machine Learning Engineer,Specialist,2,Bangalore,1180965,98413.75,567.771635
4,Dell Technologies,Data Analyst,Specialist,5,New Delhi,602930,50244.166667,289.870192


In [7]:
job_title_column = df['Job Title']

# Relative frequency of every job title.
balance_job_titles = job_title_column.value_counts(normalize=True)
# Absolute counts of the five most frequent job titles (shown below).
common_job_titles = job_title_column.value_counts().iloc[:5]
common_job_titles

Job Title
Data Scientist               2165
Data Analyst                 1167
Data Engineer                 948
Machine Learning Engineer     683
Data Science                   54
Name: count, dtype: int64

In [8]:
company_column = df['Company Name']

# Relative frequency of every company name.
balance_companies = company_column.value_counts(normalize=True)
# Absolute counts of the five most frequent company names (shown below).
common_companies = company_column.value_counts().iloc[:5]
common_companies

Company Name
Tata Consultancy Services    51
Accenture                    39
Amazon                       35
Fresher                      31
First Student                30
Name: count, dtype: int64

In [9]:
location_column = df['Location']

# Relative frequency of every location.
balance_locations = location_column.value_counts(normalize=True)
# Absolute counts of the five most frequent locations (shown below).
common_locations = location_column.value_counts().iloc[:5]
common_locations

Location
Bangalore    1858
Pune          966
Hyderabad     786
New Delhi     781
Mumbai        736
Name: count, dtype: int64

In [10]:
# Rename one job title. Only cells whose whole value equals the old title are
# changed, so variants such as "Machine Learning Data Associate I" / "... II"
# keep their original spelling.
df['Job Title'] = df['Job Title'].replace(
    'Machine Learning Data Associate',
    'Machine Learning Associate',
)

In [11]:
# Number of rows per job title, most frequent first.
job_title_counts = df.loc[:, 'Job Title'].value_counts()
job_title_counts

Job Title
Data Scientist                              2165
Data Analyst                                1167
Data Engineer                                948
Machine Learning Engineer                    683
Data Science                                  54
Senior Data Scientist                         45
Junior Data Scientist                         22
Senior Machine Learning Engineer              10
Lead Data Scientist                            6
Software Engineer - Machine Learning           3
Machine Learning Engineer/Data Scientist       2
Machine Learning Data Associate II             2
Machine Learning Associate                     2
Machine Learning Developer                     2
Machine Learning Consultant                    2
Machine Learning Scientist                     2
Data Scientist - Trainee                       2
Data Science Consultant                        1
Machine Learning Data Associate I              1
Data Science Associate                         1
National D

In [12]:
# Number of rows per company name, most frequent first.
company_counts = df.loc[:, 'Company Name'].value_counts()
company_counts

Company Name
Tata Consultancy Services    51
Accenture                    39
Amazon                       35
Fresher                      31
First Student                30
                             ..
SMARTe                        1
FSN Ecommerce Ventures        1
Sarvvid-AI                    1
BCA                           1
Rejolut                       1
Name: count, Length: 2523, dtype: int64

In [13]:
# Number of rows per location, most frequent first.
location_counts = df.loc[:, 'Location'].value_counts()
location_counts

Location
Bangalore    1858
Pune          966
Hyderabad     786
New Delhi     781
Mumbai        736
Name: count, dtype: int64

In [14]:
# Quartile boundaries (25th, 50th and 75th percentile) of the annual salary.
quantiles = df['Salary'].quantile([0.25, 0.5, 0.75])
q1, q2, q3 = (quantiles[level] for level in (0.25, 0.5, 0.75))

# Assign every salary to one of four quartile bands. The outer edges are
# infinite so that the minimum and maximum salaries are always covered.
salary_bin_edges = [float('-inf'), q1, q2, q3, float('inf')]
salary_bin_labels = ['Bottom', 'Below Average', 'Above Average', 'Top']
df['Salary Category'] = pd.cut(df['Salary'], bins=salary_bin_edges, labels=salary_bin_labels)
df.head()

Unnamed: 0,Company Name,Job Title,Role,Salaries Reported,Location,Salary,Monthly Salary,Hourly Salary,Salary Category
0,Axis Bank,Data Scientist,Specialist,3,Bangalore,1171687,97640.583333,563.311058,Above Average
1,Globex Digital Solutions,Machine Learning Engineer,Manager,1,Hyderabad,3257305,271442.083333,1566.012019,Top
2,Tempworks Services,Data Scientist,Manager,1,Bangalore,5811205,484267.083333,2793.848558,Top
3,BlueOptima,Machine Learning Engineer,Specialist,2,Bangalore,1180965,98413.75,567.771635,Above Average
4,Dell Technologies,Data Analyst,Specialist,5,New Delhi,602930,50244.166667,289.870192,Below Average


In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'Role': each distinct role becomes its own 0/1 column
# and the original text column is removed from the frame.
variables = ['Role']

encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")  # return a DataFrame instead of an array
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df.drop(columns=variables), one_hot_encoded], axis=1)

In [16]:
# One-hot encode 'Salary Category': each salary band becomes its own 0/1
# column and the original categorical column is removed from the frame.
variables = ['Salary Category']

encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")  # return a DataFrame instead of an array
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df.drop(columns=variables), one_hot_encoded], axis=1)

In [17]:
# One-hot encode 'Job Title': each distinct title becomes its own 0/1
# column and the original text column is removed from the frame.
variables = ['Job Title']

encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")  # return a DataFrame instead of an array
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df.drop(columns=variables), one_hot_encoded], axis=1)

In [18]:
# Remove the free-text columns and the raw/derived salary amounts before the
# correlation step.
columns_to_drop = [
    'Company Name',
    'Location',
    'Salary',
    'Monthly Salary',
    'Hourly Salary',
]
df = df.drop(columns_to_drop, axis=1)

In [19]:
# Pairwise Pearson correlation between all remaining columns.
correlation = df.corr(method='pearson')

# Highlight in green the cells at or above the 0.9 quantile; axis=None makes
# the quantile be computed over the whole table rather than per row/column.
correlation_styler = correlation.style
correlation_styler.highlight_quantile(q_left=0.9, axis=None, color="green")

Unnamed: 0,Salaries Reported,Role_Manager,Role_Specialist,Salary Category_Above Average,Salary Category_Below Average,Salary Category_Bottom,Salary Category_Top,Job Title_Associate Machine Learning Engineer,Job Title_CEO,Job Title_Data Analyst,Job Title_Data Engineer,Job Title_Data Science,Job Title_Data Science Associate,Job Title_Data Science Consultant,Job Title_Data Science Lead,Job Title_Data Science Manager,Job Title_Data Scientist,Job Title_Data Scientist - Trainee,Job Title_Junior Data Scientist,Job Title_Lead Data Scientist,Job Title_Machine Learning Associate,Job Title_Machine Learning Consultant,Job Title_Machine Learning Data Analyst,Job Title_Machine Learning Data Associate I,Job Title_Machine Learning Data Associate II,Job Title_Machine Learning Developer,Job Title_Machine Learning Engineer,Job Title_Machine Learning Engineer/Data Scientist,Job Title_Machine Learning Scientist,Job Title_Machine Learning Software Engineer,Job Title_National Director,Job Title_Senior Data Scientist,Job Title_Senior Machine Learning Engineer,Job Title_Software Engineer - Machine Learning
Salaries Reported,1.0,-0.114675,0.114675,0.017187,0.119996,-0.047511,-0.089668,0.001316,-0.004494,0.169879,-0.011231,-0.030414,0.018747,0.007127,-0.001589,-0.001589,-0.058486,-0.004302,-0.019881,-0.00864,0.079931,-0.006356,0.001316,0.018747,0.001862,-0.006356,-0.094381,-0.006356,-0.006356,-0.004494,-0.004494,-0.026362,-0.000431,-0.007785
Role_Manager,-0.114675,1.0,-1.0,-0.059484,-0.184453,-0.20175,0.445671,-0.006046,0.032268,-0.005287,0.000569,0.012994,-0.006046,-0.006046,-0.006046,-0.006046,-0.009337,0.018544,0.004327,-0.014816,-0.008551,-0.008551,-0.006046,-0.006046,-0.008551,-0.008551,0.017679,0.018544,-0.008551,-0.006046,0.032268,-0.006314,-0.007008,0.011652
Role_Specialist,0.114675,-1.0,1.0,0.059484,0.184453,0.20175,-0.445671,0.006046,-0.032268,0.005287,-0.000569,-0.012994,0.006046,0.006046,0.006046,0.006046,0.009337,-0.018544,-0.004327,0.014816,0.008551,0.008551,0.006046,0.006046,0.008551,0.008551,-0.017679,-0.018544,0.008551,0.006046,-0.032268,0.006314,0.007008,-0.011652
Salary Category_Above Average,0.017187,-0.059484,0.059484,1.0,-0.333247,-0.333247,-0.333247,-0.008061,-0.008061,-0.091954,0.041939,-0.050717,0.024201,-0.008061,-0.008061,-0.008061,0.066647,-0.011401,-0.017209,-0.019755,-0.011401,0.011414,-0.008061,-0.008061,-0.011401,-0.011401,-0.007491,-0.011401,-0.011401,0.024201,-0.008061,0.003654,0.005121,-0.013965
Salary Category_Below Average,0.119996,-0.184453,0.184453,-0.333247,1.0,-0.33342,-0.33342,0.024189,-0.008065,0.102257,0.02431,-0.00663,-0.008065,-0.008065,-0.008065,-0.008065,-0.069629,0.011402,0.017219,-0.019765,-0.011407,-0.011407,-0.008065,-0.008065,-0.011407,0.011402,-0.035501,0.011402,-0.011407,-0.008065,-0.008065,-0.039849,-0.015318,-0.013972
Salary Category_Bottom,-0.047511,-0.20175,0.20175,-0.333247,-0.33342,1.0,-0.33342,-0.008065,-0.008065,0.171006,-0.068505,0.103672,-0.008065,-0.008065,-0.008065,-0.008065,-0.164467,-0.011407,0.010329,-0.019765,0.034212,0.011402,0.024189,0.024189,0.034212,0.011402,0.086444,-0.011407,0.034212,-0.008065,-0.008065,-0.049507,-0.015318,-0.013972
Salary Category_Top,-0.089668,0.445671,-0.445671,-0.333247,-0.33342,-0.33342,1.0,-0.008065,0.024189,-0.181334,0.002267,-0.046338,-0.008065,0.024189,0.024189,0.024189,0.167466,0.011402,-0.010343,0.059279,-0.011407,-0.011407,-0.008065,-0.008065,-0.011407,-0.011407,-0.043454,0.011402,-0.011407,-0.008065,0.024189,0.085702,0.025516,0.041904
Job Title_Associate Machine Learning Engineer,0.001316,-0.006046,0.006046,-0.008061,0.024189,-0.008065,-0.008065,1.0,-0.000195,-0.007582,-0.006652,-0.001441,-0.000195,-0.000195,-0.000195,-0.000195,-0.011941,-0.000276,-0.000917,-0.000478,-0.000276,-0.000276,-0.000195,-0.000195,-0.000276,-0.000276,-0.005476,-0.000276,-0.000276,-0.000195,-0.000195,-0.001314,-0.000617,-0.000338
Job Title_CEO,-0.004494,0.032268,-0.032268,-0.008061,-0.008065,-0.008065,0.024189,-0.000195,1.0,-0.007582,-0.006652,-0.001441,-0.000195,-0.000195,-0.000195,-0.000195,-0.011941,-0.000276,-0.000917,-0.000478,-0.000276,-0.000276,-0.000195,-0.000195,-0.000276,-0.000276,-0.005476,-0.000276,-0.000276,-0.000195,-0.000195,-0.001314,-0.000617,-0.000338
Job Title_Data Analyst,0.169879,-0.005287,0.005287,-0.091954,0.102257,0.171006,-0.181334,-0.007582,-0.007582,1.0,-0.258557,-0.056008,-0.007582,-0.007582,-0.007582,-0.007582,-0.464114,-0.010724,-0.035637,-0.018582,-0.010724,-0.010724,-0.007582,-0.007582,-0.010724,-0.010724,-0.212819,-0.010724,-0.010724,-0.007582,-0.007582,-0.051083,-0.023998,-0.013135


<b></b>

<b>I used OneHotEncoder in order to see the correlations between specific Job Titles/Roles and the Salary Category.
Surprisingly, the strongest correlation is between the Top Salary Category and Role_Manager (≈0.45), so managers get paid pretty well.
Even more surprisingly, the second strongest correlation (≈0.20) is between Role_Specialist and the Bottom Salary Category,
so Specialists do not seem to earn much nowadays... (or it's just the data)</b>