In [148]:
import pandas as pd   # package for data analysis
import numpy as np    # package for numerical computations

# libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# For Preprocessing, ML models and Evaluation
from sklearn.model_selection import train_test_split   # To split the dataset into train and test set

from sklearn.linear_model import LogisticRegression, LinearRegression     # Logistic regression model
from sklearn.ensemble import RandomForestClassifier

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder    # for converting categorical to numerical

from sklearn.metrics import f1_score    # for model evaluation

In [99]:
train_data = pd.read_csv('Final_Train_Dataset.csv')
train_data.head()

Unnamed: 0.1,Unnamed: 0,experience,job_description,job_desig,job_type,key_skills,location,salary,company_name_encoded
0,0,5-7 yrs,Exp: Minimum 5 years;Good understanding of IOC...,Senior Exploit and Vulnerability Researcher,,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),6to10,3687
1,1,10-17 yrs,He should have handled a team of atleast 5-6 d...,Head SCM,,"ppc, logistics, inventory management, supply c...",Sonepat,10to15,458
2,2,5-9 yrs,Must be an effective communicator (written & s...,Deputy Manager - Talent Management & Leadershi...,Analytics,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,15to25,4195
3,3,7-10 yrs,7 - 10 years of overall experience in data e...,Associate Manager Data Engineering,Analytics,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,10to15,313
4,4,1-3 yrs,Chartered Accountancy degree or MBA in Finance...,TS- GSA- Senior Analyst,,"accounting, finance, cash flow, financial plan...",Gurgaon,3to6,1305


In [100]:
test_data = pd.read_csv('Final_Test_Dataset.csv')
test_data.head()

Unnamed: 0,experience,job_description,job_desig,job_type,key_skills,location,company_name_encoded
0,7-12 yrs,Professional experience in Java/J2EE based ser...,IT Technology Senior Consultant/java/ J2ee/ Se...,,"Java, J2Ee, Tomcat, JBoss, Weblogic, Oracle, E...",Bengaluru,2066
1,0-5 yrs,We are looking for 20+ Fresher/Experienced Can...,Medical Billing Process | International KPO | ...,,"Medical Billing, Insurance Processing",Ahmedabad(Sola),2629
2,3-6 yrs,Should understand overall integration framewor...,Oracle Sales Cloud Functional Consultant,,"Oracle Sales, Functional Consultancy, Troubles...",Bengaluru,2448
3,0-3 yrs,,Looking For Freshers WHO WANT To Work WITH US,,"offline, online, internet, part time, home bas...","Delhi NCR, Chennai, Hyderabad, Gurgaon, Luckno...",2711
4,0-5 yrs,,Process Associate / Sr Process Associate / Tec...,,"voice support, analytical skills, Process asso...",Hyderabad,40


In [101]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19802 entries, 0 to 19801
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            19802 non-null  int64 
 1   experience            19802 non-null  object
 2   job_description       15384 non-null  object
 3   job_desig             19802 non-null  object
 4   job_type              4797 non-null   object
 5   key_skills            19801 non-null  object
 6   location              19802 non-null  object
 7   salary                19802 non-null  object
 8   company_name_encoded  19802 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


In [102]:
train_data.drop(['Unnamed: 0', 'job_description', 'job_desig', 'key_skills'], axis=1, inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19802 entries, 0 to 19801
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   experience            19802 non-null  object
 1   job_type              4797 non-null   object
 2   location              19802 non-null  object
 3   salary                19802 non-null  object
 4   company_name_encoded  19802 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 773.6+ KB


In [103]:
test_data.drop(['job_description', 'job_desig', 'key_skills'], axis=1, inplace=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6601 entries, 0 to 6600
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   experience            6601 non-null   object
 1   job_type              1637 non-null   object
 2   location              6601 non-null   object
 3   company_name_encoded  6601 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 206.4+ KB


### Dealing with missing values

In [104]:
train_data['job_type'].value_counts()

Analytics    3747
analytics     921
ANALYTICS      77
analytic       32
Analytic       20
Name: job_type, dtype: int64

In [105]:
train_data['job_type'] = train_data['job_type'].replace(to_replace=
                            ['Analytics', 'analytics', 'ANALYTICS', 'analytic', 'Analytic'], value='ANALYTICS')
train_data['job_type'].value_counts()

ANALYTICS    4797
Name: job_type, dtype: int64

In [106]:
test_data['job_type'] = test_data['job_type'].replace(to_replace=
                            ['Analytics', 'analytics', 'ANALYTICS', 'analytic', 'Analytic'], value='ANALYTICS')
test_data['job_type'].value_counts()

ANALYTICS    1637
Name: job_type, dtype: int64

In [107]:
train_data['job_type'].isnull().value_counts()

True     15005
False     4797
Name: job_type, dtype: int64

In [108]:
test_data['job_type'].isnull().value_counts()

True     4964
False    1637
Name: job_type, dtype: int64

In [109]:
train_data['job_type'].replace(np.nan, 'NON-ANALYTICS', inplace=True )

In [110]:
test_data['job_type'].replace(np.nan, 'NON-ANALYTICS', inplace=True )

In [111]:
train_data['job_type'].isnull().value_counts()

False    19802
Name: job_type, dtype: int64

In [112]:
test_data['job_type'].isnull().value_counts()

False    6601
Name: job_type, dtype: int64

In [113]:
train_data.isnull().sum()

experience              0
job_type                0
location                0
salary                  0
company_name_encoded    0
dtype: int64

In [114]:
test_data.isnull().sum()

experience              0
job_type                0
location                0
company_name_encoded    0
dtype: int64

In [115]:
train_data.head()

Unnamed: 0,experience,job_type,location,salary,company_name_encoded
0,5-7 yrs,NON-ANALYTICS,Delhi NCR(Vikas Puri),6to10,3687
1,10-17 yrs,NON-ANALYTICS,Sonepat,10to15,458
2,5-9 yrs,ANALYTICS,Delhi NCR,15to25,4195
3,7-10 yrs,ANALYTICS,Bengaluru,10to15,313
4,1-3 yrs,NON-ANALYTICS,Gurgaon,3to6,1305


In [116]:
train_data.experience.value_counts().sort_values(ascending=False)

5-10 yrs     1274
2-5 yrs      1188
3-8 yrs       922
2-7 yrs       832
4-9 yrs       678
             ... 
8-8 yrs         1
17-25 yrs       1
20-25 yrs       1
20-26 yrs       1
12-13 yrs       1
Name: experience, Length: 129, dtype: int64

In [117]:
test_data.experience.value_counts().sort_values(ascending=False)

5-10 yrs     419
2-5 yrs      416
3-8 yrs      336
2-7 yrs      267
3-5 yrs      249
            ... 
20-25 yrs      1
2-2 yrs        1
20-22 yrs      1
25-30 yrs      1
17-19 yrs      1
Name: experience, Length: 110, dtype: int64

In [118]:
years = train_data.experience[0].split(' ')[0]
int(years[0]) + int(years[2]) / 2

8.5

In [119]:
def convert_years(x):
    years = x.split(' ')[0]
    years = years.split('-')
    return int(years[0]) + int(years[1]) / 2

In [120]:
def convert_exp_years(x):
    years = x.split(' ')[0]
    years = years.split('-')
    return int(years[0]) + int(years[1]) / 2

In [121]:
print(convert_exp_years(test_data.experience[0]))

13.0


In [122]:
train_data['experience'] = train_data.experience.apply(lambda x: convert_years(x))

In [123]:
test_data['experience'] = test_data.experience.apply(lambda x: convert_exp_years(x))

In [124]:
train_data.head()

Unnamed: 0,experience,job_type,location,salary,company_name_encoded
0,8.5,NON-ANALYTICS,Delhi NCR(Vikas Puri),6to10,3687
1,18.5,NON-ANALYTICS,Sonepat,10to15,458
2,9.5,ANALYTICS,Delhi NCR,15to25,4195
3,12.0,ANALYTICS,Bengaluru,10to15,313
4,2.5,NON-ANALYTICS,Gurgaon,3to6,1305


In [125]:
test_data.head()

Unnamed: 0,experience,job_type,location,company_name_encoded
0,13.0,NON-ANALYTICS,Bengaluru,2066
1,2.5,NON-ANALYTICS,Ahmedabad(Sola),2629
2,6.0,NON-ANALYTICS,Bengaluru,2448
3,1.5,NON-ANALYTICS,"Delhi NCR, Chennai, Hyderabad, Gurgaon, Luckno...",2711
4,2.5,NON-ANALYTICS,Hyderabad,40


In [126]:
train_data.salary.value_counts()

10to15    4500
15to25    4125
6to10     3533
0to3      3245
3to6      2824
25to50    1575
Name: salary, dtype: int64

In [127]:
salary_map  = {'10to15':1,
              '15to25':2,
              '6to10':3,
              '0to3':4,
              '3to6':5,
              '25to50':6}
train_data['salary'] = train_data.salary.map(salary_map)
train_data.salary.value_counts()

1    4500
2    4125
3    3533
4    3245
5    2824
6    1575
Name: salary, dtype: int64

In [27]:
def convert_salary(x):
    salary = x.split('to')
    return (int(salary[0]) + int(salary[1]) / 2) * 100000

In [28]:
train_data['salary'] = train_data['salary'].apply(lambda x: convert_salary(x))

In [29]:
train_data.salary.value_counts()

1750000.0    4500
2750000.0    4125
1100000.0    3533
150000.0     3245
600000.0     2824
5000000.0    1575
Name: salary, dtype: int64

In [128]:
train_data.head()

Unnamed: 0,experience,job_type,location,salary,company_name_encoded
0,8.5,NON-ANALYTICS,Delhi NCR(Vikas Puri),3,3687
1,18.5,NON-ANALYTICS,Sonepat,1,458
2,9.5,ANALYTICS,Delhi NCR,2,4195
3,12.0,ANALYTICS,Bengaluru,1,313
4,2.5,NON-ANALYTICS,Gurgaon,5,1305


In [82]:
train_data.location.value_counts(ascending=True)

Noida(Sector-144 Noida)                                                                         1
Coimbatore, Kochi, Mysore                                                                       1
Delhi NCR, Bengaluru, Chennai, Hyderabad, Kolkata                                               1
Hyderabad(Begumpet+1)                                                                           1
alipurduar, bankura, barddhaman, birbhum, balurghat, darjiling, howrah, hugli, jalpaiguri       1
                                                                                             ... 
Hyderabad                                                                                    1083
Pune                                                                                         1193
Gurgaon                                                                                      1644
Mumbai                                                                                       2508
Bengaluru           

In [129]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'location' in train data. 
train_data['location']= label_encoder.fit_transform(train_data['location']) 
  
train_data['location'].unique()

array([ 597, 1412,  525, ...,  324,   65, 1117])

In [130]:
# Encode labels in column 'location' in test data. 
test_data['location']= label_encoder.fit_transform(test_data['location']) 
  
test_data['location'].unique()

array([ 56,  17, 321, 453, 194, 421, 793, 207, 546, 495, 232, 272, 439,
       704, 347, 413, 283, 303, 662, 667, 120, 628, 104, 271, 309, 365,
       336, 348, 212,  67, 632, 459, 408, 595, 516, 677, 265, 192, 422,
       190, 719, 565, 440, 138, 769,  15, 731, 326, 198, 751, 706, 466,
       467, 612, 732, 148, 123, 727, 779, 730, 101, 113, 131,  21, 545,
       340, 537, 534, 308,  60, 394, 796, 785, 737, 238, 405, 418, 226,
       181, 616, 512, 794, 221,  39, 211, 657, 629, 140, 475, 470,   4,
       563, 171, 330, 122,  73, 708,  53, 579, 367, 549, 747, 410, 111,
       588, 346, 485, 509,  44, 718, 536, 634, 331, 317, 318,  38, 357,
       761, 716, 332, 569, 748,   0, 322, 345, 598, 169, 778,   6, 266,
       765, 600, 184, 488, 699, 548, 547, 477, 768, 575, 275, 481, 191,
       788, 766,  97, 472, 582, 692, 143, 368, 576,  83, 615, 176, 542,
       776, 682,  48, 361, 596, 183, 524, 229, 556, 188, 508, 254, 772,
       220, 399, 126, 222,  34, 419, 301, 518, 312, 750, 432, 56

In [131]:
train_data.location.value_counts()

114     4168
1027    2508
811     1644
1301    1193
863     1083
        ... 
1205       1
1189       1
1173       1
1211       1
1495       1
Name: location, Length: 1504, dtype: int64

In [132]:
train_data.head()

Unnamed: 0,experience,job_type,location,salary,company_name_encoded
0,8.5,NON-ANALYTICS,597,3,3687
1,18.5,NON-ANALYTICS,1412,1,458
2,9.5,ANALYTICS,525,2,4195
3,12.0,ANALYTICS,114,1,313
4,2.5,NON-ANALYTICS,811,5,1305


In [133]:
test_data.head()

Unnamed: 0,experience,job_type,location,company_name_encoded
0,13.0,NON-ANALYTICS,56,2066
1,2.5,NON-ANALYTICS,17,2629
2,6.0,NON-ANALYTICS,56,2448
3,1.5,NON-ANALYTICS,321,2711
4,2.5,NON-ANALYTICS,453,40


In [134]:
dummies = pd.get_dummies(train_data.job_type)
train_data = pd.concat([train_data,dummies.drop('NON-ANALYTICS',axis='columns')],axis='columns')
train_data.head()

Unnamed: 0,experience,job_type,location,salary,company_name_encoded,ANALYTICS
0,8.5,NON-ANALYTICS,597,3,3687,0
1,18.5,NON-ANALYTICS,1412,1,458,0
2,9.5,ANALYTICS,525,2,4195,1
3,12.0,ANALYTICS,114,1,313,1
4,2.5,NON-ANALYTICS,811,5,1305,0


In [135]:
dummies = pd.get_dummies(test_data.job_type)
test_data = pd.concat([test_data,dummies.drop('NON-ANALYTICS',axis='columns')],axis='columns')
test_data.head()

Unnamed: 0,experience,job_type,location,company_name_encoded,ANALYTICS
0,13.0,NON-ANALYTICS,56,2066,0
1,2.5,NON-ANALYTICS,17,2629,0
2,6.0,NON-ANALYTICS,56,2448,0
3,1.5,NON-ANALYTICS,321,2711,0
4,2.5,NON-ANALYTICS,453,40,0


In [39]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19802 entries, 0 to 19801
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   experience            19802 non-null  float64
 1   job_type              19802 non-null  object 
 2   location              19802 non-null  int32  
 3   salary                19802 non-null  float64
 4   company_name_encoded  19802 non-null  int64  
 5   ANALYTICS             19802 non-null  uint8  
dtypes: float64(2), int32(1), int64(1), object(1), uint8(1)
memory usage: 715.6+ KB


In [136]:
train_data.drop(['job_type'], axis=1, inplace=True)

In [137]:
test_data.drop(['job_type'], axis=1, inplace=True)

In [138]:
X = train_data.drop('salary', axis=1)
y = train_data['salary']

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

In [140]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [141]:
model.score(X_test, y_test)

0.01965957700329346

In [142]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

LogisticRegression()

In [143]:
log_model.score(X_test, y_test)

0.3281999495077001

In [149]:
rfg_model = RandomForestClassifier()
rfg_model.fit(X_train, y_train)

RandomForestClassifier()

In [150]:
rfg_model.score(X_test, y_test)

0.3673314819490028

In [151]:
y_rfc = rfg_model.predict(X_test)

In [152]:
test_predict = rfg_model.predict(test_data)
test_predict

array([2, 4, 1, ..., 6, 2, 4], dtype=int64)

In [153]:
y_test_pred = pd.DataFrame(test_predict,columns=[   'SECTION'   ])
y_test_pred.to_csv('submission.csv', index=False)

In [154]:
pred_data = pd.read_csv('submission.csv')
pred_data.head()

Unnamed: 0,SECTION
0,2
1,4
2,1
3,4
4,4


In [157]:
salary_map  = {1:'10to15',
               2:'15to25',
               3:'6to10',
               4:'0to3',
               5:'3to6',
               6:'25to50'}
y_test_pred[   'SECTION'   ] = y_test_pred[   'SECTION'   ].map(salary_map)
y_test_pred.head()

Unnamed: 0,SECTION
0,15to25
1,0to3
2,10to15
3,0to3
4,0to3


In [160]:
y_test_pred.columns = [   'salary'   ]
y_test_pred.to_csv("submission.csv")