In [3]:
import warnings

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.utils import resample

warnings.simplefilter(action='ignore')

In [4]:
salary_data = pd.read_csv("cleaned_salary_data.csv")

In [5]:
x = salary_data.Location.value_counts()

In [6]:
# Collect every location whose row count is 20 or fewer; these are the
# candidates for being merged into a single bucket.
list_of_others = [location for location, count in x.items() if count <= 20]

In [7]:
# Relabel every rare location as 'Others' with a single boolean-mask assignment.
# Unlike a positional loop over range(len(...)) with .loc[i, ...], this does not
# depend on the frame having a default 0..n-1 index, and it avoids one Python-level
# list lookup per row.
rare_location_mask = salary_data['Location'].isin(list_of_others)
salary_data.loc[rare_location_mask, 'Location'] = 'Others'

In [8]:
salary_data = salary_data[salary_data['Location'] != 'None']

In [9]:
salary_data.Location.value_counts()

Others        352
Bengaluru     228
Pune          110
Hyderabad      99
Mumbai         89
Jaipur         83
Chennai        74
Ahmedabad      69
Delhi          63
Gurgaon        52
Noida          50
Indore         36
Kochi          34
Kolkata        34
Coimbatore     32
Mohali         30
New Delhi      30
India          28
Surat          25
Name: Location, dtype: int64

In [10]:
salary_data.shape

(1518, 5)

In [11]:
# Keep only rows whose salary lies in the plausible range [10000, 900000].
# This covers every case the previous string-length checks targeted (zero
# salaries, fewer than 5 integer digits, more than 7 integer digits, and values
# above 900000) but compares numerically, so negative values and values whose
# str() form uses scientific notation can no longer slip through. NaN salaries
# are dropped as well, because between() evaluates to False for them.
MIN_SALARY = 10000
MAX_SALARY = 900000
# .copy() gives an independent frame, so later column assignments do not write
# into a slice of the unfiltered data, and re-running this cell is harmless.
salary_data = salary_data[salary_data['Salary'].between(MIN_SALARY, MAX_SALARY)].copy()

In [12]:
salary_data.shape

(1352, 5)

In [13]:
min(salary_data['Salary'])

10000.0

In [14]:
max(salary_data['Salary'])

900000.0

In [15]:
#function to classify salary as High, Medium and low
def categorize_salary(salary, low_max=250000, medium_max=420000):
    """Map a numeric salary to the label 'Low', 'Medium' or 'High'.

    Parameters
    ----------
    salary : int or float
        Salary value to classify.
    low_max : int or float, optional
        Inclusive upper bound of the 'Low' band (default 250000).
    medium_max : int or float, optional
        Inclusive upper bound of the 'Medium' band (default 420000).

    Returns
    -------
    str
        'Low' if salary <= low_max, 'Medium' if low_max < salary <= medium_max,
        otherwise 'High'. A value that compares False against both bounds
        (for example NaN) therefore also ends up as 'High'.
    """
    if salary <= low_max:
        return 'Low'
    # Reaching this point already implies salary > low_max, so only the upper
    # bound of the 'Medium' band needs to be tested.
    if salary <= medium_max:
        return 'Medium'
    return 'High'

In [16]:
# Replace the numeric salary with its Low/Medium/High label. The values are
# passed to categorize_salary directly; converting them to str and back to float
# first adds nothing.
# NOTE: this overwrites the numeric column, so the cell cannot be run a second
# time without reloading and re-filtering the data.
salary_data['Salary'] = salary_data['Salary'].apply(categorize_salary)

salary_data['Salary'].value_counts(normalize = True)

Medium    0.360947
Low       0.352811
High      0.286243
Name: Salary, dtype: float64

### Trying to predict salary with Titles and Location using SVM

In [17]:
X_titles = salary_data['Title']

In [19]:
X_titles.shape

(1352,)

In [20]:
custom_stop_words = ['role','risk','specialist','company','program','multiple','process','machine','data']

In [21]:
# Build the stop-word list: NLTK's English list plus the project-specific words.
# The result is bound to `stopwords` because create_tfidf_vec reads that name.
# It is built from the `nltk_stopwords` alias (imported at the top of the
# notebook) rather than from `stopwords` itself, so re-running this cell keeps
# working after the name has been rebound to a list and does not append the
# custom words a second time.
stopwords = nltk_stopwords.words('english') + custom_stop_words

In [22]:
def create_tfidf_vec(data):
    """Fit a new TF-IDF vectorizer on `data` and return the weights as a DataFrame.

    Parameters
    ----------
    data : iterable of str
        Text documents to vectorize (e.g. a Series of job titles).

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per unigram/bigram kept by the
        vectorizer. The row index is a fresh 0..n-1 range, not the index of `data`.

    Notes
    -----
    Uses the notebook-level `stopwords` list. A new vectorizer is fitted on every
    call, so the vocabulary and IDF weights depend only on the `data` passed in.
    """
    tfidf = TfidfVectorizer(stop_words=stopwords,max_df=1000,min_df=1,sublinear_tf=True,ngram_range=(1,2))
    weights = tfidf.fit_transform(data)
    # get_feature_names() is gone from recent scikit-learn releases in favour of
    # get_feature_names_out(); older releases only have the former. Use whichever
    # the installed version provides.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    # toarray() yields a plain ndarray; todense() would yield the legacy np.matrix type.
    X_vec = pd.DataFrame(weights.toarray(), columns=feature_names)
    return X_vec

In [23]:
X_titles_vec = create_tfidf_vec(X_titles)

In [24]:
X_titles_vec.shape

(1352, 22)

In [25]:
X_titles_vec.columns

Index(['analyst', 'app', 'app developer', 'cloud', 'cloud devops', 'devops',
       'devops engineer', 'engineer', 'full', 'full stack', 'learning',
       'learning engineer', 'research', 'research analyst', 'scientist',
       'scientist analyst', 'software', 'software developer', 'stack',
       'stack developer', 'web', 'web developer'],
      dtype='object')

In [26]:
#Use these totals to build your own custom stop-word list: remove the words that carry little information by adding them
#to the stop words.
word_counts = X_titles_vec.sum(axis=0)
word_counts.sort_values(ascending = False).head(100)

web developer         398.101118
web                   398.101118
software developer    188.797511
software              188.797511
stack developer       124.500000
stack                 124.500000
full                  124.500000
full stack            124.500000
app                   104.651804
app developer         104.651804
analyst                31.529979
engineer               28.686654
scientist              24.328718
scientist analyst      24.328718
devops engineer        18.698100
devops                 18.698100
cloud devops           18.698100
cloud                  18.698100
learning engineer      14.746329
learning               14.746329
research               11.742687
research analyst       11.742687
dtype: float64

In [27]:
salary_data['Salary'].value_counts()

Medium    488
Low       477
High      387
Name: Salary, dtype: int64

In [28]:
#function to resample class to make the data balanced

def upsample_class(salary_data, n_samples=420, random_state=47):
    """Return a copy of the data in which the 'High' salary class is upsampled.

    Parameters
    ----------
    salary_data : pandas.DataFrame
        Frame with a 'Salary' column holding the labels 'Low', 'Medium', 'High'.
    n_samples : int, optional
        Number of 'High' rows to draw with replacement (default 420).
        The default is a fixed target; it is not derived from the size of the
        largest class.
    random_state : int, optional
        Seed passed to `resample` so the draw is reproducible (default 47).

    Returns
    -------
    pandas.DataFrame
        All 'Low' rows, then the resampled 'High' rows, then all 'Medium' rows.
        The original index labels are kept, so labels of 'High' rows may repeat.
    """
    labels = salary_data['Salary']
    df_low = salary_data[labels == 'Low']
    df_medium = salary_data[labels == 'Medium']
    df_high = salary_data[labels == 'High']

    # Draw the 'High' rows with replacement up to the requested count.
    df_high_upsampled = resample(df_high,
                                 replace=True,
                                 n_samples=n_samples,
                                 random_state=random_state)

    df_upsampled = pd.concat([df_low, df_high_upsampled, df_medium])

    return df_upsampled

In [29]:
salary_data.shape

(1352, 5)

In [30]:
df_upsampled = upsample_class(salary_data)
df_upsampled.shape

(1385, 5)

In [31]:
#preprocess the text of resampled dataframe
X_upsampled_titles = df_upsampled['Title']

In [32]:
X_upsampled_titles.shape

(1385,)

In [33]:
#generate the Tfidf vector from resampled dataframe
X_upsampled_titles_vec = create_tfidf_vec(X_upsampled_titles)
X_upsampled_titles_vec.shape

(1385, 22)

In [34]:
df_upsampled['Salary'].shape

(1385,)

In [35]:
df_upsampled.reset_index(drop = True, inplace = True)

In [36]:
df_upsampled.head()

Unnamed: 0,Title,Location,Company,Salary,Description
0,App Developer,Gurgaon,Atechnos,Low,profile abhay techno service pvt ltd atechnos ...
1,App Developer,Mohali,Mansa Infotech® Pvt. Ltd.,Low,motivated talented android developer talented ...
2,App Developer,Pune,Impel Task HR Pvt Ltd,Low,requirement android developer jd follows 1 2 p...
3,App Developer,Noida,FoundLay Technologies Pvt. Ltd.,Low,immediate joining 6 month hand mobile app mob...
4,Data Scientist and Analyst,Others,Indian Elites Outsourcing Services,Low,perform market research activity market mappin...


In [37]:
#getting dummies of location
df_upsampled_location = pd.get_dummies(df_upsampled['Location'])

In [38]:
X_upsampled_titles_vec.shape, df_upsampled_location.shape

((1385, 22), (1385, 19))

In [39]:
df_upsampled_location.reset_index(drop=True,inplace=True)

In [42]:
df_upsampled_location.columns

Index(['Ahmedabad', 'Bengaluru', 'Chennai', 'Coimbatore', 'Delhi', 'Gurgaon',
       'Hyderabad', 'India', 'Indore', 'Jaipur', 'Kochi', 'Kolkata', 'Mohali',
       'Mumbai', 'New Delhi', 'Noida', 'Others', 'Pune', 'Surat'],
      dtype='object')

In [43]:
#concatenating job titles vector with location
X_upsampled_titles_location_vec = pd.concat([X_upsampled_titles_vec,df_upsampled_location],axis=1,ignore_index=True)

In [44]:
X_upsampled_titles_location_vec.shape

(1385, 41)

In [45]:
df_upsampled_location.reset_index(drop=True,inplace=True)

In [46]:
#concatenating job titles vector with location
X_upsampled_titles_location_vec = pd.concat([X_upsampled_titles_vec,df_upsampled_location],axis=1,ignore_index=True)

In [47]:
X_upsampled_titles_location_vec.shape

(1385, 41)

In [48]:
df_upsampled['Salary'].shape

(1385,)

In [49]:
# Feature names in the same order as the columns of the training matrix.
# The title part is taken from X_upsampled_titles_vec (the matrix that is actually
# concatenated into the training features), not from X_titles_vec: each call to
# create_tfidf_vec fits its own vocabulary, and because max_df is an absolute
# document count the two vocabularies are not guaranteed to be identical.
col_names = list(X_upsampled_titles_vec.columns) + list(df_upsampled_location.columns)
col_names

['analyst',
 'app',
 'app developer',
 'cloud',
 'cloud devops',
 'devops',
 'devops engineer',
 'engineer',
 'full',
 'full stack',
 'learning',
 'learning engineer',
 'research',
 'research analyst',
 'scientist',
 'scientist analyst',
 'software',
 'software developer',
 'stack',
 'stack developer',
 'web',
 'web developer',
 'Ahmedabad',
 'Bengaluru',
 'Chennai',
 'Coimbatore',
 'Delhi',
 'Gurgaon',
 'Hyderabad',
 'India',
 'Indore',
 'Jaipur',
 'Kochi',
 'Kolkata',
 'Mohali',
 'Mumbai',
 'New Delhi',
 'Noida',
 'Others',
 'Pune',
 'Surat']

In [52]:
svm_model = SVC(kernel='rbf',C=5,gamma=0.5)

In [53]:
# Train the SVM on the full upsampled feature matrix (title TF-IDF + location dummies).
X = X_upsampled_titles_location_vec
Y = df_upsampled['Salary']

svm_model.fit(X,Y)


# NOTE: this score is computed on the same X, Y the model was just fitted on, so it
# is training accuracy only and says nothing about performance on unseen data.
training_score = svm_model.score(X,Y)
# The evaluation lines below are disabled; X_train/X_test/y_train/y_test and
# cross_val_score are not defined or imported in the cells shown here, so they
# would need a train/test split and an import before they can be re-enabled.
#testing_score = svm_model.score(X_test,y_test)
#cv_train_score = np.mean(cross_val_score(svm_model,X_train,y_train,cv=5,n_jobs=3))
#cv_test_score = np.mean(cross_val_score(svm_model,X_test,y_test,cv=5,n_jobs=3))
#y_pred = svm_model.predict(X_test)

In [54]:
training_score

0.5725631768953069

In [55]:
# sklearn.externals.joblib has been removed from recent scikit-learn releases;
# prefer the standalone joblib package and fall back to the bundled copy only on
# old installations that still ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the model as a pickle in a file
# NOTE: only the fitted SVM is stored here; the feature column order (col_names)
# is not part of this file.
joblib.dump(svm_model, 'model_salary.pkl')

['model_salary.pkl']

In [56]:
svm_from_joblib = joblib.load('model_salary.pkl') 

In [57]:
def single_prediction(var,index):
    """Predict the salary band for one row of `var` and return it as a display string.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame with a 'Title' and a 'Location' column.
    index : label
        Index label of the row in `var` to score.

    Returns
    -------
    str
        "450000 - 900000" for a 'High' prediction, "200000 - 450000" for
        'Medium', and "Below 200000" otherwise.

    Notes
    -----
    Relies on the notebook-level names `create_tfidf_vec`, `col_names`,
    `df_upsampled_location` and `svm_from_joblib`.
    """
    # Vectorize only the requested row, so the title features and the location
    # always belong to the same record. (Vectorizing the whole frame and relying
    # on index alignment would always pick up the first row's title.)
    # NOTE(review): create_tfidf_vec fits a new vectorizer on this single title,
    # so these weights are not the IDF weights learned from the training titles.
    # Reusing the fitted training vectorizer would be more faithful - confirm.
    title_vec = create_tfidf_vec(var.loc[[index], 'Title'])
    location = var.loc[index, 'Location']

    # One all-zero feature row laid out exactly like the training columns.
    features = pd.DataFrame(np.zeros((1, len(col_names))), columns=col_names)

    # Copy over the title terms the model knows about; unknown terms are ignored.
    for term in title_vec.columns:
        if term in features.columns:
            features.loc[0, term] = title_vec[term].iloc[0]

    # Set exactly one location indicator, once, outside the term loop. The check
    # is made against the location columns only, so a location string that
    # happens to equal a title term cannot overwrite a title feature.
    location_col = location if location in df_upsampled_location.columns else 'Others'
    features.loc[0, location_col] = 1

    # predict() returns an array with one label per row; take the single label.
    prediction = svm_from_joblib.predict(features)[0]

    # NOTE(review): these display ranges use 200000 / 450000 as cut-offs, while
    # categorize_salary labels the training data with 250000 / 420000 - confirm
    # which boundaries are intended.
    if prediction == 'High':
        return "450000 - 900000"
    elif prediction == 'Medium':
        return "200000 - 450000"
    else:
        return "Below 200000"
    


In [58]:
location = 'Mumbai'
title = 'Analyst'
data = [[location, title]]
df = pd.DataFrame(data, columns = ['Location', 'Title']) 
df

Unnamed: 0,Location,Title
0,Mumbai,Analyst


In [59]:
print(single_prediction(df,0))

Below 200000
