In [1]:
import re
import pickle
import numpy as np
import pandas as pd
import nltk
import pyforest
import texthero as hero
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#importing the data
data = pd.read_csv('../inputs/fake_job_postings.csv')
target = data['fraudulent']
data.drop(['job_id','salary_range','department','benefits'],axis=1,inplace=True)

In [3]:
#removing target values form dataset
data.drop(['fraudulent'],axis=1,inplace=True)

In [4]:
#filling null values 
data['required_education'].fillna('Unspecified',inplace = True)
data['employment_type'].fillna('no_info_about_employment',inplace = True)
data['required_experience'].fillna('experience_not_asked',inplace = True)
data['industry'].fillna('industry_not_given',inplace = True)
data['function'].fillna('function_not_given',inplace = True)

In [5]:
dataset = data.copy()

In [6]:
#dealing catogorical data
cat_cols = ['employment_type','required_experience','required_education','industry','function']
for c in cat_cols:
    encoded = pd.get_dummies(data[c])
    data = pd.concat([data,encoded],axis = 1 )
cat_cols = ['employment_type','required_experience','required_education','industry','function','location','company_profile']
data.drop(cat_cols,axis=1,inplace=True)

In [7]:
#dealing with text data
description = data['title'] + ' ' + data['description']+ ' ' +  data['requirements']

In [8]:
clean_desc = description.pipe(hero.clean)

In [9]:
import nltk
from nltk import stem
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer

w_tokenizer = tokenize.WhitespaceTokenizer()


# Use English stemmer.
stemmer = stem.SnowballStemmer("english")

def stemming(text):
    '''
    input  : text 
    output :  lemmazitzed text
    '''
    return ' '.join([stemmer.stem(word) for word in w_tokenizer.tokenize(text)])
clean_desc = clean_desc.apply(stemming)

In [10]:
def remove_long_texts(text):
    words = [word for word in text.split() if len(word)<21]
    return ' '.join(words)
def remove_digits(text):
    return re.sub(r'[\d\|]', '', text)

clean_desc = clean_desc.apply(remove_long_texts)
clean_desc = clean_desc.apply(remove_digits)

In [11]:
tfidf = TfidfVectorizer( min_df = 0.05)
tfidf_features = tfidf.fit_transform(clean_desc) 
tfidf_vect_df = pd.DataFrame(tfidf_features.todense(), columns = tfidf.get_feature_names())
data = pd.concat([data, tfidf_vect_df], axis = 1)
data.drop(['title','description','requirements'],axis = 1 , inplace = True)

In [12]:
from imblearn.over_sampling import SMOTE

In [13]:
sm = SMOTE(random_state=0)
X_resampled, y_resampled = sm.fit_resample(data.values, target.values)

In [14]:
from sklearn import svm
from sklearn import metrics,model_selection

def score(df,model):
    
    for fold in range(0,5):
        scores = []
        df_train = df[df["kfold"] != fold].reset_index(drop=True)
        df_valid = df[df["kfold"] == fold].reset_index(drop=True)
        
        x_train = df_train.drop(['kfold','fraudulent'],axis=1)
        y_train = df_train.fraudulent.values
        
        
        x_valid = df_valid.drop(['kfold','fraudulent'],axis=1).values
        y_valid = df_valid.fraudulent.values
        
        model.fit(x_train,y_train)
        y_pred = model.predict(x_valid)
        
        scores.append(metrics.roc_auc_score(y_valid,y_pred))
        #print(metrics.confusion_matrix(y_valid,y_pred))

    score = np.mean(scores)   
    return score
    
def best_parameter(df):

    X = df.drop(['kfold','fraudulent'],axis=1)
    Y = df.fraudulent.values
        
    hyper = {'C':[0.001,0.01,10,100,1000],
             'gamma':[0.001,0.01,10,100,1000],
             'kernel':['linear']
            }
    
    gd=model_selection.GridSearchCV(estimator=svm.SVC(),
                                    scoring = 'roc_auc',
                                    param_grid=hyper,
                                    n_jobs=-1,
                                    verbose=True)
    
    gd.fit(X,Y)
    return gd.best_params_

In [15]:
def create_folds(df):
     #create a k-fold column and initialise it with -1
    df["kfold"] = -1
    
    #intialise kfold class
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    #filling kfold columns
    for f,(t_,v_) in enumerate(kf.split(X=df,y=df.fraudulent)):
        df.loc[v_,'kfold'] = f
    return(df)

In [16]:
df = pd.DataFrame(X_resampled,columns=data.columns)
df['fraudulent'] = y_resampled
df = df.sample(frac=1).reset_index(drop = True)

In [17]:
#split the data for train and test
X = df.drop('fraudulent',axis=1)
y = df.fraudulent
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=2020)

<IPython.core.display.Javascript object>

In [18]:
train_data = X_train.copy()
train_data['fraudulent'] = y_train

In [19]:
pickle.dump(tfidf, open('../inputs/tf_idf.pkl', 'wb'))

In [28]:
df = create_folds(df)

In [29]:
#training our training dataset on the best hyperparameter 
model = svm.SVC()
intial_score = score(df,model)
print(f"Intial roc_auc Score is : {intial_score}")

Intial roc_auc Score is : 0.9844242112338889


In [20]:
list(train_data.columns)

(27222, 684)

In [31]:
[34,434]+[343]

[34, 434, 343]

In [21]:
tfidf.transform(['afbhadjgfhjadsgfjadhsgfdsga'])

<1x483 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [23]:
s = pd.Series(train_data.columns)

In [27]:
s.values.reshape(-1,1)

array([['telecommuting'],
       ['has_company_logo'],
       ['has_questions'],
       ['Contract'],
       ['Full-time'],
       ['Other'],
       ['Part-time'],
       ['Temporary'],
       ['no_info_about_employment'],
       ['Associate'],
       ['Director'],
       ['Entry level'],
       ['Executive'],
       ['Internship'],
       ['Mid-Senior level'],
       ['Not Applicable'],
       ['experience_not_asked'],
       ['Associate Degree'],
       ["Bachelor's Degree"],
       ['Certification'],
       ['Doctorate'],
       ['High School or equivalent'],
       ["Master's Degree"],
       ['Professional'],
       ['Some College Coursework Completed'],
       ['Some High School Coursework'],
       ['Unspecified'],
       ['Vocational'],
       ['Vocational - Degree'],
       ['Vocational - HS Diploma'],
       ['Accounting'],
       ['Airlines/Aviation'],
       ['Alternative Dispute Resolution'],
       ['Animation'],
       ['Apparel & Fashion'],
       ['Architecture & Plann