In [1]:
import re
import numpy as np
import pandas as pd
import nltk
import texthero as hero
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#importing the data
data = pd.read_csv('../inputs/fake_job_postings.csv')
target = data['fraudulent']
data.drop(['job_id','salary_range','department','benefits'],axis=1,inplace=True)

In [3]:
#removing target values form dataset
data.drop(['fraudulent'],axis=1,inplace=True)

In [4]:
#filling null values 
data['required_education'].fillna('no_info_about_education',inplace = True)
data['employment_type'].fillna('no_info_about_employment',inplace = True)
data['required_experience'].fillna('experience_not_asked',inplace = True)
data['industry'].fillna('industry_not_given',inplace = True)
data['function'].fillna('function_not_given',inplace = True)

In [5]:
#dealing with company profile feature
for i in range(len(data.company_profile)):
    if data.company_profile[i]=='NaN':
        data.company_profile[i] = 'company_profile_not_given'
    else:
        data.company_profile[i] = 'company_profile_given'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
#dealing catogorical data
from sklearn.preprocessing import LabelEncoder
cat_cols = ['employment_type','required_experience','required_education','industry','function','company_profile']
for c in cat_cols:
    encoded = pd.get_dummies(data[c])
    data = pd.concat([data,encoded],axis = 1 )
cat_cols = ['employment_type','required_experience','required_education','industry','function','location','company_profile']
data.drop(cat_cols,axis=1,inplace=True)
print(data.columns)

Index(['title', 'description', 'requirements', 'telecommuting',
       'has_company_logo', 'has_questions', 'Contract', 'Full-time', 'Other',
       'Part-time',
       ...
       'Quality Assurance', 'Research', 'Sales', 'Science',
       'Strategy/Planning', 'Supply Chain', 'Training', 'Writing/Editing',
       'function_not_given', 'company_profile_given'],
      dtype='object', length=205)


In [7]:
#dealing with text data
description = data['title'] + ' ' + data['description']+ ' ' +  data['requirements']

In [8]:
clean_desc = description.pipe(hero.clean)

In [9]:
import nltk
from nltk import stem
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer

w_tokenizer = tokenize.WhitespaceTokenizer()


# Use English stemmer.
stemmer = stem.SnowballStemmer("english")

def stemming(text):
    '''
    input  : text 
    output :  lemmazitzed text
    '''
    return ' '.join([stemmer.stem(word) for word in w_tokenizer.tokenize(text)])
clean_desc = clean_desc.apply(stemming)

In [10]:
def remove_long_texts(text):
    words = [word for word in text.split() if len(word)<21]
    return ' '.join(words)
def remove_digits(text):
    return re.sub(r'[\d\|]', '', text)

clean_desc = clean_desc.apply(remove_long_texts)
clean_desc = clean_desc.apply(remove_digits)

In [11]:
tfidf = TfidfVectorizer( min_df = 0.05)
tfidf_features = tfidf.fit_transform(clean_desc) 
tfidf_vect_df = pd.DataFrame(tfidf_features.todense(), columns = tfidf.get_feature_names())
data = pd.concat([data, tfidf_vect_df], axis = 1)
data.drop(['title','description','requirements'],axis = 1 , inplace = True)

In [12]:
data['fraudulent'] = target

In [13]:
from sklearn import svm
from sklearn import metrics,model_selection

def score(df,model):
    
    for fold in range(0,5):
        scores = []
        df_train = df[df["kfold"] != fold].reset_index(drop=True)
        df_valid = df[df["kfold"] == fold].reset_index(drop=True)
        
        x_train = df_train.drop(['kfold','fraudulent'],axis=1)
        y_train = df_train.fraudulent.values
        
        
        x_valid = df_valid.drop(['kfold','fraudulent'],axis=1).values
        y_valid = df_valid.fraudulent.values
        
        model.fit(x_train,y_train)
        y_pred = model.predict(x_valid)
        
        scores.append(metrics.roc_auc_score(y_valid,y_pred))
        #print(metrics.confusion_matrix(y_valid,y_pred))

    score = np.mean(scores)   
    return score
    
def best_parameter(df):

    X = df.drop(['kfold','fraudulent'],axis=1)
    Y = df.fraudulent.values
        
    hyper = {'C':[0.001,0.01,10,100,1000],
             'gamma':[0.001,0.01,10,100,1000],
             'kernel':['linear']
            }
    
    gd=model_selection.GridSearchCV(estimator=svm.SVC(),
                                    scoring = 'roc_auc',
                                    param_grid=hyper,
                                    n_jobs=-1,
                                    verbose=True)
    
    gd.fit(X,Y)
    return gd

In [14]:
def create_folds(df):
     #create a k-fold column and initialise it with -1
    df["kfold"] = -1
    

    
    #intialise kfold class
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    #filling kfold columns
    for f,(t_,v_) in enumerate(kf.split(X=df,y=df.fraudulent)):
        df.loc[v_,'kfold'] = f
    return(df)
        

df = data.sample(frac=1).reset_index(drop = True)

df = create_folds(df)

In [15]:
#calculate intial score without hyperparameter tuning 
#using K-stratified K fold
model = svm.SVC()
intial_score = score(df,model)
print(f"Intial roc_auc Score is : {intial_score}")

Intial roc_auc Score is : 0.7335163295222339


In [None]:
#calculter score for test data
x_test = test_df.drop('fraudulent',axis=1).values
y_test = test_df.fraudulent.values
y_pred = model.predict(x_test)
test_score = metrics.roc_auc_score(y_test,y_pred)
print(f'Intial Test score : {test_score}')

#tune hyperparameter and getting best parameters
params = best_parameter(os_df)
print(f"Best parameter are {params}")

#training our training dataset on the best hyperparameter 
model = svm.SVC(**params)

#fit the model for oversampled data
X = os_df.drop('fraudulent',axis=1).values
y = os_df.fraudulent.values
model.fit(X,y)

#calculter score for test data
y_pred = model.predict(x_test)
test_score = metrics.roc_auc_score(y_test,y_pred)
print(f'Test score : {test_score}')


In [21]:
df.head()

AttributeError: 'NoneType' object has no attribute 'head'

Unnamed: 0,telecommuting,has_company_logo,has_questions,Contract,Full-time,Other,Part-time,Temporary,no_info_about_employment,Associate,...,within,without,word,work,world,would,write,written,year,fraudulent
0,0,1,0,0,0,1,0,0,0,0,...,0.000000,0.0,0.0,0.177760,0.000000,0.000000,0.000000,0.000000,0.000000,0
1,0,1,0,0,1,0,0,0,0,0,...,0.000000,0.0,0.0,0.113106,0.244049,0.000000,0.000000,0.000000,0.000000,0
2,0,1,0,0,0,0,0,0,1,0,...,0.000000,0.0,0.0,0.062211,0.000000,0.000000,0.000000,0.000000,0.000000,0
3,0,1,0,0,1,0,0,0,0,0,...,0.075989,0.0,0.0,0.040416,0.087206,0.000000,0.043477,0.035394,0.050145,0
4,0,1,1,0,1,0,0,0,0,0,...,0.000000,0.0,0.0,0.094600,0.000000,0.000000,0.000000,0.000000,0.039124,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,1,1,0,1,0,0,0,0,0,...,0.048356,0.0,0.0,0.102876,0.110989,0.000000,0.000000,0.000000,0.000000,0
17876,0,1,1,0,1,0,0,0,0,0,...,0.052043,0.0,0.0,0.138401,0.000000,0.065267,0.059554,0.000000,0.068687,0
17877,0,0,0,0,1,0,0,0,0,0,...,0.000000,0.0,0.0,0.065732,0.000000,0.000000,0.000000,0.000000,0.054369,0
17878,0,0,1,1,0,0,0,0,0,0,...,0.000000,0.0,0.0,0.056261,0.000000,0.000000,0.000000,0.000000,0.000000,0
