In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing libraries for data exploration and analysis
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt

#Model from SciKit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import feature_selection
from sklearn.impute import SimpleImputer

# Model Evaluations from SciKit Learn
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score


# for displaying graph in the notebook
%matplotlib inline

# [Real or Fake] : Fake Job Description Prediction
*  This dataset contains about 18K job descriptions, of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. The dataset can be used to build classification models that learn to identify fraudulent job descriptions.

### Kaggle
    * https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction

### Data Source 
     * Employment Scam Aegean Dataset 

### Execution Strategy  - An end-to-end Scikit-Learn workflow
     1. Getting the data ready
     2. Handling NaN data and converting categorical data into numeric data
     3. Choosing the right machine learning estimator/algorithm/model for this problem
     4. Fitting the chosen machine learning model to the data and using it to make a prediction
     5. Evaluating a machine learning model
     6. Improving predictions through experimentation (hyperparameter tuning)
     7. Feature Importance Evaluations

In [None]:
# Read the job-postings CSV from the Kaggle input directory and preview the first two rows
job_csv_path = '/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv'
df_job = pd.read_csv(job_csv_path)
df_job.head(2)

# Data Conversion - Handling Missing Data {SimpleImputer}

In [None]:
# Count the missing (NaN) values in each column of the raw data
df_job.isna().sum()

In [None]:
# Review the data type of each column to see which are text and which are numeric
df_job.dtypes

In [None]:
# Text/categorical columns: every missing entry is replaced by the literal "Missing"
cat_imp_feature = [
    "title", "location", "department", "salary_range", "company_profile",
    "description", "requirements", "benefits", "employment_type",
    "required_experience", "required_education", "industry", "function",
]
data_cat_imp = SimpleImputer(strategy="constant", fill_value="Missing")

# Numeric columns: constant strategy with fill_value=None
# (per the scikit-learn docs, numeric data is then filled with 0)
num_imp_feature = ["job_id", "telecommuting", "has_company_logo", "has_questions", "fraudulent"]
data_num_imp = SimpleImputer(strategy="constant", fill_value=None)

# Route each column group to its own imputer
data_imp_trans = ColumnTransformer(transformers=[
    ("data_cat_imp", data_cat_imp, cat_imp_feature),
    ("data_num_imp", data_num_imp, num_imp_feature),
])

# Fit the imputers on the raw frame and keep the imputed array
transformed_data = data_imp_trans.fit_transform(df_job)
transformed_data

In [None]:
# Rebuild a DataFrame from the imputed array.
# The column labels are taken from the same two lists that were handed to the
# ColumnTransformer (categorical group first, numeric group second), so they cannot
# drift out of sync with a separately hand-typed list of names.
df_job_transformed_data = pd.DataFrame(
    transformed_data,
    columns=cat_imp_feature + num_imp_feature,
)

In [None]:
# Preview the first two rows of the imputed DataFrame
df_job_transformed_data.head(2)

In [None]:
# Verify the imputation: count the remaining NaN/missing values per column
df_job_transformed_data.isna().sum()

In [None]:
# Review the column labels of the imputed DataFrame
df_job_transformed_data.columns

In [None]:
# Seed NumPy's global random generator
np.random.seed(42)

# Features (X): every column except the target; label (y): "fraudulent" cast to int
# NOTE(review): the job_id identifier column stays in X as a feature - confirm this is intended
X_trans = df_job_transformed_data.drop(columns="fraudulent")
y_trans = df_job_transformed_data["fraudulent"].astype('int')

# Show the (rows, columns) shapes and the feature column names
X_trans.shape, y_trans.shape, X_trans.columns

# Data Conversion - Encode Categorical Data {OneHotEncoder}

In [None]:
# One-hot encode the categorical/text columns into numeric indicator columns;
# every column not listed in cat_imp_feature is passed through unchanged.
one_hot = OneHotEncoder()
clf_trans = ColumnTransformer([("one_hot", one_hot, cat_imp_feature)], remainder="passthrough")
X_trans_fin = clf_trans.fit_transform(X_trans)

# Display the encoded matrix itself. If the result is a SciPy sparse matrix (the
# OneHotEncoder default output), wrapping it in np.array() only produces a 0-d object
# array around the matrix rather than the encoded values.
X_trans_fin

In [None]:
# Hold out 23% of the rows for testing and train on the remaining 77%
split_parts = train_test_split(X_trans_fin, y_trans, test_size=0.23, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test
tuple(part.shape for part in split_parts)

# Selecting the Right Estimator or Algorithm - Applying Classification Models

In [None]:
# Fit the first model: a Random Forest classifier
# Seed NumPy's global random generator
np.random.seed(42)

# Give the forest its own explicit random_state so the fitted model does not depend on
# whatever state the global NumPy generator is in when this cell happens to run
model_rfm = RandomForestClassifier(random_state=42)

# Fit on the training split (sample_weight is left at its default, None)
model_rfm.fit(X_train, y_train)

In [None]:
# Mean accuracy of the Random Forest on the held-out test split, as a percentage
rfm_accuracy_pct = model_rfm.score(X_test, y_test) * 100
print(f"Fake Job Random Forest Model Accuracy : {rfm_accuracy_pct:.2f}%")

In [None]:
# Predict the labels of the test split with the fitted Random Forest model
# and display the predicted values
y_pred_rfm=model_rfm.predict(X_test)
y_pred_rfm

In [None]:
# Second model: Logistic Regression using the liblinear solver
model_lrm = LogisticRegression(solver='liblinear')

# Fit on the training split (no sample weights)
model_lrm.fit(X_train, y_train)

In [None]:
# Mean accuracy of the Logistic Regression model on the held-out test split, as a percentage
lrm_accuracy_pct = model_lrm.score(X_test, y_test) * 100
print(f"Fake Job Logistic Regression Model Accuracy :{lrm_accuracy_pct:.2f}%")

In [None]:
# Predict the labels of the test split with the fitted Logistic Regression model
# and display the predicted values
y_pred_lrm=model_lrm.predict(X_test)
y_pred_lrm

In [None]:
model_lrm.get_params()

# Applying Metrics
    * To quantify the quality of predictions
    * The accuracy score measures how many labels the model got right out of the total number of predictions

In [None]:
# Accuracy of the Random Forest predictions against the true test labels
rfm_acc = accuracy_score(y_test, y_pred_rfm)
print(f"Accuracy Score ~ :{rfm_acc * 100:.2f}%")

In [None]:
# Precision of the Random Forest predictions against the true test labels
rfm_prec = precision_score(y_test, y_pred_rfm)
print(f"Precision Score~ :{rfm_prec * 100:.2f}%")

In [None]:
# Text classification report for the Random Forest predictions
rfm_report = classification_report(y_test, y_pred_rfm)
print(rfm_report)

In [None]:
# Confusion matrix - compares the labels the Random Forest predicted with the actual
# labels, showing where the model gets confused.
# scikit-learn puts the actual (true) classes on the rows and the predicted classes on
# the columns, so in the heatmap the y-axis is "Actual" and the x-axis is "Predicted".
rfm_data = confusion_matrix(y_test, y_pred_rfm)
sns.set(font_scale=1)
# fmt="d" prints the integer counts in full instead of the default compact float format
sns.heatmap(rfm_data, center=0, annot=True, fmt="d", cmap="YlGnBu");
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label");

# Logistic Regression

In [None]:
# Accuracy of the Logistic Regression predictions against the true test labels
lrm_acc = accuracy_score(y_test, y_pred_lrm)
print(f"Accuracy Score ~ :{lrm_acc * 100:.2f}%")

In [None]:
# Precision of the Logistic Regression predictions against the true test labels
lrm_prec = precision_score(y_test, y_pred_lrm)
print(f"Precision Score~ :{lrm_prec * 100:.2f}%")

In [None]:
# Text classification report for the Logistic Regression predictions
lrm_report = classification_report(y_test, y_pred_lrm)
print(lrm_report)

In [None]:
# Confusion matrix - compares the labels Logistic Regression predicted with the actual
# labels, showing where the model gets confused.
# scikit-learn puts the actual (true) classes on the rows and the predicted classes on
# the columns, so in the heatmap the y-axis is "Actual" and the x-axis is "Predicted".
lrm_data = confusion_matrix(y_test, y_pred_lrm)
sns.set(font_scale=1)
# fmt="d" prints the integer counts in full instead of the default compact float format
sns.heatmap(lrm_data, center=0, annot=True, fmt="d", cmap="YlOrBr");
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label");

# Tuning Hyperparameters ~ LogisticRegression()
 *  The model needs to be tuned, as its ~91% rate of correctly predicted values leaves room for improvement
 *  RandomizedSearchCV samples a given number of candidates from a parameter space with a specified distribution

In [None]:
# Search space for LogisticRegression(): 20 values of C spaced logarithmically
# between 1e-4 and 1e4, with the solver fixed to liblinear
random_grid = dict(
    C=np.logspace(-4, 4, 20),
    solver=["liblinear"],
)

# Display the search space
random_grid

In [None]:
%%time
# Use the random grid to search for optimised hyperparameters for LogisticRegression()
rf = LogisticRegression()

# Randomized search: draw 10 candidates from random_grid and score each one with
# 3-fold cross validation. random_state pins which candidates are drawn, so the
# search gives the same result every time the notebook is re-run.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=10, cv=3, verbose=True, random_state=42)

# Fitting the RandomizedSearchCV model
rf_random.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search
rf_random.best_params_

In [None]:
# Build LogisticRegression() from the hyperparameters the randomized search selected.
# Reading them from rf_random.best_params_ (instead of a value copied by hand from an
# earlier run) keeps this model in step with the search whenever the search result changes.
model_lrm_ideal = LogisticRegression(**rf_random.best_params_, verbose=True)

# Fit the tuned model on the training split
model_lrm_ideal.fit(X_train, y_train)

In [None]:
# Mean accuracy of the tuned LogisticRegression() model on the held-out test split
model_lrm_ideal.score(X_test,y_test)

In [None]:
# Predict the labels of the test split with the tuned LogisticRegression() model
# and display the predicted values
y_pred_lrm_ideal=model_lrm_ideal.predict(X_test)
y_pred_lrm_ideal

In [None]:
# Accuracy of the tuned LogisticRegression() predictions against the true test labels
lrm_ideal_acc = accuracy_score(y_test, y_pred_lrm_ideal)
print(f"Accuracy Score~ :{lrm_ideal_acc * 100:.2f}%")

# Comparing Actual vs Predicted Fraudulent Results

In [None]:
# Put the true test labels next to the Random Forest predictions (rows keep the
# index of y_test) and save the comparison to the Kaggle working directory
df_job_pred = pd.DataFrame({
    "Actual Fraudulent": y_test,
    "Predicted Fraudulent": y_pred_rfm,
})
df_job_pred.to_csv("/kaggle/working/predict.csv")

# Feature Importance Evaluations
* Feature importance assigns a score to each input feature based on how useful it is for prediction.

In [None]:
# Map each ORIGINAL input column to its total Random Forest importance.
# model_rfm was trained on the one-hot encoded matrix, so feature_importances_ holds one
# value per encoded column, not one per raw column. Zipping it against df_job.columns
# would pair the first few encoded columns with unrelated raw column names (including
# the target), so the importances are summed back per source column instead.
encoder = clf_trans.named_transformers_["one_hot"]
# Number of indicator columns each categorical column expanded into, in cat_imp_feature order
category_counts = [len(categories) for categories in encoder.categories_]
# Columns that bypassed the encoder; they follow the encoded block in the output matrix
passthrough_cols = [col for col in X_trans.columns if col not in cat_imp_feature]

# Cut the importance vector at the end of every encoded column group;
# the final piece belongs to the passthrough columns.
*encoded_chunks, passthrough_importances = np.split(
    model_rfm.feature_importances_, np.cumsum(category_counts)
)
assert len(passthrough_importances) == len(passthrough_cols), \
    "importance vector does not line up with the encoded feature layout"

feature_dict = {
    col: float(chunk.sum()) for col, chunk in zip(cat_imp_feature, encoded_chunks)
}
feature_dict.update(zip(passthrough_cols, map(float, passthrough_importances)))
feature_dict

In [None]:
# Visualise the importance score of each feature as a line chart
feature_df = pd.DataFrame(feature_dict, index=[0])
importance_by_feature = feature_df.T  # one row per feature
importance_by_feature.plot.line(
    title="EmploymentScamAegean Dataset - Feature Importance",
    legend=False,
    color='orange',
);