In [1]:
import pandas as pd
import pandas_profiling
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import os
import random

seed=1
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)

df = pd.read_csv('fake_job_postings.csv')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [3]:
# pandas_profiling.ProfileReport(df)

In [4]:
# df['title'] = df['title'].fillna('Unknown')
# df['location'] = df['location'].fillna('Unknown')
# df['department'] = df['department'].fillna('Unknown')
# df['company_profile'] = df['company_profile'].fillna('Unknown')
# df['requirements'] = df['requirements'].fillna('University degree required.')
# df['benefits'] = df['benefits'].fillna('See job description')
# df['employment_type'] = df['employment_type'].fillna('Other')
# df['required_experience'] = df['required_experience'].fillna('Entry level')
# df['required_education'] = df['required_education'].fillna('Bachelor\'s Degree')
# df['industry'] = df['industry'].fillna('Unknown')
# df['function'] = df['function'].fillna('Unknown')


# df.info()

In [5]:
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [6]:
df['function'].value_counts()
stem = ['Information Technology', 'Engineering', 'Data Analyst', 'Science', 'Business Analyst', 'Quality Assurance']
non_stem = ['Sales', 'Customer Service', 'Marketing', 'Administrative', 'Design', 'Human Resources', 'Unknown']
df_stem = df[df['function'].isin(stem)]
df_non_stem = df[df['function'].isin(non_stem)]
print(df_stem.shape)
print(df_non_stem.shape)

df_non_stem['fraudulent'].value_counts()

(3388, 18)
(4702, 18)


0    4453
1     249
Name: fraudulent, dtype: int64

In [7]:
import matplotlib
import tkinter
matplotlib.use('TkAgg')

stem_zero = df_stem['fraudulent'].value_counts()[0]
stem_one = df_stem['fraudulent'].value_counts()[1]
non_stem_zero = df_non_stem['fraudulent'].value_counts()[0]
non_stem_one = df_non_stem['fraudulent'].value_counts()[1]

labels = 'Real', 'Fake'
sizes = [stem_zero + non_stem_zero, stem_one + non_stem_one]
explode = (0, 0.1)  

fig1, ax1 = plt.subplots(figsize = (5, 5))
ax1.pie(sizes, explode = explode, labels = labels, autopct = '%1.1f%%',shadow = True, startangle = 0)
ax1.axis('equal')  
plt.title('Percentage of Real and Fake Job Postings')
plt.legend()
plt.show()

In [8]:
df_num = df[['telecommuting','has_company_logo','has_questions','fraudulent','salary_range']]
df_cat = df[['title', 'location','company_profile', 'requirements','employment_type',
       'required_experience', 'required_education', 'industry', 'function']]

#checking outliers
plt.figure(figsize = [8,4])
sns.boxplot(data = df_num)
plt.show()

In [9]:
#Removing Outliers from columns
df_num = df_num[df_num['telecommuting'] < 0.9 ]
df_num = df_num[df_num['fraudulent'] < 0.9 ]
df_num = df_num[df_num['has_company_logo'] > 0.1 ]
df_num

Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,salary_range
0,0,1,0,0,
1,0,1,0,0,
2,0,1,0,0,
3,0,1,0,0,
4,0,1,1,0,
...,...,...,...,...,...
17872,0,1,0,0,
17873,0,1,0,0,
17875,0,1,1,0,
17876,0,1,1,0,


In [10]:
#Dropping records with null values
df.dropna(axis = 0, how = 'any', inplace = True)
df

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
6,7,Head of Content (m/f),"DE, BE, Berlin",ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0
15,16,VP of Sales - Vault Dragon,"SG, 01, Singapore",Sales,120000-150000,Jungle Ventures is the leading Singapore based...,About Vault Dragon Vault Dragon is Dropbox for...,Key Superpowers3-5 years of high-pressure sale...,"Basic: SGD 120,000Equity negotiable for a rock...",0,1,1,Full-time,Executive,Bachelor's Degree,Facilities Services,Sales,0
23,24,"Vice President, Sales and Sponsorship (Busines...","US, CA, Carlsbad",Businessfriend.com,100000-120000,"WDM Group is an innovative, forward thinking d...",#URL_eda2500ddcedb60957fcd7f5b164e092966f8c4e8...,"Job Requirements:A reputation as a ""go-getter""...",Businessfriend will offer a competitive six fi...,0,1,0,Full-time,Executive,Unspecified,Internet,Sales,0
98,99,IC&E Technician,"US, , Stocton, CA",Oil & Energy,95000-115000,...,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...","QualificationsKnowledge, Skills &amp; Abilitie...",BENEFITSWhat is offered:Competitive compensati...,0,1,1,Full-time,Mid-Senior level,High School or equivalent,Oil & Energy,Other,1
102,103,Marketing Administrator,"GB, WAR, Coventry",Marketplace,15000-18000,Renewable Energy and Environmental Protection ...,The job is to support the growth of the #URL_9...,"Computer literateAble to work with HTML, altho...",For a suitably motivated and success orientate...,1,1,0,Full-time,Entry level,Bachelor's Degree,Internet,Marketing,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17734,17735,Customer Service Representative,"US, TX, Houston",Customer Service,30000-50000,Gary Cartwright established Cartwright Propert...,We are Seeking a candidate whose core values i...,Home Computer with Internet AccessBasic Comput...,"Weekly pay, PTO, Paid Holidays, 401 k",1,1,0,Full-time,Entry level,High School or equivalent,Real Estate,Customer Service,1
17759,17760,Customer Service Representative,"US, NY, New York",Customer Service,30000-50000,Gary Cartwright established Cartwright Propert...,We are Seeking a candidate whose core values i...,Home Computer with Internet AccessBasic Comput...,"Weekly pay, PTO, Paid Holidays, 401 k",1,1,0,Full-time,Entry level,High School or equivalent,Real Estate,Customer Service,1
17813,17814,Customer Service Representative,"US, PA, Philadelphia",Customer Service,30000-50000,Gary Cartwright established Cartwright Propert...,We are Seeking a candidate whose core values i...,Home Computer with Internet AccessBasic Comput...,"Weekly pay, PTO, Paid Holidays, 401 k",1,1,0,Full-time,Entry level,High School or equivalent,Real Estate,Customer Service,1
17849,17850,Communication Designer,"US, CA, San Francisco",Design,80000-100000,Balanced Labs exists to provide accountants an...,Balanced Labs exists to improve the lives of a...,"Specifically, we’re looking for:5+ years of vi...","We offer great salaries, share options, and a ...",0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Design,0


In [11]:
# Plots to see the distribution of the continuous features individually

plt.figure(figsize = (25, 20))
plt.subplot(3, 3, 1)
plt.hist(df.employment_type, color = 'orange', edgecolor = 'black', alpha = 0.7)
plt.xlabel('\nEmployment type')

plt.subplot(3, 3, 2)
plt.hist(df.required_experience, color = 'lightblue', edgecolor = 'black', alpha = 0.7)
plt.xlabel('\nRequired Experience')

plt.subplot(3, 3, 3)
plt.hist(df.fraudulent, color = 'red', edgecolor = 'black', alpha = 0.7)
plt.xlabel('\nFraud')
plt.show()

In [12]:
# form correlation matrix

dt_copy = df.copy()
dt_copy['title'] = dt_copy.title.astype("category").cat.codes
dt_copy['location'] = dt_copy.location.astype("category").cat.codes
dt_copy['department'] = dt_copy.department.astype("category").cat.codes
dt_copy['salary_range'] = dt_copy.salary_range.astype("category").cat.codes
dt_copy['company_profile'] = dt_copy.company_profile.astype("category").cat.codes
dt_copy['description'] = dt_copy.description.astype("category").cat.codes
dt_copy['requirements'] = dt_copy.requirements.astype("category").cat.codes
dt_copy['benefits'] = dt_copy.benefits.astype("category").cat.codes
dt_copy['employment_type'] = dt_copy.employment_type.astype("category").cat.codes
dt_copy['required_experience'] = dt_copy.required_experience.astype("category").cat.codes
dt_copy['required_education'] = dt_copy.required_education.astype("category").cat.codes
dt_copy['industry'] = dt_copy.industry.astype("category").cat.codes
dt_copy['function'] = dt_copy.function.astype("category").cat.codes
matrix = dt_copy.corr(method ='pearson')
print("Correlation Matrix: ")
print(matrix)

Correlation Matrix: 
                       job_id     title  location  department  salary_range  \
job_id               1.000000  0.045365  0.029042    0.087416      0.051374   
title                0.045365  1.000000 -0.041537    0.140145      0.017144   
location             0.029042 -0.041537  1.000000   -0.094293      0.102522   
department           0.087416  0.140145 -0.094293    1.000000      0.067193   
salary_range         0.051374  0.017144  0.102522    0.067193      1.000000   
company_profile     -0.046717  0.008708 -0.262855    0.063051     -0.048959   
description          0.048169  0.097049 -0.129389   -0.125230      0.012196   
requirements         0.058185  0.055661 -0.065886   -0.042907     -0.097373   
benefits             0.019649 -0.060901 -0.160872    0.071718     -0.031215   
telecommuting       -0.003304  0.043511  0.124558    0.044270     -0.043754   
has_company_logo    -0.040015  0.016629 -0.109074    0.092284     -0.041631   
has_questions        0.004496 -

In [13]:
corr_matrix=dt_copy.corr(method ='pearson')
corr_matrix["fraudulent"].sort_values(ascending=False)

fraudulent             1.000000
location               0.268042
industry               0.121915
telecommuting          0.094499
description            0.082930
employment_type        0.049304
benefits               0.047048
salary_range           0.039958
required_education     0.021955
requirements           0.020671
has_questions          0.005072
company_profile       -0.004978
required_experience   -0.063843
title                 -0.064026
has_company_logo      -0.073400
job_id                -0.089942
function              -0.132911
department            -0.134259
Name: fraudulent, dtype: float64

In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import feature_selection
from sklearn.impute import SimpleImputer

df['department'] = df['department'].fillna(df['department'].mode()[0])

data_cat_imp = SimpleImputer(strategy = "constant", fill_value = "Missing")
cat_imp_feature = ["title", "location", "department", "salary_range", "company_profile", "description", "requirements", 
                   "benefits", "employment_type", "required_experience", "required_education", "industry", "function"]

# Filling the Numerical values through existing value
data_num_imp = SimpleImputer(strategy = "constant", fill_value = None)
num_imp_feature = ["job_id", "telecommuting", "has_company_logo", "has_questions", "fraudulent"]

# Transforming into column
data_imp_trans = ColumnTransformer([("data_cat_imp", data_cat_imp, cat_imp_feature),
                                 ("data_num_imp", data_num_imp, num_imp_feature)])

# Transforming and assigning the data
transformed_data = data_imp_trans.fit_transform(df)
transformed_data
#Transforming the data into data frame
df_job_transformed_data = pd.DataFrame(transformed_data,
                         columns=["title", "location", "department", "salary_range", "company_profile", "description",
                                  "requirements", "benefits", "employment_type", "required_experience", "required_education",
                                  "industry", "function", "job_id", "telecommuting", "has_company_logo", "has_questions",
                                  "fraudulent"])

X_trans = df_job_transformed_data.drop("fraudulent", axis = 1)
y_trans = df_job_transformed_data.fraudulent
y_trans = y_trans.astype('int')

X_trans.shape, y_trans.shape, X_trans.columns

# Instantation of One Hot Encoder for categorical data tarnsformatio into Numeric 
one_hot = OneHotEncoder()
clf_trans = ColumnTransformer([("one_hot", one_hot, cat_imp_feature)], remainder = "passthrough")
X_trans_fin = clf_trans.fit_transform(X_trans)
np.array(X_trans_fin)

X_train, X_test, y_train, y_test = train_test_split(X_trans_fin, y_trans, test_size=0.30, random_state = 1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((541, 3563), (233, 3563), (541,), (233,))

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

model_rfm = RandomForestClassifier(random_state = 42)
model_rfm.fit(X_train, y_train)
y_pred_rfm = model_rfm.predict(X_test)
rfc_accuracy = model_rfm.score(X_test, y_test)
precision = precision_score(y_test, y_pred_rfm)
recall = recall_score(y_test, y_pred_rfm)
 
print(f"Accuracy: {rfc_accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")

Accuracy: 96.14%
Precision: 100.00%
Recall: 60.87%


In [16]:
#hyperparameter tuning
n_trees = [10, 50, 100, 200, 300]
for i in n_trees:
    ran_for = RandomForestClassifier(n_estimators = i)
    ran_for.fit(X_train, y_train)
    pred = ran_for.predict(X_test)
    
    print('Number of trees: {}'.format(i))
    #Each time of prediction,the accuracy is measured
    correct_pred = 0
    for j, k in zip(y_test, pred):
        if j == k:
            correct_pred += 1
    print('Correct predictions: {}'.format(correct_pred/len(y_test) *100))
    print('-------------------------------------------------------------------')

Number of trees: 10
Correct predictions: 94.84978540772532
-------------------------------------------------------------------
Number of trees: 50
Correct predictions: 96.99570815450643
-------------------------------------------------------------------
Number of trees: 100
Correct predictions: 96.56652360515021
-------------------------------------------------------------------
Number of trees: 200
Correct predictions: 96.56652360515021
-------------------------------------------------------------------
Number of trees: 300
Correct predictions: 96.56652360515021
-------------------------------------------------------------------


In [17]:
from sklearn import metrics
import scikitplot as skplt

confusion_matrix = metrics.confusion_matrix(y_test, y_pred_rfm)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

cm_display.plot()
plt.title('Confusion Matrix - Random Forest')
plt.show()

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
import scikitplot as skplt

fpr, tpr, thresholds = roc_curve(y_test, y_pred_rfm)
# skplt.metrics.plot_roc(np.array(y_test), y_pred_rfm)


plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Random Forest')
plt.show()
print(f'AUC score - Random Forest: {roc_auc_score(y_test, y_pred_rfm)}')

AUC score - Random Forest: 0.8043478260869565


In [19]:
feature_dict = dict(zip((df.columns), list(model_rfm.feature_importances_)))

log_val = []
for i in feature_dict.values():
    log_val.append(np.log(i))

log_val = np.nan_to_num(log_val, neginf=0)
log_val = [i * (-1) for i in log_val]
names = list(feature_dict.keys())
dictionary = dict(zip(names, log_val))

sorted_dict = dict(sorted(dictionary.items(), key = lambda item: item[1], reverse = True))
names = []
values = []
for k, v in sorted_dict.items():
    if v != -0.0:
        names.append(k)
        values.append(v)
        
plt.figure(figsize = (9, 5))
plt.barh(range(len(values)), values, tick_label=names)
plt.title('Feature importance')
plt.show()

  log_val.append(np.log(i))


In [21]:
#Ensemble Methods
#Averaging Method

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# initializing all the model objects with default parameters
model_1 = LogisticRegression()
model_2 = xgb.XGBRegressor()
model_3 = RandomForestRegressor()
model_4 = svm.SVC(kernel = 'linear', random_state = 0, C=1.0)
 
# training all the model on the training dataset
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)
model_4.fit(X_train, y_train)
 
# predicting the output on the validation dataset
pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)
pred_4 = model_4.predict(X_test)
 
# final prediction after averaging on the prediction of all 3 models
pred_final = (pred_1 + pred_2 + pred_3 + pred_4) / 4.0

# printing the mean squared error between real value and predicted value
print("Averaging Method: MSE")
print(mean_squared_error(y_test, pred_final))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Averaging Method: MSE
0.01709889984954859


In [22]:
#Max Voting Method

from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import svm

# initializing all the model objects with default parameters
model_1 = LogisticRegression()
model_2 = XGBClassifier()
model_3 = RandomForestClassifier()
model_4 = svm.SVC(kernel = 'linear', random_state = 0, C=1.0)

# Making the final model using voting classifier
final_model = VotingClassifier(
    estimators=[('lr', model_1), ('xgb', model_2), ('rf', model_3), ('svm', model_4)], voting='hard')
 
# training all the model on the train dataset
final_model.fit(X_train, y_train)
 
# predicting the output on the test dataset
pred_final = final_model.predict(X_test)
 
rfc_accuracy = final_model.score(X_test, y_test)
precision = precision_score(y_test, pred_final)
recall = recall_score(y_test, pred_final)
 
# printing log loss between actual and predicted value
print("Max Voting Method: Log Loss")
print(log_loss(y_test, pred_final))
print(f"Accuracy: {rfc_accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Max Voting Method: Log Loss
1.3341158264128596
Accuracy: 96.14%
Precision: 100.00%
Recall: 60.87%


In [None]:
# #Stacking Method

# from vecstack import stacking

# # initializing all the base model objects with default parameters
# model_1 = LogisticRegression()
# model_2 = xgb.XGBRegressor()
# model_3 = RandomForestRegressor()
 
# # putting all base model objects in one list
# all_models = [model_1, model_2, model_3]
 
# # computing the stack features
# s_train, s_test = stacking(all_models, X_train, X_test,
#                            y_train, regression=True, n_folds=4)
 
# # initializing the second-level model
# final_model = model_1
 
# # fitting the second level model with stack features
# final_model = final_model.fit(s_train, y_train)
 
# # predicting the final output using stacking
# pred_final = final_model.predict(X_test)
 
# # printing the mean squared error between real value and predicted value
# print(mean_squared_error(y_test, pred_final))

In [None]:
# #Splitting between train data into training and validation dataset
# X_train, X_test, y_train, y_test = train_test_split(X_trans_fin, y_trans, test_size=0.30)
 
# # performing the train test and validation split
# train_ratio = 0.70
# validation_ratio = 0.20
# test_ratio = 0.10
 
# # performing train test split
# x_train, x_test, y_train, y_test = train_test_split(
#     X_trans_fin, y_trans, test_size=1 - train_ratio)
 
# # performing test validation split
# x_val, x_test, y_val, y_test = train_test_split(
#     x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

# # initializing all the base model objects with default parameters
# model_1 = LogisticRegression()
# model_2 = xgb.XGBRegressor()
# model_3 = RandomForestRegressor()
 
# # training all the model on the train dataset
 
# # training first model
# model_1.fit(x_train, y_train)
# val_pred_1 = model_1.predict(x_val)
# test_pred_1 = model_1.predict(x_test)
 
# # converting to dataframe
# val_pred_1 = pd.DataFrame(val_pred_1)
# test_pred_1 = pd.DataFrame(test_pred_1)
 
# # training second model
# model_2.fit(x_train, y_train)
# val_pred_2 = model_2.predict(x_val)
# test_pred_2 = model_2.predict(x_test)
 
# # converting to dataframe
# val_pred_2 = pd.DataFrame(val_pred_2)
# test_pred_2 = pd.DataFrame(test_pred_2)
 
# # training third model
# model_3.fit(x_train, y_train)
# val_pred_3 = model_1.predict(x_val)
# test_pred_3 = model_1.predict(x_test)
 
# # converting to dataframe
# val_pred_3 = pd.DataFrame(val_pred_3)
# test_pred_3 = pd.DataFrame(test_pred_3)
 
# # concatenating validation dataset along with all the predicted validation data (meta features)
# df_val = pd.concat([x_val, val_pred_1, val_pred_2, val_pred_3], axis=1)
# df_test = pd.concat([x_test, test_pred_1, test_pred_2, test_pred_3], axis=1)
 
# # making the final model using the meta features
# final_model = LinearRegression()
# final_model.fit(df_val, y_val)
 
# # getting the final output
# final_pred = final_model.predict(df_test)
 
# #printing the mean squared error between real value and predicted value
# print(mean_squared_error(y_test, pred_final))

In [None]:
# from sklearn.ensemble import BaggingRegressor

# # initializing the bagging model using XGboost as base model with default parameters
# model = BaggingRegressor(base_estimator=xgb.XGBRegressor())
 
# # training model
# model.fit(X_train, y_train)
 
# # predicting the output on the test dataset
# pred = model.predict(X_test)
 
# # printing the mean squared error between real value and predicted value
# print(mean_squared_error(y_test, pred))