In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import sqlite3
from sqlite3 import Error
import csv
import matplotlib.pyplot as plt
import seaborn as sns


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [33]:
def create_connection(db_file, delete_db=False):
    """Open a SQLite connection to ``db_file`` with foreign keys switched on.

    Args:
        db_file: Database path passed to ``sqlite3.connect``.
        delete_db: When true and a file already exists at ``db_file``, that
            file is removed first so the connection starts from an empty
            database.

    Returns:
        The connection object. sqlite3 errors are printed instead of raised;
        if ``sqlite3.connect`` itself failed the return value is ``None``.
    """
    import os

    should_reset = delete_db and os.path.exists(db_file)
    if should_reset:
        os.remove(db_file)

    connection = None
    try:
        connection = sqlite3.connect(db_file)
        # Foreign-key enforcement is a per-connection setting in SQLite.
        connection.execute("PRAGMA foreign_keys = 1")
    except Error as exc:
        print(exc)
    return connection


def create_table(conn, create_table_sql, drop_table_name=None):
    """Run a CREATE TABLE statement, optionally dropping a table first.

    Args:
        conn: Open sqlite3 connection.
        create_table_sql: Complete ``CREATE TABLE`` statement to execute.
        drop_table_name: Optional unqualified table name. When given, that
            table is dropped (if it exists) before ``create_table_sql`` runs.

    Returns:
        None. sqlite3 errors from either statement are printed, not raised.
    """
    if drop_table_name:
        # Quote the identifier (doubling any embedded quote) so that names
        # which are SQL keywords or contain spaces are dropped correctly
        # instead of producing a syntax error that leaves the old table behind.
        quoted_name = '"%s"' % str(drop_table_name).replace('"', '""')
        try:
            c = conn.cursor()
            c.execute("DROP TABLE IF EXISTS %s" % quoted_name)
        except Error as e:
            print(e)

    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)
        
def execute_sql_statement(sql_statement, conn):
    """Execute ``sql_statement`` on ``conn`` and return every result row.

    Args:
        sql_statement: SQL text to run.
        conn: Open sqlite3 connection.

    Returns:
        List of row tuples as produced by ``cursor.fetchall()``.
    """
    cursor = conn.cursor()
    return cursor.execute(sql_statement).fetchall()

In [4]:
# Path of the job-postings CSV; the loading cell below opens it by this name.
filename = "fake_job_postings.csv"

In [5]:
# Read every record of the CSV into memory.
# newline='' is what the csv module expects from its file object (it then
# handles line breaks inside quoted fields itself), and the explicit encoding
# keeps the non-ASCII text in the data from depending on the platform default.
with open(filename, newline='', encoding='utf-8') as file:
    raw_data = list(csv.reader(file, delimiter=","))
raw_data = raw_data[1:]  # drop the header row
# sqlite3 parameter binding takes sequences; tuples keep the records immutable.
raw_data1 = [tuple(i) for i in raw_data]

In [6]:
# raw_data = []
# with open(filename) as file:
#     for line in file:
#         if not line.strip():
#             continue
#         raw_data.append(line.strip())

# lst = [i.split(',') for i in raw_data]
# lst = lst[1:]
# lst1 = [tuple(i) for i in lst]

In [7]:
import os

db_file = 'raw_data.db'
# delete_db=True makes create_connection remove a leftover file first, so the
# raw database is rebuilt from scratch on every run.
conn = create_connection(db_file, delete_db=True)

# Schema of the staging table: one row per CSV record, with the columns in the
# order insert_raw_data binds them. job_id is the primary key; the flag
# columns and fraudulent are declared integer, everything else is text.
create_table_sql1 = """
    CREATE TABLE IF NOT EXISTS [raw_data] (
        [job_id] integer not null primary key,
        [title] text,
        [location] text,
        [department] text,
        [salary_range] text,
        [company_profile] text,
        [description] text,
        [requirements] text,
        [benefits] text,
        [telecommuting] integer,
        [has_company_logo] integer,
        [has_questions] integer,
        [employment_type] text,
        [required_experience] text,
        [required_education] text,
        [industry] text,
        [function] text,
        [fraudulent] integer
    );
    """

def insert_raw_data(conn, values):
    """Insert a single record into ``raw_data``.

    Args:
        conn: Open sqlite3 connection holding the ``raw_data`` table.
        values: Sequence of 18 values, one per column in the order listed in
            the INSERT statement below.

    Returns:
        ``lastrowid`` of the cursor that performed the insert.
    """
    insert_sql = ''' INSERT INTO raw_data(job_id,title,location,
                department,salary_range,company_profile,description,
                requirements,benefits,telecommuting,has_company_logo,
                has_questions,employment_type,required_experience,
                required_education,industry,function,fraudulent) 
                VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) '''
    return conn.cursor().execute(insert_sql, values).lastrowid

# Create the staging table and load every CSV record; the connection's
# context manager commits when the block finishes without an error.
with conn:
    create_table(conn, create_table_sql1)
    for record in raw_data1:
        insert_raw_data(conn, record)

In [8]:
db_file_norm = 'normalized_data.db'
# delete_db=True makes create_connection remove a leftover file first, so the
# normalized database is rebuilt from scratch on every run.
conn1 = create_connection(db_file_norm, delete_db=True)

In [9]:
# Count postings per (company_profile, has_company_logo) pair, ordered by the
# profile text, to see how the two columns relate before building a company table.
sql_statement = 'SELECT distinct company_profile, has_company_logo,count(job_id) from raw_data group by company_profile,has_company_logo order by company_profile'
df = pd.read_sql_query(sql_statement, conn)
display(df)

Unnamed: 0,company_profile,has_company_logo,count(job_id)
0,,0,2622
1,,1,686
2,Value Added Team of Creative ProfessionalsNet...,1,1
3,"""Only stupid questions create wealth"" - Gary ...",1,1
4,"""Our mission to our clients is to preserve the...",1,64
...,...,...,...
1706,Το #URL_bb79581a561837ad604b8fc4ab629753b36407...,1,4
1707,"УУРРАА - технологическая компания, работающая ...",1,9
1708,“News360 is Changing the Content Delivery Game...,1,1
1709,"“No surveys, social data can answer your quest...",1,1


In [10]:
# Profile texts from the grouped result, skipping the first two rows (in the
# output above those are the blank-profile groups).
# NOTE(review): the hard-coded 2 relies on the ORDER BY placing exactly two
# blank-profile rows first — confirm if the data changes.
l1 = df.iloc[2:,0].values
l1.shape

(1709,)

In [11]:
# If this matches l1.shape, no profile text appears in more than one group,
# i.e. each non-blank profile comes with a single has_company_logo value.
np.unique(l1).shape

(1709,)

In [12]:
# Count postings per department value; only the first 50 groups are displayed.
sql_statement = 'SELECT distinct department, count(job_id) from raw_data group by department'
df = pd.read_sql_query(sql_statement, conn)
df.head(50)

Unnamed: 0,department,count(job_id)
0,,11547
1,,6
2,\tCorporate Shared Services,1
3,Lower Level Management,1
4,Marketing,1
5,Moni Technologies,1
6,R&D,1
7,(Consultant),1
8,.NET,1
9,.net Development,1


- title - title
- department - distinct department
- salary - id, distinct salary(min max)
- description - description
- requirements - id, req
- benefits - id,benefits

- telecommuting
- has_company_logo
- has_questions
- employment_type - 
- required_experience
- required_education
- industry


- function
- fraudulent
------ job_title_loc: job_id, title, locid, titleid, salaryid
- company table: companyid, company_profile, has_company_logo
- location table: locid,location - segregated
- jobid - companyid,  ###company_profile


- desc and company change
- posting: jobid,companyid, titleid,locID,deptID,salaryID, telecommuting,has_questions, employmentid, req_exp_id,req_edu_id,industryid,functionID, requirementID, benefitID,fraudulent



- DescriptionID, reqID, BenefitID, telecommuting, has_questions,employment_typeID,required_experience,required_education,industry,function
- 

In [13]:
# with conn:
#     sql_statement = 'SELECT distinct title from raw_data'
#     value_profile = execute_sql_statement(sql_statement, conn)

# Presumably a sanity check for records whose fields are all blank.
# NOTE(review): in SQLite `+` is arithmetic addition (string concatenation is
# `||`), so the left-hand side is a number and, as far as I can tell, can
# never equal the text ' '. A count of 0 therefore does not show that no
# blank records exist — confirm the intent before relying on it.
sql_statement = """
    SELECT count(*) FROM raw_data WHERE (job_id+title+location+department+salary_range+company_profile+description+requirements+benefits+telecommuting+has_company_logo+has_questions+employment_type+required_experience+required_education+industry+function+fraudulent) = ' '
    """

df = pd.read_sql_query(sql_statement, conn)
df

Unnamed: 0,count(*)
0,0


In [36]:
# Collect every distinct value of the function column from the raw table.
# There is no `!= ''` filter here, so a blank value, if present, is kept.
with conn: 
    sql_statement = "SELECT distinct function from raw_data"
    value_profile = execute_sql_statement(sql_statement, conn)

# Lookup table for the normalized database: one row per function value,
# identified by an integer functionID.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [function] (
        [functionID] integer not null primary key,
        [function] text not null
    );
    """

def insert_profile(conn, values):
    """Insert one row into the ``function`` table.

    Args:
        conn: Open sqlite3 connection holding the ``function`` table.
        values: One-element sequence containing the function text.

    Returns:
        ``lastrowid`` of the cursor that performed the insert.
    """
    insert_sql = ''' INSERT INTO function(function) 
                VALUES(?) '''
    return conn.cursor().execute(insert_sql, values).lastrowid

   

# Drop and recreate the function table, then add one row per distinct value;
# the connection's context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2, 'function')
    for function_row in value_profile:
        insert_profile(conn1, function_row)

In [15]:
# Distinct non-blank values of the fraudulent flag in the raw table.
with conn: 
    sql_statement = "SELECT distinct fraudulent from raw_data where fraudulent != ''"
    value_profile = execute_sql_statement(sql_statement, conn)

# Single-column lookup table; the flag value itself is the primary key.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [fraudulent] (
        [fraudulent] integer not null primary key
    );
    """

def insert_profile(conn, values):
    """Insert one row into the ``fraudulent`` table.

    Args:
        conn: Open sqlite3 connection holding the ``fraudulent`` table.
        values: One-element sequence containing the flag value.

    Returns:
        ``lastrowid`` of the cursor that performed the insert.
    """
    insert_sql = ''' INSERT INTO fraudulent(fraudulent) 
                VALUES(?) '''
    return conn.cursor().execute(insert_sql, values).lastrowid

   

# Drop and recreate the fraudulent table, then add one row per distinct flag;
# the connection's context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2, 'fraudulent')
    for flag_row in value_profile:
        insert_profile(conn1, flag_row)

In [16]:
# List the tables currently present in the normalized database.
sql_show_tables = """SELECT name FROM sqlite_master  
WHERE type='table'"""
res = execute_sql_statement(sql_show_tables,conn1)
print(res)

[('function',), ('fraudulent',)]


In [17]:
# with conn1: 
#     sql_statement = "DROP table Title"
#     execute_sql_statement(sql_statement, conn1)

In [18]:
# company table: companyid, company_profile, has_company_logo

# Distinct (profile, logo flag) pairs; postings with a blank profile are left out.
with conn: 
    sql_statement = "SELECT distinct company_profile,has_company_logo from raw_data where company_profile != ''"
    value_profile = execute_sql_statement(sql_statement, conn)

# One row per pair, identified by an integer CompanyID.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [Company] (
        [CompanyID] integer not null primary key,
        [company_profile] text not null,
        [has_company_logo] integer not null
    );
    """

def insert_profile(conn, values):
    """Bulk-insert rows into the ``Company`` table.

    Args:
        conn: Open sqlite3 connection holding the ``Company`` table.
        values: Iterable of ``(company_profile, has_company_logo)`` pairs.

    Returns:
        The cursor's ``lastrowid`` attribute after the ``executemany`` call.
    """
    insert_sql = ''' INSERT INTO Company(company_profile,has_company_logo) 
                VALUES(?,?) '''
    return conn.cursor().executemany(insert_sql, values).lastrowid

   

# Drop and recreate Company, then bulk-insert all pairs; the connection's
# context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2,'company')
    insert_profile(conn1,value_profile)

In [19]:
# company table: companyid, company_profile, has_company_logo

# NOTE(review): this cell repeats the Company cell directly above it; because
# the table is dropped and rebuilt each time, running both yields the same
# table — consider removing one of them.
with conn: 
    sql_statement = "SELECT distinct company_profile,has_company_logo from raw_data where company_profile != ''"
    value_profile = execute_sql_statement(sql_statement, conn)

# One row per (profile, logo flag) pair, identified by an integer CompanyID.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [Company] (
        [CompanyID] integer not null primary key,
        [company_profile] text not null,
        [has_company_logo] integer not null
    );
    """

def insert_profile(conn, values):
    """Bulk-insert rows into the ``Company`` table.

    Args:
        conn: Open sqlite3 connection holding the ``Company`` table.
        values: Iterable of ``(company_profile, has_company_logo)`` pairs.

    Returns:
        The cursor's ``lastrowid`` attribute after the ``executemany`` call.
    """
    insert_sql = ''' INSERT INTO Company(company_profile,has_company_logo) 
                VALUES(?,?) '''
    return conn.cursor().executemany(insert_sql, values).lastrowid

   

# Drop and recreate Company, then bulk-insert all pairs; the connection's
# context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2,'company')
    insert_profile(conn1,value_profile)

In [22]:
# location table: locid,location - segregated

# Every distinct raw location string (no blank filter, so '' is included if present).
with conn: 
    sql_statement = "SELECT distinct location from raw_data"
    value_profile = execute_sql_statement(sql_statement, conn)
    
def _split_location(location):
    """Split a raw ``"country, state, city"`` string into a 3-tuple.

    Each part is stripped of surrounding whitespace; parts that are missing
    or blank come back as ``''``. Only the first three comma-separated parts
    are used.
    """
    parts = [part.strip() for part in location.split(',')]
    # Pad short values such as "US" or "US, NY". The previous code indexed
    # the third part whenever there was more than one, which raised
    # IndexError for two-part locations.
    parts += [''] * (3 - len(parts))
    return tuple(parts[:3])


# One (country, state, city) tuple per distinct location value.
list_loc = [_split_location(j) for i in value_profile for j in i]

# Per-column views of the same data, kept under their original names.
list_country = [loc[0] for loc in list_loc]
list_state = [loc[1] for loc in list_loc]
list_city = [loc[2] for loc in list_loc]

# Location lookup table: the raw location string split into its three parts,
# identified by an integer LocationID. All parts are NOT NULL, so missing
# components have to be stored as empty strings.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [Location] (
        [LocationID] integer not null primary key,
        [Country] text not null,
        [State] text not null,
        [City] text not null
    );
    """

def insert_profile(conn, values):
    """Bulk-insert rows into the ``Location`` table.

    Args:
        conn: Open sqlite3 connection holding the ``Location`` table.
        values: Iterable of ``(Country, State, City)`` triples.

    Returns:
        The cursor's ``lastrowid`` attribute after the ``executemany`` call.
    """
    insert_sql = ''' INSERT INTO Location(Country,State,City) 
                VALUES(?,?,?) '''
    return conn.cursor().executemany(insert_sql, values).lastrowid

   

# Drop and recreate Location, then bulk-insert the parsed triples; the
# connection's context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2,'Location')
    insert_profile(conn1,list_loc)

In [29]:
# description - description

# Distinct (company_profile, description) pairs from the raw data ...
with conn: 
    sql_statement = "SELECT distinct company_profile,description from raw_data"
    value_profile = execute_sql_statement(sql_statement, conn)
# ... and the CompanyID assigned to each profile in the normalized database.
with conn1: 
    sql_statement = "SELECT CompanyID,company_profile from Company"
    value_comp = execute_sql_statement(sql_statement, conn1)
# Map each company_profile text to its CompanyID (a later row with the same
# text overwrites an earlier one, as before).
comp_dict = {text: key for key, text in value_comp}

# Pair every description with the CompanyID of its profile. Pairs whose
# profile is not in the Company table are skipped. A direct dict lookup
# replaces the former scan over every company for every pair; the resulting
# list and its order are unchanged.
desc_list = [
    (description, comp_dict[profile])
    for profile, description in value_profile
    if profile in comp_dict
]

# Description table: each description text linked to the company it belongs
# to through a foreign key on Company.CompanyID.
create_table_sql2 = """
    CREATE TABLE IF NOT EXISTS [Description] (
        [DescriptionID] integer not null primary key,
        [Description] text not null,
        [CompanyID] integer not null,
        Foreign key(CompanyID) REFERENCES Company(CompanyID)
    );
    """

def insert_profile(conn, values):
    """Bulk-insert rows into the ``Description`` table.

    Args:
        conn: Open sqlite3 connection holding the ``Description`` table.
        values: Iterable of ``(Description, CompanyID)`` pairs.

    Returns:
        The cursor's ``lastrowid`` attribute after the ``executemany`` call.
    """
    insert_sql = ''' INSERT INTO Description(Description,CompanyID) 
                VALUES(?,?) '''
    return conn.cursor().executemany(insert_sql, values).lastrowid

   

# Drop and recreate Description, then bulk-insert the (description, CompanyID)
# pairs; the connection's context manager commits when the block exits cleanly.
with conn1:
    create_table(conn1, create_table_sql2,'Description')
    insert_profile(conn1,desc_list)

EDA
- function & fraudulent
- 

In [23]:
# Load the postings CSV into a DataFrame for the exploratory analysis.
# Note that this rebinds `df`, which earlier cells used for query results.
df = pd.read_csv("fake_job_postings.csv")

In [24]:
# Preview the first five rows.
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [25]:
# (rows, columns) of the loaded frame.
df.shape

(17880, 18)

In [26]:
# Column dtypes and non-null counts; shows which fields are sparsely filled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func