In [1]:
import pandas as pd
import sqlite3
import numpy as np
# read in all the tables into their own dataframes
pd.options.display.max_rows = 999
pd.options.display.max_colwidth = 999
random_seed = 100
np.random.seed(random_seed)

# Path to your SQLite database
database_path = '/home/agartside/CustomJobAlertSystem/DataBase/google_database.db'

# Create a connection to the database
conn = sqlite3.connect(database_path)

# Get a list of all tables in the database
table_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(table_query, conn)

# Create a dictionary to hold dataframes
dataframes = {}

# Load data from each table into a separate DataFrame
for table in tables['name']:
    query = f"SELECT * FROM {table}"
    dataframes[table] = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Now, 'dataframes' is a dictionary with table names as keys and corresponding DataFrames as values
# For example, to access a dataframe, use dataframes['table_name']


In [2]:
# add new column to each dataframe of the job category
for table in tables['name']:
    dataframe = dataframes[table]
    dataframe['Category'] = table[:-5]
    print(dataframe.head())

# Add all the dataframes together into 1 dataframe
job_listings_dataframe = pd.concat([dataframes[table] for table in tables['name']],ignore_index = False)
job_listings_dataframe.head()

job_listings_dataframe = job_listings_dataframe[['JobTitle','Description','Qualifications','Responsibilities','Category']]
job_listings_dataframe = job_listings_dataframe[job_listings_dataframe['Category'] != 'web_developer']
job_listings_dataframe = job_listings_dataframe.sample(frac = 1).reset_index(drop = True)

def remove_whitespace(text):
    return " ".join(text.split())

def clean_data(dataframe):
    # Regex pattern to match various symbols including "•", ",", ".", "\n", "…", "-", and other non-alphanumeric characters
    pattern = r'[•,\n\\…\-\/\(\)\"\!\?\:\;\@\#\$\%\^\&\*\_\+\=\[\]\{\}\|\<\>\`~]'
    for column in dataframe.columns:
        dataframe.loc[:,column] = dataframe[column].astype(str)
        dataframe.loc[:,column] = dataframe[column].str.replace(pattern, ' ', regex = True)
        dataframe.loc[:,column] = dataframe[column].str.lower()
        dataframe.loc[:,column] = dataframe[column].apply(remove_whitespace) 
    return dataframe

job_listings_dataframe.head()

   JobID                                                       ListHashing  \
0      1  87614021101e011842bb7a94c4fc4f554a7d205bbef5fe81f43f7eea3989a540   
1      2  c8f9d9f9ce663ef31a1bc48fdbc33049ef4220e59f1b74f880857fb4c747d463   
2      3  c03b0ac2cbde62a6a000e2f9eb747b957ab12650a6a290b3580f19c351846b1e   
3      5  31bff1a9543a6592ab06d020029d633ff696f7ddf7da84cace06fea1b5f90ee5   
4      6  33cc3f411b485c88b7732c92f91772eba04156bcc51ef3a911e5f1176694f023   

                               CompanyTitle  \
0                             ClearanceJobs   
1                                     Adobe   
2                                      Dice   
3                                Cosmogence   
4  Johns Hopkins Applied Physics Laboratory   

                                                                    JobTitle  \
0         Software Engineer I - (Onsite) Summer 2024 with Security Clearance   
1                                            2024 Intern - Software Engineer   
2       

Unnamed: 0,JobTitle,Description,Qualifications,Responsibilities,Category
0,Cyber Security Analyst,"Please feel free to submit your resume for future employment considerations. Email all resumes to info@relevanttechnologies.com with the word ""Resume"" in the subject line.\n\nIf you are interested in an internship, download our internship application...\n\nRelevant Technologies currently has a job opening for a Senior Cyber Security Analyst. The position requires 5 years of experience, a Bachelors degree, and expert knowledge of security audit, compliance, and risk assessment practices. Knowledge of NIST Special Publications related to FISMA compliance, a must, DIACAP desirable. Former experience as a Windows or UNIX systems administrator highly desirable. Knowledge of security products such as firewalls, intrusion detection systems, and access control products etc. is highly desired. CFCPs are preferred. CISM or CRISC may be acceptable for the right candidate. Must have excellent writing and communications skills.\n\nThe position is based in Columbia, Maryland and no relocation i...","•The position requires 5 years of experience, a Bachelors degree, and expert knowledge of security audit, compliance, and risk assessment practices •CISM or CRISC may be acceptable for the right candidate •Must have excellent writing and communications skills",,cybersecurity
1,Cybersecurity & Technology Audit Internship (Summer 2024),"Pay Philosophy The typical starting salary range for this role is determined by a number of factors including skills, experience, education, certifications and location. The full salary range for this role reflects the competitive labor market value for all employees in these positions across the national market and provides an opportunity to progress as employees grow and develop within the... role. Some roles at Liberty Mutual have a corresponding compensation plan which may include commission and/or bonus earnings at rates that vary based on multiple factors set forth in the compensation plan for the role.\n\nDescription A career in finance at Liberty Mutual is more than just balancing assets and liabilities. You'll be using innovative tools and problem-solving skills to fuel your growth and success-and ours. We're dedicated to helping industry-leading finance talent realize their dreams-straight out of college.\n\nSound like you? Read on! The details When you take on the role ...","•You are pursuing a Bachelor's degree in Business Management, Management Information Systems (MIS), Computer Science, or equivalent training, with a minimum 3.0 cumulative GPA •You are detail-oriented and thrive in a fast-paced work environment •You possess strong interpersonal, communication, organizational, and leadership skills •You have experience with technical programs, including a working knowledge of Excel •You are able to commit to a full-time internship over the course of 11-12 weeks","•The details When you take on the role of summer intern with Liberty Mutual's Cybersecurity & Technology Audit department, you're signing on to a collaborative team that's responsible for auditing all of our internal control systems •Your key responsibilities during this rewarding 11-12 internship program will include interviewing personnel, performing testwork, analyzing results, and communicating issues with managers •You'll also participate in audit project teams to complete risk assessments and control evaluations for IT areas under review",cybersecurity
2,Information Technology Intern,"Job Details Level: Entry\nJob Location: Fairfield Location - Fairfield, OH\nSalary Range: Undisclosed...\n\nDescription\n\nThe Application Developer writes and debugs programs for the core database, web server, and network. Performs analysis, design, programming and testing of business applications. Maintains, documents, and provides technical support for systems software, as well as modifying existing and creating new systems software for specific company needs. Inspects conformance of system design to defined architecture standards and models.\nExchanges ideas, information and opinions with the IT Director to arrive at decisions, conclusions, solutions, or solve disputes. Is a company-wide recognized expert and leader and is expected to contribute to the overall company direction.\n\nJob Requirements:\n• HTML, JavaScript, VBScript, XSL, XML, ASP .NET, Visual Basic .NET, SQL Server, Crystal Reports, C#, DevExpress, Boomi, ETL\n• Good understanding of project lifecycle and intimat...","•HTML, JavaScript, VBScript, XSL, XML, ASP .NET, Visual Basic .NET, SQL Server, Crystal Reports, C#, DevExpress, Boomi, ETL •Good understanding of project lifecycle and intimate knowledge of coding, unit testing, and code-review phases •Basic administration and Windows-based infrastructure architecture •Knowledge of IIS and basic Web administration functions •Excellent oral and written communication skills •Competence and proficiency in Microsoft Office software products, including Word and Excel, and other software such as Visio •Ability to communicate and solve problems in a productive, rational, and non-emotional manner, using language and tone of voice that promotes interpersonal relationships •Reliable attendance, a record of completing projects, and organization •Enthusiastic: Shows intense and eager enjoyment and interest •Dedicated: Devoted to a task or purpose with loyalty or integrity •Detail Oriented: Capable of carrying out a given task with all details necessary to ge...","•The Application Developer writes and debugs programs for the core database, web server, and network •Performs analysis, design, programming and testing of business applications •Maintains, documents, and provides technical support for systems software, as well as modifying existing and creating new systems software for specific company needs •Inspects conformance of system design to defined architecture standards and models •Exchanges ideas, information and opinions with the IT Director to arrive at decisions, conclusions, solutions, or solve disputes •Is a company-wide recognized expert and leader and is expected to contribute to the overall company direction •Algorithm and data structure design •These responsibilities are ranked in order of importance to the organization •Leads architectural decisions based on best practices and input from implementation teams •Ensures compliance with architectural design and divisional system integration standards/guidelines •Major contributor...",information_technology
3,"Software Development Intern, Instrumentation","JOB OBJECTIVE: As a Software Development Intern, assist the Promega Software engineering team with implementing, testing and documenting software components and utilities during the summer internship period.\n• Some hybrid work is allowed for this poistion but will be required to work from the office in Santa Clara, CA a couple days per week\n...\nCORE DUTIES:\n\n1. Assist the software engineering team with documenting software components using our automated software documentation tools.\n\n2. Test various software components individually and participate in software components integration / troubleshooting.\n\n3. Participate in configuration / maintenance of our build tools.\n\n4. Help developing auxiliary software utilities or software tools as needed.\n\n5. Assist in evaluating off the shelf software tools and such as installation tools, software controls and libraries.\n\n6. Understands and complies with ethical, legal and regulatory requirements applicable to our business.\n\n...","•Embracing and being open to incorporating Promega’s 6 Emotional & Social Intelligence (ESI) core principles in daily work •Currently pursuing bachelor’s degree in computer science, computer engineering, or related fields •Familiarity with Microsoft .NET, C# •Familiarity with software design and documentation principles •Able to manage multiple projects simultaneously •Ability to effectively work on many priorities at one time, which change frequently •Ability to interact productively with team members, take direction, and follow written directions •Excellent written and verbal communication skills •Ability to use a computer/Microsoft Office applications","•Some hybrid work is allowed for this poistion but will be required to work from the office in Santa Clara, CA a couple days per week •Assist the software engineering team with documenting software components using our automated software documentation tools •Test various software components individually and participate in software components integration / troubleshooting •Participate in configuration / maintenance of our build tools •Help developing auxiliary software utilities or software tools as needed •Assist in evaluating off the shelf software tools and such as installation tools, software controls and libraries •Understands and complies with ethical, legal and regulatory requirements applicable to our business •Help SQA team with software testing and releasing •Help hardware team with providing or setting up software tools / utilities •Demonstrates inclusion through their own words and actions and is accountable for a safe workspace •Acts with kindness, curiosity and respec...",software_development
4,92: IT - Space Systems and Technology - CO-OP Student (September-January),"The Advanced Sensors and Techniques Group operates the Lincoln Space Surveillance Complex (LSSC) that comprises of the Millstone deep-space satellite tracking radar, the Haystack Ultra-wideband Satellite Imaging Radar (HUSIR) and the Haystack Auxiliary imaging radar (HAX). All these radar are remotely controlled from the Lexington Space Situational Awareness Center (LSSAC) which serves as a data... processing and fusion node for the LSSC and other ground and space-based space surveillance sensors. Together, the LSSC and LSSAC serve as an operational test bed for space situational awareness technologies and provide access to a rich set of radar and optical data. The group's current research and development efforts focus on problems such as tracking and identification of newly launched satellites, tracking and discrimination of satellites in geosynchronous clusters, automated radar image exploitation, close approach monitoring and collision warning, applications of mullti-sensor dat...","•The successful candidate must be actively enrolled in a PhD, M.S., or B.S. degree program in Electrical Engineering, Computer Engineering, Physics, Mathematics, Aeronautics/Astronautics Engineering or other relevant technical field •Applicants must have familiarity with C/C++, MATLAB, Python, or other scientific programming languages and skills and experience in one or more of the following areas: signal processing, aerospace applications, remote sensing applications, hardware fabrication, or electronics integration and test •Selected candidate will be subject to a pre-employment background investigation and must be able to obtain and maintain a Secret level DoD security clearance","•All these radar are remotely controlled from the Lexington Space Situational Awareness Center (LSSAC) which serves as a data processing and fusion node for the LSSC and other ground and space-based space surveillance sensors •Together, the LSSC and LSSAC serve as an operational test bed for space situational awareness technologies and provide access to a rich set of radar and optical data •The group's current research and development efforts focus on problems such as tracking and identification of newly launched satellites, tracking and discrimination of satellites in geosynchronous clusters, automated radar image exploitation, close approach monitoring and collision warning, applications of mullti-sensor data fusion, and decision support •The Advanced Sensors and Techniques Group is seeking a CO-OP student to help design and build the next generation of space surveillance sensors and architectures •Activities may include scientific programming, data analysis, hardware developmen...",information_technology


In [3]:
print(len(job_listings_dataframe))

1811


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

np.random.seed(random_seed)
X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(job_listings_dataframe['JobTitle'],
                                                                    job_listings_dataframe['Category'],
                                                                    test_size = 0.3,
                                                                    stratify=job_listings_dataframe['Category'])

X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, test_size = 0.5, stratify = y_test_and_val)


random_forest_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                                   ('classifier', RandomForestClassifier(n_jobs=-4))])

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

random_forest_pipeline.fit(X_train, y_train_encoded)


In [5]:
untuned_predictions = []

In [6]:
random_forest_test_predictions = random_forest_pipeline.predict(X_test)
print(classification_report(y_test_encoded,random_forest_test_predictions))
print(accuracy_score(y_test_encoded, random_forest_test_predictions))
untuned_predictions.append(classification_report(y_test_encoded,random_forest_test_predictions))
untuned_predictions.append(accuracy_score(y_test_encoded, random_forest_test_predictions))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88        50
           1       0.96      0.97      0.97        79
           2       0.84      0.88      0.86        73
           3       0.96      0.97      0.96        70

    accuracy                           0.92       272
   macro avg       0.92      0.92      0.92       272
weighted avg       0.92      0.92      0.92       272

0.9227941176470589


In [7]:
random_forest_val_predictions = random_forest_pipeline.predict(X_val)
print(classification_report(y_val_encoded,random_forest_val_predictions))
print(accuracy_score(y_val_encoded, random_forest_val_predictions))
untuned_predictions.append(classification_report(y_val_encoded,random_forest_val_predictions))
untuned_predictions.append(accuracy_score(y_val_encoded, random_forest_val_predictions))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90        50
           1       0.99      0.97      0.98        79
           2       0.93      0.89      0.91        74
           3       0.90      0.96      0.93        69

    accuracy                           0.93       272
   macro avg       0.93      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9338235294117647


In [8]:
prediction = random_forest_pipeline.predict(["software engineer"])
print(prediction)
print(label_encoder.inverse_transform(prediction))

[3]
['software_development']


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

np.random.seed(random_seed)

X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(job_listings_dataframe['JobTitle'],
                                                                    job_listings_dataframe['Category'],
                                                                    test_size = 0.3,
                                                                    stratify=job_listings_dataframe['Category'])

X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, test_size = 0.5, stratify = y_test_and_val)

# Define the parameter grid for RandomForestClassifier
random_forest_param_grid = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'max_depth': [5, 10, 20, 30, 50, 70, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}


random_forest_grid_search = RandomizedSearchCV(RandomForestClassifier(),
                                                param_distributions = random_forest_param_grid,
                                                n_iter = 500,
                                                cv = 5,
                                                scoring='f1_weighted',
                                                n_jobs = -4)


random_forest_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                                   ('classifier', random_forest_grid_search)])

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

random_forest_pipeline.fit(X_train, y_train_encoded)


905 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
390 fits failed with the following error:
Traceback (most recent call last):
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3

In [10]:
random_forest_test_predictions = random_forest_pipeline.predict(X_test)
print(classification_report(y_test_encoded,random_forest_test_predictions))
print(accuracy_score(y_test_encoded, random_forest_test_predictions))
print(untuned_predictions[0])
print(untuned_predictions[1])

              precision    recall  f1-score   support

           0       0.96      0.86      0.91        50
           1       0.96      0.97      0.97        79
           2       0.87      0.89      0.88        73
           3       0.96      0.99      0.97        70

    accuracy                           0.93       272
   macro avg       0.94      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9338235294117647
              precision    recall  f1-score   support

           0       0.93      0.84      0.88        50
           1       0.96      0.97      0.97        79
           2       0.84      0.88      0.86        73
           3       0.96      0.97      0.96        70

    accuracy                           0.92       272
   macro avg       0.92      0.92      0.92       272
weighted avg       0.92      0.92      0.92       272

0.9227941176470589


In [11]:
random_forest_val_predictions = random_forest_pipeline.predict(X_val)
print(classification_report(y_val_encoded,random_forest_val_predictions))
print(accuracy_score(y_val_encoded, random_forest_val_predictions))
print(untuned_predictions[2])
print(untuned_predictions[3])

              precision    recall  f1-score   support

           0       0.94      0.88      0.91        50
           1       0.97      0.99      0.98        79
           2       0.93      0.91      0.92        74
           3       0.90      0.96      0.93        69

    accuracy                           0.94       272
   macro avg       0.94      0.93      0.93       272
weighted avg       0.94      0.94      0.94       272

0.9375
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        50
           1       0.99      0.97      0.98        79
           2       0.93      0.89      0.91        74
           3       0.90      0.96      0.93        69

    accuracy                           0.93       272
   macro avg       0.93      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9338235294117647


In [12]:

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(job_listings_dataframe['Category'])

# Fit the pipeline on the full dataset
random_forest_pipeline.fit(job_listings_dataframe['JobTitle'], y_encoded)

# At this point, your pipeline is trained on the entire dataset


875 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
420 fits failed with the following error:
Traceback (most recent call last):
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/home/agartside/miniconda3/envs/webscraping/lib/python3

In [13]:
from joblib import dump

# save model and encoder to file
dump(random_forest_pipeline, 'random_forest_model.joblib')
dump(label_encoder, 'random_forest_encoder.joblib')

['random_forest_encoder.joblib']