In [1]:
import pandas as pd
import sqlite3
import numpy as np
import time
import os
start_time = time.time()
# read in all the tables into their own dataframes
random_seed = 100
np.random.seed(random_seed)

# Path to your SQLite database
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
database_path = os.path.join(parent_directory,'DataBase','google_database.db')

# Create a connection to the database
conn = sqlite3.connect(database_path)

# Get a list of all tables in the database
table_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(table_query, conn)

# Create a dictionary to hold dataframes
dataframes = {}

# Load data from each table into a separate DataFrame
for table in tables['name']:
    query = f"SELECT * FROM {table}"
    dataframes[table] = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Now, 'dataframes' is a dictionary with table names as keys and corresponding DataFrames as values
# For example, to access a dataframe, use dataframes['table_name']


In [2]:
# add new column to each dataframe of the job category
for table in tables['name']:
    dataframe = dataframes[table]
    dataframe['Category'] = table[:-5]
    print(dataframe.head())

# Add all the dataframes together into 1 dataframe
job_listings_dataframe = pd.concat([dataframes[table] for table in tables['name']],ignore_index = False)
job_listings_dataframe.head()

job_listings_dataframe = job_listings_dataframe[['JobTitle','Description','Qualifications','Responsibilities','Category']]
job_listings_dataframe = job_listings_dataframe[job_listings_dataframe['Category'] != 'web_developer']
job_listings_dataframe = job_listings_dataframe.sample(frac = 1).reset_index(drop = True)

def remove_whitespace(text):
    return " ".join(text.split())

def clean_data(dataframe):
    # Regex pattern to match various symbols including "•", ",", ".", "\n", "…", "-", and other non-alphanumeric characters
    pattern = r'[•,\n\\…\-\/\(\)\"\!\?\:\;\@\#\$\%\^\&\*\_\+\=\[\]\{\}\|\<\>\`~]'
    for column in dataframe.columns:
        dataframe.loc[:,column] = dataframe[column].astype(str)
        dataframe.loc[:,column] = dataframe[column].str.replace(pattern, ' ', regex = True)
        dataframe.loc[:,column] = dataframe[column].str.lower()
        dataframe.loc[:,column] = dataframe[column].apply(remove_whitespace) 
    return dataframe

job_listings_dataframe.head()

   JobID                                        ListHashing  \
0      1  87614021101e011842bb7a94c4fc4f554a7d205bbef5fe...   
1      2  c8f9d9f9ce663ef31a1bc48fdbc33049ef4220e59f1b74...   
2      3  c03b0ac2cbde62a6a000e2f9eb747b957ab12650a6a290...   
3      5  31bff1a9543a6592ab06d020029d633ff696f7ddf7da84...   
4      6  33cc3f411b485c88b7732c92f91772eba04156bcc51ef3...   

                               CompanyTitle  \
0                             ClearanceJobs   
1                                     Adobe   
2                                      Dice   
3                                Cosmogence   
4  Johns Hopkins Applied Physics Laboratory   

                                            JobTitle  \
0  Software Engineer I - (Onsite) Summer 2024 wit...   
1                    2024 Intern - Software Engineer   
2  Senior Lead Software Engineer, Full Stack (Rem...   
3                     Software Engineer - Internship   
4  2024 Internship - Software Developer - Tactica...   

 

Unnamed: 0,JobTitle,Description,Qualifications,Responsibilities,Category
0,Data Science MS/PhD Internship,Who are we and why should you join us?\n\nBett...,•The ideal candidate would have experience wit...,•You will use data to drive insight into the p...,data_science
1,Cybersecurity Intern,"Overview:\n\nS2 Global, an OSI Systems Company...",•Familiarity with security standards and frame...,"•Focused on Operations, Integration and Traini...",cybersecurity
2,Cyber Security Intern,Why Badger Meter?\n\nBadger Meter is a leading...,"•Strong interpersonal, communication, and docu...",•The Cyber Security Analyst Intern will assist...,cybersecurity
3,Software Development Intern (M&S) - Summer 2024,Tyler Technologies is looking for Software Dev...,•Pursuing a bachelor's degree in Computer Scie...,•The Software Development Intern will experien...,software_development
4,Information Technologies Intern,We Care. We Deliver. Our purpose is to create ...,•Hands on experience with business processes •...,•Assisting in preparing presentations •Analyzi...,information_technology


In [3]:
# Assuming your DataFrame is named job_listings_dataframe
if 'Category' in job_listings_dataframe.columns:
    # Keep only the rows where 'Category' is not 'recent_job_lis'
    job_listings_dataframe = job_listings_dataframe[job_listings_dataframe['Category'] != 'recent_job_lis']


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

np.random.seed(random_seed)
X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(job_listings_dataframe['JobTitle'],
                                                                    job_listings_dataframe['Category'],
                                                                    test_size = 0.3,
                                                                    stratify=job_listings_dataframe['Category'])

X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, test_size = 0.5, stratify = y_test_and_val)


random_forest_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                                   ('classifier', RandomForestClassifier(n_jobs=-4))])

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

random_forest_pipeline.fit(X_train, y_train_encoded)


In [5]:
untuned_predictions = []

In [6]:
random_forest_test_predictions = random_forest_pipeline.predict(X_test)
print(classification_report(y_test_encoded,random_forest_test_predictions))
print(accuracy_score(y_test_encoded, random_forest_test_predictions))
untuned_predictions.append(classification_report(y_test_encoded,random_forest_test_predictions))
untuned_predictions.append(accuracy_score(y_test_encoded, random_forest_test_predictions))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91        50
           1       0.98      1.00      0.99        79
           2       0.91      0.95      0.93        73
           3       0.97      0.97      0.97        70

    accuracy                           0.95       272
   macro avg       0.95      0.94      0.95       272
weighted avg       0.95      0.95      0.95       272

0.9522058823529411


In [7]:
random_forest_val_predictions = random_forest_pipeline.predict(X_val)
print(classification_report(y_val_encoded,random_forest_val_predictions))
print(accuracy_score(y_val_encoded, random_forest_val_predictions))
untuned_predictions.append(classification_report(y_val_encoded,random_forest_val_predictions))
untuned_predictions.append(accuracy_score(y_val_encoded, random_forest_val_predictions))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90        50
           1       0.96      0.99      0.97        79
           2       0.91      0.86      0.89        74
           3       0.93      0.96      0.94        69

    accuracy                           0.93       272
   macro avg       0.93      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9301470588235294


In [8]:
prediction = random_forest_pipeline.predict(["software engineer"])
print(prediction)
print(label_encoder.inverse_transform(prediction))

[3]
['software_development']


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

np.random.seed(random_seed)

X_train, X_test_and_val, y_train, y_test_and_val = train_test_split(job_listings_dataframe['JobTitle'],
                                                                    job_listings_dataframe['Category'],
                                                                    test_size = 0.3,
                                                                    stratify=job_listings_dataframe['Category'])

X_test, X_val, y_test, y_val = train_test_split(X_test_and_val, y_test_and_val, test_size = 0.5, stratify = y_test_and_val)

# Define the parameter grid for RandomForestClassifier
random_forest_param_grid = {
    'n_estimators': [100, 200, 300, 500, 700, 1000],
    'max_depth': [5, 10, 20, 30, 50, 70, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}


random_forest_grid_search = RandomizedSearchCV(RandomForestClassifier(),
                                                param_distributions = random_forest_param_grid,
                                                n_iter = 500,
                                                cv = 5,
                                                scoring='f1_weighted',
                                                n_jobs = -4)


random_forest_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                                   ('classifier', random_forest_grid_search)])

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

random_forest_pipeline.fit(X_train, y_train_encoded)


905 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
651 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\utils\_param_validatio

In [10]:
random_forest_test_predictions = random_forest_pipeline.predict(X_test)
print(classification_report(y_test_encoded,random_forest_test_predictions))
print(accuracy_score(y_test_encoded, random_forest_test_predictions))
print(untuned_predictions[0])
print(untuned_predictions[1])

              precision    recall  f1-score   support

           0       0.96      0.90      0.93        50
           1       0.99      1.00      0.99        79
           2       0.92      0.95      0.93        73
           3       0.97      0.97      0.97        70

    accuracy                           0.96       272
   macro avg       0.96      0.95      0.96       272
weighted avg       0.96      0.96      0.96       272

0.9595588235294118
              precision    recall  f1-score   support

           0       0.96      0.86      0.91        50
           1       0.98      1.00      0.99        79
           2       0.91      0.95      0.93        73
           3       0.97      0.97      0.97        70

    accuracy                           0.95       272
   macro avg       0.95      0.94      0.95       272
weighted avg       0.95      0.95      0.95       272

0.9522058823529411


In [11]:
random_forest_val_predictions = random_forest_pipeline.predict(X_val)
print(classification_report(y_val_encoded,random_forest_val_predictions))
print(accuracy_score(y_val_encoded, random_forest_val_predictions))
print(untuned_predictions[2])
print(untuned_predictions[3])

              precision    recall  f1-score   support

           0       0.90      0.90      0.90        50
           1       0.96      0.99      0.97        79
           2       0.93      0.86      0.90        74
           3       0.93      0.97      0.95        69

    accuracy                           0.93       272
   macro avg       0.93      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9338235294117647
              precision    recall  f1-score   support

           0       0.90      0.90      0.90        50
           1       0.96      0.99      0.97        79
           2       0.91      0.86      0.89        74
           3       0.93      0.96      0.94        69

    accuracy                           0.93       272
   macro avg       0.93      0.93      0.93       272
weighted avg       0.93      0.93      0.93       272

0.9301470588235294


In [12]:

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(job_listings_dataframe['Category'])

# Fit the pipeline on the full dataset
random_forest_pipeline.fit(job_listings_dataframe['JobTitle'], y_encoded)

# At this point, your pipeline is trained on the entire dataset


875 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
340 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\anaxi\miniconda3\envs\webscraping\Lib\site-packages\sklearn\utils\_param_validatio

In [13]:
from joblib import dump

# save model and encoder to file
dump(random_forest_pipeline, 'random_forest_model.joblib')
dump(label_encoder, 'random_forest_encoder.joblib')
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 148.55682563781738 seconds
