In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from scipy import stats
from io import BytesIO

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import NearestNeighbors

from flask import Flask, request, render_template_string

plt.style.use('ggplot')

In [2]:
# Load data
job_skills = pd.read_csv("job_skills.csv")
skills = pd.read_csv("skills.csv")
posting = pd.read_csv("postings.csv")

# Data Manipulation

In [3]:
# Merge posting with job_skills on 'job_id'
posting_job_skills = pd.merge(posting, job_skills, on='job_id', how='inner')

# Now merge the result with skills to get full skill names using 'skill_abr'
df = pd.merge(posting_job_skills, skills, on='skill_abr', how='inner')

# Drop rows with missing values in the specified columns
columns_to_check = ['max_salary', 'pay_period']
df.dropna(subset = columns_to_check, inplace = True)

# Drop columns that are no longer needed
columns_to_drop = ['med_salary','applies','remote_allowed', 'application_url', 
                   'closed_time', 'skills_desc','posting_domain', 'sponsored','zip_code', 'fips']
df.drop(columns=columns_to_drop, inplace=True)

#  Impute 'company_name' with the most frequent  value
df['company_name'] = df['company_name'].fillna(df['company_name'].mode().iloc[0])

# Impute 'description' with a placeholder string
df['description'] = df['description'].fillna("No description available")

# Impute 'company_id' with the most frequent value
df['company_id'] = df['company_id'].fillna(df['company_id'].mode().iloc[0])

# Impute 'views' with the median
df['views'] = df['views'].fillna(df['views'].median())

# Impute 'formatted_experience_level' with the most frequent  value
df['formatted_experience_level'] = df['formatted_experience_level'].fillna(df['formatted_experience_level'].mode().iloc[0])

# Identify rows where the pay_period contains any form of 'hourly'
hourly_mask = df['pay_period'].str.contains(r'HOURLY|hourly|Hourly', case=False, na=False)

# Convert max_salary and min_salary to yearly salaries if pay_period is hourly
df.loc[hourly_mask, 'max_salary'] = df.loc[hourly_mask, 'max_salary'] * 2080
df.loc[hourly_mask, 'min_salary'] = df.loc[hourly_mask, 'min_salary'] * 2080

# Update the pay_period to 'YEARLY' for the rows where the pay_period was Hourly
df.loc[hourly_mask, 'pay_period'] = 'YEARLY'

# Identify rows where the pay_period is 'MONTHLY', 'WEEKLY', or 'BIWEEKLY'
monthly_mask = df['pay_period'].str.contains(r'MONTHLY', case=False, na=False)
weekly_mask = df['pay_period'].str.contains(r'WEEKLY', case=False, na=False)
biweekly_mask = df['pay_period'].str.contains(r'BIWEEKLY', case=False, na=False)

# Convert max_salary and min_salary to yearly salaries for the corresponding pay periods
# For monthly (12 months per year)
df.loc[monthly_mask, 'max_salary'] = df.loc[monthly_mask, 'max_salary'] * 12
df.loc[monthly_mask, 'min_salary'] = df.loc[monthly_mask, 'min_salary'] * 12
df.loc[monthly_mask, 'pay_period'] = 'YEARLY'

# For weekly (52 weeks per year)
df.loc[weekly_mask, 'max_salary'] = df.loc[weekly_mask, 'max_salary'] * 52
df.loc[weekly_mask, 'min_salary'] = df.loc[weekly_mask, 'min_salary'] * 52
df.loc[weekly_mask, 'pay_period'] = 'YEARLY'

# For biweekly (26 biweeks per year)
df.loc[biweekly_mask, 'max_salary'] = df.loc[biweekly_mask, 'max_salary'] * 26
df.loc[biweekly_mask, 'min_salary'] = df.loc[biweekly_mask, 'min_salary'] * 26
df.loc[biweekly_mask, 'pay_period'] = 'YEARLY'

# Select numerical columns, but exclude some  column from outlier capping
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

#  Remove some columns from the numerical columns for outlier capping
numerical_cols = [col for col in numerical_cols if col not in ['job_id', 'views' ]]

# Calculate IQR for each numerical column
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Cap outliers at the 1.5*IQR threshold (Winsorization)
for col in numerical_cols:
    df[col] = df[col].clip(lower=Q1[col] - 1.5 * IQR[col], upper=Q3[col] + 1.5 * IQR[col])
    
# Create new variable based on existing variables
df["total_time_listed"] = df["expiry"] - df["original_listed_time"]

# Create indicator variable to be whether or not greater than or equal to 3rd quartile
views_q3 = df["views"].quantile(0.75)
df["views_ind"] = np.where(df["views"] >= views_q3, 1, 0)

# Job Popularity

## Decision Tree Classifier and Random Forest Classifier

In [4]:
# Select variables
df_selected = df[["company_name", 
                  "title", 
                  "max_salary",
                  "min_salary",
                  "normalized_salary",
                  "location", 
                  "formatted_work_type", 
                  "application_type", 
                  "formatted_experience_level",
                  "skill_name",
                  "total_time_listed"]]

# Encode categorical variables
label_encoder = LabelEncoder()

object_cols = df_selected.select_dtypes(include = ['object']).columns.tolist()
for col in object_cols:
    df_selected.loc[:, col] = label_encoder.fit_transform(df_selected[col])
    
# Select values for X and y
X = df_selected.values
y = df["views_ind"].values

# Separate train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(criterion = "entropy", random_state = 21)
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state = 21)
rf_classifier.fit(X_train, y_train)
y_pred2 = rf_classifier.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


# Predicting Salary

## Random Forest Regression

In [5]:
# Features and Target
X_reg = df[['company_name', 'title', 'location', 'work_type', 'formatted_experience_level', 
            'views', 'description', 'original_listed_time', 'expiry']]
y_reg = df['normalized_salary']

# Encoding the categorical features with labels
categorical_columns = ['company_name', 'title', 'location', 'work_type', 
                       'formatted_experience_level', 'description']

encoders = {}  # Store encoders for later in code

for i in categorical_columns:
    label_e = LabelEncoder()
    X_reg.loc[:, i] = label_e.fit_transform(X_reg[i])
    encoders[i] = label_e  # Save the encoder for potential reverse transformation

# Training 80 / Testing 20
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Random Forest Regressor
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Training phase (fit model here)
random_forest.fit(X_train_reg, y_train_reg)

# predict (y_pred)
y_pred_reg = random_forest.predict(X_test_reg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Linear Regression

In [6]:
# Features and Target
X_reg2 = df[['company_name', 'title', 'location', 'work_type', 'formatted_experience_level', 'views']]
y_reg2 = df['normalized_salary']  # our target

# Separate cat and num variables we're using
categorical_features = ['company_name', 'title', 'location', 'work_type', 'formatted_experience_level']
numerical_features = ['views']

# cat pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# num pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing numerical values with mean
    ('scaler', StandardScaler())  # Scale numerical features
])

# Combined cat and num pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Linear Regression step
LR_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Training 80 / Testing 20
X_train_reg2, X_test_reg2, y_train_reg2, y_test_reg2 = train_test_split(X_reg2, y_reg2, 
                                                                        test_size=0.2, random_state=42)

# parameters; just used fit intercept for now but we can always add more for variations
param_grid = {
    'regressor__fit_intercept': [True, False]  # Check with and without intercept
}

# GridSearchCV
grid_search = GridSearchCV(LR_model, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X_train_reg2, y_train_reg2)

# Best model / parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred_reg2 = best_model.predict(X_test_reg2)

# App

## Functions

### General

In [7]:
def save_plot():
    # Save plot to a BytesIO object and encode it in base64
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    output = base64.b64encode(buf.getvalue()).decode('utf-8')
    
    return output

### EDA

In [8]:
def plot_salary_boxplots(df):
    plt.figure(figsize=(10, 6))

    # Create a box plot for the three salary columns
    sns.boxplot(data=df[['max_salary', 'min_salary', 'normalized_salary']])

    # Set the title and labels
    plt.title('Box Plot of Max Salary, Min Salary, and Normalized Salary')
    plt.ylabel('Salary')
    
    # Save plot using predefined function
    output = save_plot()
    
    return output

In [9]:
def plot_histograms(df):
    # Get numerical columns
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

    # Calculate the number of rows and columns for the subplots
    n_cols = 3
    n_rows = (len(numerical_cols) // n_cols) + (len(numerical_cols) % n_cols > 0)

    # Set up the plot grid
    plt.figure(figsize=(15, 5 * n_rows))

    # Plot histograms for each numerical column
    for i, col in enumerate(numerical_cols, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df[col], kde=True, bins=30)
        plt.title(f'Distribution of {col}')

    # Save plot using predefined function
    output = save_plot()
    
    return output

In [10]:
def plot_corr_matrix(df):
    plt.figure(figsize=(12, 8))
    
    # Correlation Matrix for numerical columns
    corr = df[numerical_cols].corr()
    
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title("Correlation Matrix of Numerical Features")
    
    # Save plot using predefined function
    output = save_plot()
    
    return output

### Job Popularity

In [11]:
def plot_confusion_matrices(y_test, y_pred, y_pred2):
    fig, ax = plt.subplots(1, 2, figsize = (12, 8))
    
    # Plot two confusion matrices side by side for comparison
    sns.heatmap(confusion_matrix(y_test, y_pred), 
                annot = True, fmt = 'd', cmap = 'Reds', cbar = False, ax = ax[0])
    ax[0].set_title("Decision Tree Classifier Confusion Matrix")

    sns.heatmap(confusion_matrix(y_test, y_pred2), 
                annot = True, fmt = 'd', cmap = 'Blues', cbar = False, ax = ax[1])
    ax[1].set_title("Random Forest Classifier Confusion Matrix")
    
    # Save plot using predefined function
    output = save_plot()
    
    return output

In [12]:
def plot_feature_importance(dt_classifier, rf_classifier, df_selected):
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))

    # Decision Tree Feature Importance
    ax[0].barh(df_selected.columns, dt_classifier.feature_importances_, color="red")
    ax[0].set_title("Decision Tree Classifier Feature Importance")

    # Random Forest Feature Importance
    ax[1].barh(df_selected.columns, rf_classifier.feature_importances_, color="blue")
    ax[1].set_title("Random Forest Classifier Feature Importance")

    # Save plot using predefined function
    output = save_plot()
    
    return output

### Predicting Salary

In [13]:
def plot_feature_importance_2(random_forest, X_reg):
    plt.figure(figsize=(10, 8))

    # Random Forest Feature Importance 
    plt.barh(X_reg.columns, random_forest.feature_importances_, color = "green")
    plt.title("Random Forest Regression Feature Importance")

    # Save plot using predefined function
    output = save_plot()
    
    return output

In [14]:
def get_personalized_recommendations(user_preferences, df):
    """
    Generate more accurate personalized job recommendations
    
    Parameters:
    - user_preferences (dict): Dictionary containing user's job search preferences
    - df (pandas.DataFrame): Original job postings dataframe
    
    Returns:
    - list of recommended job postings
    """
    # Create a copy to avoid modifying the original dataframe
    recommendation_df = df.copy()
    
    # Create a scoring mechanism for recommendations
    def calculate_recommendation_score(row, preferences):
        score = 0
        
        # Title similarity (more weight)
        if 'title' in preferences and preferences['title']:
            # Case-insensitive partial match with higher weight
            if preferences['title'].lower() in row['title'].lower():
                score += 50
            elif any(word.lower() in row['title'].lower() for word in preferences['title'].lower().split()):
                score += 30
        
        # Location similarity
        if 'location' in preferences and preferences['location']:
            if preferences['location'].lower() in row['location'].lower():
                score += 20
            elif any(word.lower() in row['location'].lower() for word in preferences['location'].lower().split()):
                score += 10
        
        # Work type matching
        if 'work_type' in preferences and preferences['work_type']:
            if preferences['work_type'].lower() == row['formatted_work_type'].lower():
                score += 15
        
        # Salary range consideration
        if 'min_salary' in preferences and preferences['min_salary']:
            if row['max_salary'] >= preferences['min_salary']:
                # Closer to desired salary gets more points
                salary_proximity = 1 - abs(row['max_salary'] - preferences['min_salary']) / preferences['min_salary']
                score += 10 * salary_proximity
        
        return score
    
    # Apply scoring mechanism
    recommendation_df['recommendation_score'] = recommendation_df.apply(
        lambda row: calculate_recommendation_score(row, user_preferences), 
        axis=1
    )
    
    # Sort by recommendation score in descending order
    recommended_jobs = recommendation_df.sort_values('recommendation_score', ascending=False)
    
    # Filter out jobs with zero score and take top 5
    recommended_jobs = recommended_jobs[recommended_jobs['recommendation_score'] > 0].head(5)
    
    return recommended_jobs.to_dict('records')

## Implementation

In [None]:
app = Flask(__name__)

@app.route('/')
def home():
    return """
    <html>
        <head><title>Job Market Insights Web Application</title></head>
        <body>
            <h1>Welcome to the Job Market Insights Web Application!</h1>
            <p><a href="/recommendations">Get Personalized Job Recommendations</a></p>
            <p><a href="/eda">View EDA Results</a></p>
            <p><a href="/job_popularity">View Job Popularity Results</a></p>
            <p><a href="/predicting_salary">View Predicting Salary Results</a></p>  
        </body>
    </html>
    """

@app.route('/eda')
def eda():
    # Get Salary Boxplots
    salary_boxplots_image = plot_salary_boxplots(df)
    
    # Get Histograms
    histograms_image = plot_histograms(df)
    
    # Get Correlation Matrix
    corr_matrix_image = plot_corr_matrix(df)
    
    return f"""
    <html>
        <head><title>EDA</title></head>
        <body>
            <h1>EDA</h1>
            <h2>Salary Box Plots After Handling Outliers</h2>
            <img src="data:image/png;base64,{salary_boxplots_image}" />
            
            <h2>Histograms</h2>
            <img src="data:image/png;base64,{histograms_image}" />
            
            <h2>Correlation Matrix</h2>
            <img src="data:image/png;base64,{corr_matrix_image}" />
        </body>
    </html>
    """

@app.route('/job_popularity')
def job_popularity():
    # Get F1 Scores
    f1_dt = f1_score(y_test, y_pred).round(5)
    f1_rf = f1_score(y_test, y_pred2).round(5)
    
    # Get Classification Reports
    report_dt = classification_report(y_test, y_pred)
    report_rf = classification_report(y_test, y_pred2)
    
    # Get Confusion Matrices
    confusion_matrices_image = plot_confusion_matrices(y_test, y_pred, y_pred2)
    
    # Get the Feature Importance Plot
    feature_importance_image = plot_feature_importance(dt_classifier, rf_classifier, df_selected)
    
    return f"""
    <html>
        <head><title>Job Popularity Results</title></head>
        <body>
            <h1>Job Popularity Results</h1>
            <h2>F1 Score Comparison</h2>
            <p>Decision Tree F1 Score: {f1_dt}</p>
            <p>Random Forest F1 Score: {f1_rf}</p>
            
            <h2>Decision Tree Classification Report</h2>
            <pre>{report_dt}</pre>
            
            <h2>Random Forest Classification Report</h2>
            <pre>{report_rf}</pre>
            
            <h2>Confusion Matrices Comparison</h2>
            <img src="data:image/png;base64,{confusion_matrices_image}" />
            
            <h2>Feature Importance Comparison</h2>
            <img src="data:image/png;base64,{feature_importance_image}" />
        </body>
    </html>
    """

@app.route('/predicting_salary')
def predicting_salary():
    # Get Mean Squared Error and R-squared for both regressor models
    mse = mean_squared_error(y_test_reg, y_pred_reg).round(2)
    r2 = r2_score(y_test_reg, y_pred_reg).round(5)
    
    mse_2 = mean_squared_error(y_test_reg2, y_pred_reg2).round(2)
    r2_2 = r2_score(y_test_reg2, y_pred_reg2).round(5)
    
    # Get the Feature Importance Plot
    feature_importance_image_2 = plot_feature_importance_2(random_forest, X_reg)
    
    return f"""
    <html>
        <head><title>Predicting Salary Results</title></head>
        <body>
            <h1>Predicting Salary Results</h1>
            
            <h2>Random Forest Regression Results</h2>
            
            <h3>Metrics</h3>
            <p>MSE: {mse}</p>
            <p>R-squared: {r2}</p>
            
            <h3>Feature Importance</h3>
            <img src="data:image/png;base64,{feature_importance_image_2}" />
            
            <h2>Linear Regression Results</h2>
            
            <h3>Metrics</h3>
            <p>MSE: {mse_2}</p>
            <p>R-squared: {r2_2}</p>
        </body>
    </html>
    """

@app.route('/recommendations', methods=['GET', 'POST'])
def recommendations():
    if request.method == 'POST':
        # Collect user preferences from form
        user_preferences = {
            'title': request.form.get('title', ''),
            'location': request.form.get('location', ''),
            'work_type': request.form.get('work_type', ''),
            'min_salary': float(request.form.get('min_salary', 0))
        }
        
        # Get personalized recommendations
        recommendations = get_personalized_recommendations(user_preferences, df)
        
        # Render recommendations
        recommendations_html = """
        <html>
        <head>
            <title>Personalized Job Recommendations</title>
            <style>
                .job-recommendation {
                    border: 1px solid #ddd;
                    margin: 10px 0;
                    padding: 10px;
                    border-radius: 5px;
                }
                .job-recommendation h3 {
                    margin-top: 0;
                    color: #333;
                }
            </style>
        </head>
        <body>
            <h2>Personalized Job Recommendations</h2>
        """
        
        if not recommendations:
            recommendations_html += "<p>No recommendations found. Try broadening your search criteria.</p>"
        else:
            for job in recommendations:
                recommendations_html += f"""
                <div class='job-recommendation'>
                    <h3>{job['title']}</h3>
                    <p><strong>Company:</strong> {job['company_name']}</p>
                    <p><strong>Location:</strong> {job['location']}</p>
                    <p><strong>Salary Range:</strong> ${job['min_salary']:,.0f} - ${job['max_salary']:,.0f}</p>
                    <p><strong>Work Type:</strong> {job['formatted_work_type']}</p>
                </div>
                """
        
        recommendations_html += """
            <br>
            <a href="/recommendations">Back to Search</a>
        </body>
        </html>
        """
        
        return recommendations_html
    
    # GET request - show recommendation form
    return """
    <html>
        <head><title>Job Recommendations</title></head>
        <body>
            <h1>Get Personalized Job Recommendations</h1>
            <form method='POST'>
                <label>Job Title:</label>
                <input type='text' name='title' placeholder='e.g., Software Engineer'><br>
                
                <label>Location:</label>
                <input type='text' name='location' placeholder='e.g., New York'><br>
                
                <label>Work Type:</label>
                <input type='text' name='work_type' placeholder='e.g., Full-time'><br>
                
                <label>Minimum Salary:</label>
                <input type='number' name='min_salary' placeholder='$0'><br>
                
                <label>Maximum Salary:</label>
                <input type='number' name='max_salary' placeholder='Unlimited'><br>
                
                <input type='submit' value='Get Recommendations'>
            </form>
        </body>
    </html>
    """

if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


# References

* Kon, A. (2024, August 19). LinkedIn job postings (2023 - 2024). Kaggle. https://www.kaggle.com/datasets/arshkon/linkedin-job-postings 
* Sarkar., R. (2023, September 4). Mastering web services with Jupyter notebook and Flask: A comprehensive guide with postman... Medium. https://blog.devops.dev/mastering-web-services-with-jupyter-notebook-and-flask-a-comprehensive-guide-with-postman-15d6a2f18d62 
* Learn HTML: Elements and structure Cheatsheet. Codecademy. (n.d.). https://www.codecademy.com/learn/learn-html/modules/learn-html-elements/cheatsheet 
* Rendering matplotlib charts in Flask. (n.d.). https://blog.pamelafox.org/2023/03/rendering-matplotlib-charts-in-flask.html 
* EdGaereEdGaere (1960, January 1). Django bytesIO to base64 string & return as JSON. Stack Overflow. https://stackoverflow.com/questions/27241996/django-bytesio-to-base64-string-return-as-json 
* Chatgpt. (n.d.). https://chatgpt.com/ 