# Tech Job Market and Salaries Analysis 

For our final project, we selected the Stack Overflow Developer Survey dataset, 
which contains detailed responses from developers worldwide regarding their job roles, 
skills, technologies used, and salary information. This dataset is particularly relevant 
to the tech industry, which is a major focus of our group, and it provides insight into 
the tech job market. It covers topics such as job roles, salary, coding activities, 
education, technology usage, and job satisfaction.<br>

Team Eyy<br>
Members:  
- Julianne Kristine D. Aban 
- Derich Andre G. Arcilla 
- Jennifer Bendoy 
- Richelle Ann C. Candidato 
- Marc Francis B. Gomolon 
- Phoebe Kae A. Plasus

##### Data Preparation

LOADING DATA SET & LIBRARIES

In [None]:
import pandas as pd
import numpy as np

# Read the public survey export into a DataFrame.
# (Alternative input, currently disabled: 'survey_results_filtered.csv')
df = pd.read_csv(filepath_or_buffer='survey_results_public.csv')

# Preview the first few rows
df.head()

In [None]:
# Widen pandas' display limits so wide survey frames are not truncated
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.max_rows = 200      # show up to 200 rows; adjust if needed


In [None]:
# Summarise every column: its name, how many values are missing, and its dtype
missing_per_column = df.isna().sum()
dtype_per_column = df.dtypes
column_info = pd.DataFrame(
    {
        'Column Name': df.columns,
        'Missing Values': missing_per_column,
        'Data Type': dtype_per_column,
    }
)
# Both Series are indexed by column name; swap that for a plain positional
# index because the names are already stored in 'Column Name'.
column_info = column_info.reset_index(drop=True)

# Show the summary table
print(column_info)

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
missing_counts = df.isnull().sum()
missing_percentage = missing_counts.div(len(df)).mul(100)

# Keep only the columns where more than half of the values are missing
is_mostly_missing = missing_percentage.gt(50)
high_missing_cols = missing_percentage.loc[is_mostly_missing]
print("Columns with more than 50% missing values:")
print(high_missing_cols)


In [None]:
# Remove the columns listed in high_missing_cols (those over 50% missing)
sparse_column_names = high_missing_cols.index
df_cleaned = df.drop(sparse_column_names, axis='columns')
print(f"Dataset shape after dropping columns: {df_cleaned.shape}")

# List the columns that survived the drop
remaining_columns = df_cleaned.columns
print(f"Remaining columns ({len(remaining_columns)}):")
print(remaining_columns)


In [None]:
# Fill missing numerical values with each column's median.
# 'number' selects every numeric dtype (not only float64/int64), so numeric
# columns stored with a narrower width are imputed as well.
numerical_cols = df_cleaned.select_dtypes(include='number').columns
df_cleaned[numerical_cols] = df_cleaned[numerical_cols].fillna(df_cleaned[numerical_cols].median())

# Fill missing categorical values with each column's mode (most frequent value).
# NOTE(review): some object columns appear to hold ';'-separated multi-select
# answers that are later one-hot encoded for association mining; mode-filling
# them assigns the most common answer to respondents who gave none, which may
# inflate co-occurrence counts — confirm this is intended.
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
categorical_modes = df_cleaned[categorical_cols].mode()
# mode() yields an empty frame when there are no object columns (or nothing
# to count); indexing its first row would raise IndexError, so skip the fill.
if not categorical_modes.empty:
    df_cleaned[categorical_cols] = df_cleaned[categorical_cols].fillna(categorical_modes.iloc[0])

# Check for missing values in numerical columns
print("Missing values in numerical columns:")
print(df_cleaned[numerical_cols].isnull().sum())

# Check for missing values in categorical columns
print("Missing values in categorical columns:")
print(df_cleaned[categorical_cols].isnull().sum())



In [None]:
# Persist the cleaned frame to disk, omitting the row index column
df_cleaned.to_csv(path_or_buf='cleaned_survey_results.csv', index=False)


Apriori Algorithm


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

# Load the cleaned dataset
df_cleaned = pd.read_csv('cleaned_survey_results.csv')

# Multi-select survey columns whose ';'-separated answers become the items
# of each transaction (update based on your data)
columns_to_encode = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'DevType'
]

# One-hot encode every listed column that exists in the data: each distinct
# ';'-separated value becomes its own 0/1 column. The pieces are collected
# first and concatenated once instead of re-copying the frame per column.
encoded_parts = [
    df_cleaned[col].str.get_dummies(sep=';')
    for col in columns_to_encode
    if col in df_cleaned.columns
]
binary_df = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame()

# Convert the binary matrix to bool type
binary_df_bool = binary_df.astype(bool)

# Apply the Apriori algorithm: keep itemsets present in at least 5% of rows
frequent_itemsets = apriori(binary_df_bool, min_support=0.05, use_colnames=True)

# mlxtend documents `num_itemsets` as the number of transactions (rows) in the
# original input data, not the number of frequent itemsets that were found.
num_itemsets = len(binary_df_bool)

# Generate association rules, keeping those with lift >= 1.0
rules = association_rules(frequent_itemsets, num_itemsets=num_itemsets, metric="lift", min_threshold=1.0)

# The cell above looks for patterns in how developers combine technologies such as programming languages, databases, and frameworks.
# Using the Apriori algorithm, it finds frequent combinations (e.g., Python with SQL) and strong associations between tools,
# which helps us understand how technologies are commonly grouped in real-world usage.

In [None]:
# Rank the rules from highest to lowest lift and keep the ten best for display
rules = rules.sort_values(by='lift', ascending=False)
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
top_rules = rules.loc[:, summary_columns].head(10)

# Cell-level conditional formatting: gold background for high values, maroon
# background for low values, and black text in every case.
def highlight_strong_tool_associations(val):
    """Return the CSS declarations used to style one table cell.

    Numeric values (int or float) above 0.7 get a dark-gold background and
    values below 0.3 get a maroon background. Anything else, including
    non-numeric values and numbers from 0.3 to 0.7, gets no background.
    The text colour is always black.
    """
    foreground = 'color: black'

    # Non-numeric cells never receive a background colour
    if not isinstance(val, (int, float)):
        return ';' + foreground

    if val > 0.7:
        background = 'background-color: #D4AF37'  # dark gold for high values
    elif val < 0.3:
        background = 'background-color: maroon'  # maroon for low values
    else:
        background = ''

    return ';'.join((background, foreground))

# Apply the cell-wise highlighter to the metric columns.
# Styler.applymap has been deprecated since pandas 2.1 in favour of Styler.map,
# so use `map` when this pandas version provides it and fall back to
# `applymap` on older releases.
styler = top_rules.style
apply_cellwise = getattr(styler, 'map', None) or styler.applymap
styled_table = apply_cellwise(highlight_strong_tool_associations, subset=['support', 'confidence', 'lift'])

# Display the styled table
styled_table


In [None]:
# Columns to analyze
employment_columns = ['Employment', 'RemoteWork', 'OrgSize']
tech_columns = [
    'LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith', 'ToolsTechHaveWorkedWith'
]

# One-hot encode the employment columns (one boolean column per distinct
# value, prefixed with the source column name)
binary_employment = pd.get_dummies(df_cleaned[employment_columns], prefix=employment_columns).astype(bool)

# One-hot encode each available technology column by splitting its
# ';'-separated values; collect the pieces and concatenate them once.
tech_parts = [
    df_cleaned[col].str.get_dummies(sep=';').astype(bool)
    for col in tech_columns
    if col in df_cleaned.columns
]
binary_tech = pd.concat(tech_parts, axis=1) if tech_parts else pd.DataFrame()

# Combine employment and tech binary data
binary_data = pd.concat([binary_employment, binary_tech], axis=1)

# Apply Apriori algorithm: keep itemsets present in at least 5% of rows
frequent_itemsets = apriori(binary_data, min_support=0.05, use_colnames=True)

# Generate association rules with lift >= 1.0.
# `num_itemsets` is the number of transactions in *this* input matrix; it is
# computed here rather than reusing a value left over from another analysis.
rules = association_rules(frequent_itemsets, num_itemsets=len(binary_data), metric="lift", min_threshold=1.0)

# Sort the rules from highest to lowest lift
rules = rules.sort_values(by='lift', ascending=False)
print("Top 10 association rules for Employment and Technology:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


# The cell above explores relationships between employment factors (e.g., job type, remote work) and the technologies developers report using.
# It highlights which technologies tend to co-occur with particular work situations (for example, whether remote workers are more often associated with tools like Docker);
# these rules describe associations rather than causes, but they still offer insight into technology trends across work environments.

In [None]:
# Order the rules by descending lift, then take the first ten and keep only
# the columns shown in the summary table
rules = rules.sort_values(by='lift', ascending=False)
top_rules = rules.head(10)[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Cell-level conditional formatting: dark gold for high values, maroon for low
# values, with black text throughout.
def highlight_employment_technology_associations(val):
    """Build the CSS declarations for a single table cell.

    An int or float greater than 0.7 is given a dark-gold background; one
    smaller than 0.3 is given a maroon background. All other inputs
    (non-numeric values, or numbers between 0.3 and 0.7 inclusive) get no
    background. Black text is applied in every case.
    """
    is_number = isinstance(val, (int, float))

    if is_number and val > 0.7:
        background = 'background-color: #D4AF37'  # dark gold marks high values
    elif is_number and val < 0.3:
        background = 'background-color: maroon'  # maroon marks low values
    else:
        background = ''  # leave the cell background untouched

    return background + ';' + 'color: black'

# Apply the cell-wise highlighter to the metric columns.
# Styler.applymap has been deprecated since pandas 2.1 in favour of Styler.map,
# so use `map` when this pandas version provides it and fall back to
# `applymap` on older releases.
styler = top_rules.style
apply_cellwise = getattr(styler, 'map', None) or styler.applymap
styled_table = apply_cellwise(highlight_employment_technology_associations, subset=['support', 'confidence', 'lift'])

# Display the styled table
styled_table
