Load and Inspect Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

# Read the cleaned survey export into a DataFrame, then print its
# column names, dtypes and non-null counts as a first sanity check.
csv_name = 'clean_kaggle_data_2024.csv'
df = pd.read_csv(csv_name)
df.info()

: 

In [None]:
# Preview the first five rows (the default for head) of the loaded frame
df.head(n=5)

Q1

In [None]:
# Coerce the compensation column to numbers; anything unparseable becomes NaN
salary_numeric = pd.to_numeric(df['ConvertedCompYearly'], errors='coerce')
df['ConvertedCompYearly'] = salary_numeric

# Plot 1: compensation spread for each remote-work arrangement
# (fliers are hidden in this figure only)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="RemoteWork",
    y="ConvertedCompYearly",
    palette="coolwarm",
    showfliers=False,
    ax=ax,
)
ax.set_yscale("log")  # salaries span several orders of magnitude
ax.set_title("Salary Distribution by Remote Work Status")
ax.set_xlabel("Remote Work Type")
ax.set_ylabel("Salary (log scale)")
plt.xticks(rotation=45)
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.show()


In [None]:
# Keep only the five countries with the most respondents so the figure stays readable
country_counts = df['Country'].value_counts()
top_countries = country_counts.nlargest(5).index
in_top_countries = df['Country'].isin(top_countries)
df_top_countries = df[in_top_countries]

# Plot 2: compensation spread within each of those countries
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df_top_countries,
    x="Country",
    y="ConvertedCompYearly",
    palette="viridis",
    ax=ax,
)
ax.set_yscale("log")  # salaries span several orders of magnitude
ax.set_title("Salary Distribution by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Salary (log scale)")
plt.xticks(rotation=45)
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.show()


In [None]:
# Keep only the five most frequently reported education levels
edu_counts = df['EdLevel'].value_counts()
top_edu_levels = edu_counts.nlargest(5).index
in_top_edu = df['EdLevel'].isin(top_edu_levels)
df_top_edu = df[in_top_edu]

# Plot 3: compensation spread for each of those education levels
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df_top_edu,
    x="EdLevel",
    y="ConvertedCompYearly",
    palette="magma",
    ax=ax,
)
ax.set_yscale("log")  # salaries span several orders of magnitude
ax.set_title("Salary Distribution by Education Level")
ax.set_xlabel("Education Level")
ax.set_ylabel("Salary (log scale)")
plt.xticks(rotation=45)
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.show()


Q2

a. Use the interquartile range (IQR) method to define the outliers and filter them out

In [None]:
# Restrict to respondents whose RemoteWork value is exactly 'Remote' or 'Hybrid';
# copy so later edits never touch the original frame.
is_remote_or_hybrid = df['RemoteWork'].isin(['Remote', 'Hybrid'])
df_filtered = df.loc[is_remote_or_hybrid].copy()

# Define a function to detect outliers
def find_outliers(group):
    """Return the rows of ``group`` whose salary falls outside the 1.5 * IQR fences.

    ``group`` is a DataFrame with a ``ConvertedCompYearly`` column. Rows whose
    salary is NaN never satisfy either comparison and are therefore not returned.
    """
    salary = group['ConvertedCompYearly']
    first_quartile = salary.quantile(0.25)
    third_quartile = salary.quantile(0.75)
    spread = third_quartile - first_quartile

    # Tukey fences: 1.5 IQRs beyond each quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    too_low = salary < low_fence
    too_high = salary > high_fence
    return group[too_low | too_high]

# Run the IQR rule separately inside each work arrangement, so each group
# is judged against its own quartiles.
remote_rows = df_filtered.loc[df_filtered['RemoteWork'].eq('Remote')]
hybrid_rows = df_filtered.loc[df_filtered['RemoteWork'].eq('Hybrid')]
outliers_remote = find_outliers(remote_rows)
outliers_hybrid = find_outliers(hybrid_rows)

# Stack both sets of flagged rows into one frame (Remote first, then Hybrid)
outliers = pd.concat([outliers_remote, outliers_hybrid])

In [None]:
# Show the ten highest-paid flagged records with a few descriptive columns
preview_columns = ['Country', 'RemoteWork', 'ConvertedCompYearly', 'DevType', 'YearsCodePro']
outliers_by_salary = outliers[preview_columns].sort_values(
    by='ConvertedCompYearly', ascending=False
)
outliers_by_salary.head(10)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the dataset.
# The path used to be the template placeholder "path_to_your_file.csv", which
# does not exist and made this cell fail on a fresh Restart & Run All. Point it
# at the same cleaned survey file the notebook reads in its first cell.
file_path = "clean_kaggle_data_2024.csv"
df = pd.read_csv(file_path)

# Convert salary column to numeric (unparseable entries become NaN)
df['ConvertedCompYearly'] = pd.to_numeric(df['ConvertedCompYearly'], errors='coerce')

# Drop rows that lack either the remote-work status or the salary
df = df.dropna(subset=['RemoteWork', 'ConvertedCompYearly'])

# Compute Q1, Q3, and IQR.
# NOTE: these quartiles are taken over every remaining respondent (all
# RemoteWork categories pooled), unlike find_outliers above, which computes
# them separately for each group.
Q1 = df['ConvertedCompYearly'].quantile(0.25)
Q3 = df['ConvertedCompYearly'].quantile(0.75)
IQR = Q3 - Q1

# Define lower and upper bounds for outliers
lower_bound = max(0, Q1 - 1.5 * IQR)  # Salary cannot be negative
upper_bound = Q3 + 1.5 * IQR

# Create two datasets: one with outliers and one without
within_bounds = (df['ConvertedCompYearly'] >= lower_bound) & (df['ConvertedCompYearly'] <= upper_bound)
df_no_outliers = df[within_bounds]
df_with_outliers = df.copy()  # Unfiltered copy (still has the NaN rows dropped above)

# Extract salary data for remote and hybrid workers (without removing outliers)
remote_salaries_all = df_with_outliers.loc[df_with_outliers["RemoteWork"] == "Remote", "ConvertedCompYearly"]
hybrid_salaries_all = df_with_outliers.loc[df_with_outliers["RemoteWork"] == "Hybrid", "ConvertedCompYearly"]

# Extract salary data for remote and hybrid workers (with IQR-based outlier removal)
remote_salaries_iqr = df_no_outliers.loc[df_no_outliers["RemoteWork"] == "Remote", "ConvertedCompYearly"]
hybrid_salaries_iqr = df_no_outliers.loc[df_no_outliers["RemoteWork"] == "Hybrid", "ConvertedCompYearly"]
def manual_t_test(group1, group2, equal_var=True):
    """Compute a two-sample t-statistic and its degrees of freedom by hand.

    Parameters
    ----------
    group1, group2 : sequence of numbers (list, NumPy array or pandas Series)
        The two independent samples to compare.
    equal_var : bool, default True
        If True, use Student's t-test with a pooled variance estimate
        (the original behaviour). If False, use Welch's t-test, which does
        not assume the two groups share a variance.

    Returns
    -------
    t_stat : float
        (mean of group1 - mean of group2) divided by the standard error of
        the difference.
    dof : int or float
        Degrees of freedom: ``n1 + n2 - 2`` for the pooled test, or the
        Welch-Satterthwaite approximation (generally non-integer) otherwise.

    Raises
    ------
    ValueError
        If either group has fewer than two observations; the sample variance
        is undefined in that case and the result would silently be NaN.
    """
    # Sample sizes
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        raise ValueError(
            f"each group needs at least 2 observations, got {n1} and {n2}"
        )

    # Sample means
    mean1, mean2 = np.mean(group1), np.mean(group2)

    # Sample variances (unbiased estimator: ddof=1)
    var1, var2 = np.var(group1, ddof=1), np.var(group2, ddof=1)

    if equal_var:
        # Student's t-test: pool both variances, weighted by their own dof.
        # Named `dof` rather than `df` so it is not mistaken for the notebook's DataFrame.
        dof = n1 + n2 - 2
        pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / dof)
        se_diff = pooled_std * np.sqrt(1 / n1 + 1 / n2)
    else:
        # Welch's t-test: per-group variance of the mean, with
        # Welch-Satterthwaite degrees of freedom.
        var_mean1, var_mean2 = var1 / n1, var2 / n2
        se_diff = np.sqrt(var_mean1 + var_mean2)
        dof = (var_mean1 + var_mean2) ** 2 / (
            var_mean1 ** 2 / (n1 - 1) + var_mean2 ** 2 / (n2 - 1)
        )

    # t-statistic: difference in means measured in standard errors
    t_stat = (mean1 - mean2) / se_diff

    return t_stat, dof

# Hand-rolled t-test, once on the full data and once on the IQR-trimmed data
manual_all = manual_t_test(remote_salaries_all, hybrid_salaries_all)  # With outliers
manual_iqr = manual_t_test(remote_salaries_iqr, hybrid_salaries_iqr)  # Without outliers
t_stat_all, df_all = manual_all
t_stat_iqr, df_iqr = manual_iqr

# Reference results from scipy's pooled-variance two-sample t-test
scipy_all = stats.ttest_ind(remote_salaries_all, hybrid_salaries_all, equal_var=True)
scipy_iqr = stats.ttest_ind(remote_salaries_iqr, hybrid_salaries_iqr, equal_var=True)
t_stat_scipy_all, p_value_scipy_all = scipy_all
t_stat_scipy_iqr, p_value_scipy_iqr = scipy_iqr

# Report manual and scipy figures side by side for each scenario
print("Manual t-test (With Outliers): t =", t_stat_all, ", df =", df_all)
print("Scipy t-test (With Outliers): t =", t_stat_scipy_all, ", p-value =", p_value_scipy_all)

print("\nManual t-test (Without Outliers - IQR): t =", t_stat_iqr, ", df =", df_iqr)
print("Scipy t-test (Without Outliers - IQR): t =", t_stat_scipy_iqr, ", p-value =", p_value_scipy_iqr)
