In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Load the dataset
df = pd.read_csv(r"C:\Users\fayee\Downloads\software_engineers_gender_race_education_salary.csv")

# Strip spaces from column names (if needed)
df.columns = df.columns.str.strip()

# Part 1: Display the first few rows of the dataset
print("Dataset Overview:")
print(df.head())
print("\nDataset Summary:")
print(df.describe(include='all'))  # Summary statistics for numerical and categorical columns

# Part 2: Analyze Variance Inflation Factor (VIF)
# Create dummy variables for categorical predictors
df_encoded = pd.get_dummies(df, columns=['gender', 'Race', 'Education'], drop_first=True)

# Drop the target variable to calculate VIF
X = df_encoded.drop(columns=['totalyearlycompensation'])
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("\nVariance Inflation Factor (VIF):")
print(vif_data)

# Part 3: Build the Multiple Linear Regression Model
y = df_encoded['totalyearlycompensation']
X = sm.add_constant(X)  # Add constant for the intercept
model = sm.OLS(y, X).fit()
print("\nMultiple Linear Regression Summary:")
print(model.summary())


Dataset Overview:
  gender         Race          Education  totalyearlycompensation
0   Male        Asian                PhD                   400000
1   Male  Two Or More  Bachelor's Degree                   136000
2   Male        Asian  Bachelor's Degree                   337000
3   Male        Asian    Master's Degree                   222000
4   Male        White  Bachelor's Degree                   187000

Dataset Summary:
       gender   Race          Education  totalyearlycompensation
count   13661  13661              13661             1.366100e+04
unique      3      5                  5                      NaN
top      Male  Asian  Bachelor's Degree                      NaN
freq    11791   7642               7484                      NaN
mean      NaN    NaN                NaN             1.866836e+05
std       NaN    NaN                NaN             1.160668e+05
min       NaN    NaN                NaN             1.000000e+04
25%       NaN    NaN                NaN         

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [14]:
import pandas as pd
import statsmodels.api as sm

# Load your dataset from the CSV file
df = pd.read_csv(r"C:\Users\fayee\Downloads\software_engineers_gender_race_education_salary.csv")

# Convert categorical variables to numeric using one-hot encoding
df_encoded = pd.get_dummies(df, columns=['gender', 'Race', 'Education'], drop_first=True)

# Ensure all boolean columns are converted to integers (0 or 1)
df_encoded = df_encoded.astype(int)

# Check the columns and data types after encoding
print(f"Columns after encoding:\n{df_encoded.dtypes}")

# Separate features (X) and target (y)
X = df_encoded.drop(columns=['totalyearlycompensation'])
y = df_encoded['totalyearlycompensation']

# Add a constant to X (for the intercept in the regression model)
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Print the model summary
print(model.summary())


Columns after encoding:
totalyearlycompensation      int32
gender_Male                  int32
gender_Other                 int32
Race_Black                   int32
Race_Hispanic                int32
Race_Two Or More             int32
Race_White                   int32
Education_Highschool         int32
Education_Master's Degree    int32
Education_PhD                int32
Education_Some College       int32
dtype: object
                               OLS Regression Results                              
Dep. Variable:     totalyearlycompensation   R-squared:                       0.061
Model:                                 OLS   Adj. R-squared:                  0.060
Method:                      Least Squares   F-statistic:                     88.68
Date:                     Mon, 25 Nov 2024   Prob (F-statistic):          3.39e-178
Time:                             11:49:56   Log-Likelihood:            -1.7827e+05
No. Observations:                    13661   AIC:                        