In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
  ~ ~ ~ ~ ~ ~ ~ ~ ~ ~    Salaries    ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
           OLS regression model and Elastic Nets
'''
import pandas as pd
# Load the workshop data set from an Excel workbook into `df`.
# The path is intentionally left empty: insert the .xlsx file name before running.
df = pd.read_excel("") # <---------------------------------------------- fill here .xlsx

# Preview the first five rows of the loaded table.
print(df.head(n=5))

# Data dictionary
#   age        : Age in years
#   education  : Level of education
#                  (1) did not complete high school
#                  (2) high school degree
#                  (3) some college
#                  (4) college degree
#                  (5) postundergraduate degree
#   employears : Years with current employer
#   address    : Years at current address
#   salary     : Salary in thousands
#   creddebt   : Credit card debt in thousands
#   othdebt    : Other debt in thousands
#   default    : Credit default


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatterplot matrix (histograms on the diagonal) for the variables of interest:
# salary, years of employment, and age.
# The empty string below is a placeholder for a column name of `df`.
scatter_columns = ['', 'employears'] # <-------------------- fill here
df_scatter = df[scatter_columns]

# One panel per pair of selected columns; the title is lifted slightly above the grid.
sns.pairplot(data=df_scatter, diag_kind='hist')
plt.suptitle(t="Scatterplot Matrix", y=1.02)
plt.show()

In [None]:
# Boxplot of salary by education level.

# Ensure 'education' is treated as a categorical variable so that each level
# gets its own box on the x-axis. Note: this recasts the column on the shared `df`.
df['education'] = df['education'].astype('category')

# Keep only the two plotted columns and drop rows where either value is missing
# (the same complete-case handling used by the regression cells).
df_box = df[['salary', 'education']].dropna()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_box, x='education', y='salary', ax=ax)
ax.set_title("Boxplot of Income by Education Level")
ax.set_xlabel("Education Level")
ax.set_ylabel("Income")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# OLS regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

# OLS on the full sample.
# Keep the modelling columns, discard incomplete rows, and store 'education'
# as a categorical column (a no-op when it already is one).
df_reg = (
    df[['salary', 'education', 'employears', 'age']]
    .dropna()
    .assign(education=lambda frame: frame['education'].astype('category'))
)

# Fit the regression with the statsmodels formula interface.
# The '...' terms in the formula are placeholders for the remaining regressors.
model = smf.ols(
    formula='salary ~ C(education) + ... + ... ', # <-------------------- fill here
    data=df_reg,
).fit()

# Render the fitted-model summary as plain text and print it.
model_summary = model.summary()
model_summary_text = model_summary.as_text()
print(model_summary_text)


In [None]:
# ---------------------------------------------------
#    OLS with Machine learning logic (train/test)
# ---------------------------------------------------
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as smf  # Added import for statsmodels formula API

# Keep the modelling columns and drop rows with missing values
df_reg = df[['salary', 'education', 'employears', 'age']].dropna()

# Convert 'education' to categorical
df_reg['education'] = df_reg['education'].astype('category')

# Share of the observations held out for testing.
# Replace the `...` placeholder with a value (e.g. a fraction between 0 and 1)
# before running; train_test_split rejects the placeholder itself.
test_size = ... # <-------------------- fill here

# Split into train and test sets (fixed seed so the split is reproducible)
train_df, test_df = train_test_split(df_reg, test_size=test_size, random_state=666)

# Fit the model on the training data only
model = smf.ols('salary ~ C(education) + employears + age', data=train_df).fit()

# Predict on the test set.
# assign() returns a new frame rather than writing a column into the object
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
test_df = test_df.assign(predicted_salary=model.predict(test_df))

# Evaluate out-of-sample fit: root mean squared error and R² on the held-out rows
rmse = np.sqrt(mean_squared_error(test_df['salary'], test_df['predicted_salary']))
r2 = r2_score(test_df['salary'], test_df['predicted_salary'])

# Output: in-sample summary of the fitted model, then the test-set metrics
print(model.summary())
print(f"\nTest RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")


In [None]:
# ---------------------------------------------------
#       Elastic nets regression for salaries
# ---------------------------------------------------
from sklearn.linear_model import ElasticNet

# Keep the modelling columns and drop rows with missing values
df_reg = df[['salary', 'education', 'employears', 'age']].dropna()

# Convert the categorical variable to dummy columns; drop_first=True omits the
# first level so it acts as the reference category
df_reg = pd.get_dummies(df_reg, columns=['education'], drop_first=True)

# Split features and target
X = df_reg.drop(columns='salary')
y = df_reg['salary']

# Values to choose before running: replace each `...` placeholder.
# scikit-learn rejects the placeholder itself, so the cell fails with a
# parameter error (not a syntax error) until all three are filled in.
test_size = ... # <-------------------- fill here (share of rows held out for testing)
# Ridge: l1_ratio = 0
# Lasso: l1_ratio = 1
l1_ratio = ... # <-------------------- fill here
enet_random_state = ... # <-------------------- fill here

# Split into train and test sets.
# The seed matches the one used for the OLS train/test split, so the split is
# reproducible; with the same test_size both models are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=666
)

# Fit ElasticNet model
# NOTE(review): the features are not standardized here; the penalty is sensitive
# to feature scale — consider scaling before fitting.
model = ElasticNet(alpha=0.1, l1_ratio=l1_ratio, random_state=enet_random_state)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Mimic a statsmodels-like summary: one row per coefficient, intercept first
results_df = pd.DataFrame({
    'Variable': ['Intercept'] + list(X.columns),
    'Coefficient': [model.intercept_] + list(model.coef_)
})

# Output
print("\nElasticNet Regression Results:\n")
print(results_df.to_string(index=False))
print(f"\nTest RMSE: {rmse:.2f}")
print(f"Test R²: {r2:.3f}")
