In [46]:
pip install -r requirements.txt




In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
import lxml

In [30]:
# Load the saved datasets from the CSV files
X_train = pd.read_csv("../data/X_train.csv", index_col=0)  # Use the first column as index
y_train = pd.read_csv("../data/y_train.csv", index_col=0)  # Use the first column as index
X_test = pd.read_csv("../data/X_test.csv", index_col=0)    # Use the first column as index
y_test = pd.read_csv("../data/y_test.csv", index_col=0)    # Use the first column as index

# Display the first few rows of the loaded datasets
(X_train.head(), y_train.head(), X_test.head(), y_test.head())

(   age         workclass  fnlwgt  education  education-num  \
 0   39         State-gov   77516  Bachelors             13   
 1   50  Self-emp-not-inc   83311  Bachelors             13   
 2   38           Private  215646    HS-grad              9   
 3   53           Private  234721       11th              7   
 4   28           Private  338409  Bachelors             13   
 
        marital-status         occupation   relationship   race     sex  \
 0       Never-married       Adm-clerical  Not-in-family  White    Male   
 1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
 2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
 3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
 4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   
 
    capital-gain  capital-loss  hours-per-week native-country  
 0          2174             0              40  United-States  
 1             0             0          

In [31]:
# For running linear regression we need all of the features to be numbers, but currently we have:
(X_train.dtypes,X_test.dtypes)

(age                int64
 workclass         object
 fnlwgt             int64
 education         object
 education-num      int64
 marital-status    object
 occupation        object
 relationship      object
 race              object
 sex               object
 capital-gain       int64
 capital-loss       int64
 hours-per-week     int64
 native-country    object
 dtype: object,
 age                int64
 workclass         object
 fnlwgt             int64
 education         object
 education-num      int64
 marital-status    object
 occupation        object
 relationship      object
 race              object
 sex               object
 capital-gain       int64
 capital-loss       int64
 hours-per-week     int64
 native-country    object
 dtype: object)

In [32]:
# We thus want to factorize the object variables. To do this to both the test and the train data whilst ensuring the same key is used
# for both, we briefly concatinate the dataframes before splitting them back up
X_all = pd.concat({'X_train':X_train, 'X_test':X_test})

# Then we factorize
objects = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
keys = [0]*len(objects)

for i in range(len(objects)):
    X_all[objects[i]], keys[i] = pd.factorize(X_all[objects[i]])

# Then we return the original test train split
X_train = X_all.loc['X_train']
X_test = X_all.loc['X_test']

(X_train.dtypes,X_test.dtypes)

(age               int64
 workclass         int64
 fnlwgt            int64
 education         int64
 education-num     int64
 marital-status    int64
 occupation        int64
 relationship      int64
 race              int64
 sex               int64
 capital-gain      int64
 capital-loss      int64
 hours-per-week    int64
 native-country    int64
 dtype: object,
 age               int64
 workclass         int64
 fnlwgt            int64
 education         int64
 education-num     int64
 marital-status    int64
 occupation        int64
 relationship      int64
 race              int64
 sex               int64
 capital-gain      int64
 capital-loss      int64
 hours-per-week    int64
 native-country    int64
 dtype: object)

In [33]:
# We thus want to factorize the object variables. To do this to both the test and the train data whilst ensuring the same key is used
# for both, we briefly concatinate the dataframes before splitting them back up
y_all = pd.concat({'y_train':y_train, 'y_test':y_test})

# Then we factorize
y_all['income'], income_key = pd.factorize(y_all['income'])

# Then we return the original test train split
y_train = y_all.loc['y_train']
y_test = y_all.loc['y_test']

(y_train.dtypes,y_test.dtypes)

(income    int64
 dtype: object,
 income    int64
 dtype: object)

# SciKit-Learn

In [36]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train,y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# The coefficients
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [[ 5.94824684e-03  2.65733090e-02  5.99485588e-08  4.32411396e-03
   4.38794560e-02 -1.38287997e-02 -5.53817481e-03 -4.43602734e-03
  -1.79979025e-02 -1.33931116e-01  8.94204364e-06  1.07873453e-04
   3.95492040e-03  2.16056359e-05]]
Mean squared error: 0.14
Coefficient of determination: 0.26


# ISLP

In [58]:
model = sm.OLS(y_train, X_train)
results = model.fit()
summarize(results)

Unnamed: 0,coef,std err,t,P>|t|
age,0.0029,0.0,20.028,0.0
workclass,0.0234,0.002,14.281,0.0
fnlwgt,-1.774e-07,1.72e-08,-10.34,0.0
education,-0.0024,0.001,-4.348,0.0
education-num,0.0227,0.001,35.536,0.0
marital-status,-0.0072,0.002,-4.232,0.0
occupation,-0.0128,0.001,-21.913,0.0
relationship,-0.0248,0.001,-18.489,0.0
race,-0.0242,0.003,-7.807,0.0
sex,-0.1745,0.004,-40.094,0.0


In [59]:
?summarize

[1;31mSignature:[0m [0msummarize[0m[1;33m([0m[0mresults[0m[1;33m,[0m [0mconf_int[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mlevel[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Take a fit statsmodels and summarize it
by returning the usual coefficient estimates,
their standard errors, the usual test
statistics and P-values as well as 
(optionally) confidence intervals.

Based on:

https://stackoverflow.com/questions/51734180/converting-statsmodels-summary-object-to-pandas-dataframe

Parameters
----------

results : a results object

conf_int : bool (optional)
    Include 95% confidence intervals?
[1;31mFile:[0m      c:\users\sheri\appdata\local\programs\python\python312\lib\site-packages\islp\models\__init__.py
[1;31mType:[0m      function