In [None]:
# ==================================================
# ============= 03 Linear Regression 1 =============
# ==================================================

# Load the necessary functionality packages
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Import data
# The CSV location is machine-specific, so it can be overridden without editing
# this cell: set the TOYOTA_COROLLA_CSV environment variable to the full path of
# your copy of ToyotaCorolla.csv. When the variable is not set, the original
# default path below is used unchanged.
# REMEMBER to check the file path (or the environment variable) every time you import data
# NOTE: use '\\' to replace '\' when writing a file path inside a Python string
DEFAULT_TOYOTA_COROLLA_CSV = 'C:\\Users\\hgaop\\Documents\\hgao2_OU\\Courses\\MIS-4560_5560 Data analytics\\Textbook\\Python_R-dmba-datasets\\dmba\\ToyotaCorolla.csv'
ToyotaCorolla_csv_path = os.environ.get('TOYOTA_COROLLA_CSV', DEFAULT_TOYOTA_COROLLA_CSV)
ToyotaCorolla_df = pd.read_csv(ToyotaCorolla_csv_path)

In [None]:
# Explore the Data Frame (the basics)

## Check the dimensions of the data frame
## (.shape is a (number of rows, number of columns) tuple; being the last
## expression in the cell, it is displayed as the cell output)
ToyotaCorolla_df.shape

In [None]:
## Preview the top of the data frame (the first 5 rows)
ToyotaCorolla_df.head(n=5)

In [None]:
## Show every column name as a plain Python list
print(list(ToyotaCorolla_df.columns))

In [None]:
## Collect the distinct values that occur in the 'Fuel_Type' column
unique_fuel_types = ToyotaCorolla_df.loc[:, 'Fuel_Type'].unique()

## Show the distinct values
print(unique_fuel_types)

In [None]:
## Count the missing entries in each column of the ToyotaCorolla_df DataFrame
missing_values = ToyotaCorolla_df.isna().sum(axis=0)

## Show the per-column count of missing values
print(missing_values)

In [None]:
# Fit a Linear Regression Model to the Training Data

## Keep only the first 1000 rows of the data frame, then name the columns of
## interest: the outcome variable and the predictor variables
ToyotaCorolla_df = ToyotaCorolla_df.head(1000)
outcome = 'Price'
predictors = [
    'Age_08_04',
    'KM',
    'Fuel_Type',
    'HP',
    'Met_Color',
    'Automatic',
    'CC',
    'Doors',
    'Quarterly_Tax',
    'Weight',
]

In [None]:
## Define the predictor and outcome variables
# Convert non-numerical categorical variables (e.g. Fuel_Type) to numerical dummies.
# drop_first=True leaves out the first level of each categorical variable
# (Fuel_Type_CNG when CNG is present), so the dummies are not perfectly
# collinear with the intercept (the multicollinearity issue).
# Unlike dropping 'Fuel_Type_CNG' by name with errors='ignore', a reference
# level is still removed when CNG happens to be absent from the selected rows.
X = pd.get_dummies(ToyotaCorolla_df[predictors], drop_first=True)
y = ToyotaCorolla_df[outcome]

In [None]:
## Partition Data into Training and Validation Data Frames with a 60:40 Split
np.random.seed(1)  # Set seed for reproducibility
# test_size=0.4 puts 40% of the data in the validation data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)


def _bool_to_int(column):
    """Return a boolean column cast to int; any other column is returned unchanged."""
    if column.dtype == 'bool':
        return column.astype(int)
    return column


# Convert boolean variables to numeric variables in both partitions
train_X = train_X.apply(_bool_to_int)
valid_X = valid_X.apply(_bool_to_int)

In [None]:
## Fit a Linear Regression Model using the LinearRegression function from the sklearn.linear_model library
ToyotaCorolla_lm = LinearRegression()
ToyotaCorolla_lm.fit(X=train_X, y=train_y)

## Print the Model Summary

### sm.OLS does not add an intercept on its own, so add a constant column to the features first
train_X_intercept = sm.add_constant(train_X)

### Fit the linear model again with the OLS function from the statsmodels library
### (endog = outcome, exog = features including the constant)
model = sm.OLS(endog=train_y, exog=train_X_intercept).fit()

### Print the regression summary table
print(model.summary())

### Interpret the linear model for your audience.

In [None]:
### If you only need the intercept and coefficients, uncomment the lines below; first, print the intercept
# print("Intercept:", ToyotaCorolla_lm.intercept_)

# Then, print coefficients for each feature
# print("\nCoefficients:")
# for feature, coef in zip(train_X.columns, ToyotaCorolla_lm.coef_):
#    print(f"{feature}: {coef}")

In [None]:
# Validate the linear model

## Use predict() to score the validation data, then line up each prediction
## with the actual value and the residual (actual minus predicted)
ToyotaCorolla_lm_pred = ToyotaCorolla_lm.predict(valid_X)
ToyotaCorolla_result = pd.DataFrame(
    {
        'Predicted': ToyotaCorolla_lm_pred,
        'Actual': valid_y,
        'Residual': valid_y.sub(ToyotaCorolla_lm_pred),
    }
)
print(ToyotaCorolla_result.head(n=20))


In [None]:
# Residual Analysis
## Summary statistics of the residuals
### Absolute_Residual is the size of each residual regardless of its sign
ToyotaCorolla_result['Absolute_Residual'] = ToyotaCorolla_result['Residual'].abs()
### percent_error is the error rate: the absolute residual as a percentage of the actual value
ToyotaCorolla_result['percent_error'] = (
    ToyotaCorolla_result['Absolute_Residual']
    .div(ToyotaCorolla_result['Actual'])
    .mul(100)
)
### Run the basic descriptive statistics, rounded to 3 decimal places
residual_stats = (
    ToyotaCorolla_result.loc[:, ['Residual', 'percent_error']]
    .describe()
    .round(decimals=3)
)
print(residual_stats)

In [None]:
## Plot the residuals
### Import the necessary functionality packages
import matplotlib.pyplot as plt
import seaborn as sns

### Box plot of residuals (drawn on an explicitly created 3x4 inch figure/axes pair)
fig, ax = plt.subplots(figsize=(3, 4))
sns.boxplot(y=ToyotaCorolla_result['Residual'], ax=ax)
ax.set_title('Box Plot of Residuals')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
### Histogram of residuals (20 bins, with a kernel density estimate overlaid)
fig, ax = plt.subplots(figsize=(4, 3))
sns.histplot(ToyotaCorolla_result['Residual'], kde=True, bins=20, ax=ax)
ax.set_title('Histogram of Residuals')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
### Scatter plot of predicted vs residuals to check for patterns
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(
    ToyotaCorolla_result['Predicted'],
    ToyotaCorolla_result['Residual'],
    alpha=0.6,
    color='blue',
)
ax.set_title('Scatter Plot of Predicted vs Residuals')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.axhline(y=0, color='red', linestyle='--')  # Horizontal reference line at residual = 0
plt.show()