In [None]:
#Problem 1-----------------------------------------------------------------------------------------------------------------

#Import libraries

import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np

In [None]:
# (a)
# Load the raw data once; both missing-value strategies below start from `boo`.
boo = pd.read_csv("boo.csv")
predictor_cols = [f"x{k}" for k in range(1, 7)]  # x1 ... x6

# Strategy 1 -- listwise deletion: drop every row with at least one missing
# value (per the original author's note this affects < 1% of rows).
boo_clean = boo.dropna()
print(f"Original rows: {boo.shape[0]}")
print(f"Cleaned rows: {boo_clean.shape[0]}")

y = boo_clean['y']
X = boo_clean[predictor_cols]
X_with_intercept = sm.add_constant(X)  # prepend the intercept column

model = sm.OLS(y, X_with_intercept).fit()
print(model.summary())


# Strategy 2 -- mean imputation: replace each missing value with its column
# mean and refit the same specification for comparison.
column_means = boo.mean()
boo_fillednas = boo.fillna(column_means)
y_fillednas = boo_fillednas['y']
X_fillednas = boo_fillednas[predictor_cols]
X_fillednas_with_intercept = sm.add_constant(X_fillednas)  # prepend the intercept column

model_fillednas = sm.OLS(y_fillednas, X_fillednas_with_intercept).fit()
print(model_fillednas.summary())


In [None]:
# (b)
# Internally studentized residuals of the complete-case model fitted in part (a)
influence = model.get_influence()
studentized_residuals = influence.resid_studentized_internal

# Evaluate the standard normal density over the observed residual range
normdis_range = np.linspace(studentized_residuals.min(), studentized_residuals.max(), 100)
normdis = norm.pdf(normdis_range, 0, 1)

# Overlay the N(0, 1) density on the (density-normalised) residual histogram
plt.hist(studentized_residuals, density=True, label='Histogram of Studentized Residuals', bins=15, edgecolor='black')
plt.plot(normdis_range, normdis, 'r-', lw=2, label='Standard Normal Density')

plt.xlabel('Studentized Residuals')
plt.ylabel('Density')
plt.title('Studentized Residuals vs. Standard Normal Density')
# The labels passed above are only rendered once the legend is requested
plt.legend()
# Finish this figure explicitly so later plotting cells start on fresh axes
plt.show()


In [None]:
# (c)
# Residuals-vs-fitted scatter for the part (a) model, drawn on the current axes.
fitted_values, residuals = model.fittedvalues, model.resid

ax = plt.gca()
ax.scatter(fitted_values, residuals, s=3, color='black')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
plt.show()


In [None]:
# Problem 2 ----------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.nonparametric.smoothers_lowess import lowess

def plot_diagnostics(results, X, y):
    """Draw a 2x2 grid of regression diagnostic plots for a fitted OLS model.

    Panels:
      1. Residuals vs fitted values, with a LOWESS trend line.
      2. Normal Q-Q plot of the Pearson residuals.
      3. Scale-location: sqrt(|standardized residual|) vs fitted values,
         with a LOWESS trend line.
      4. Standardized residuals vs leverage, with Cook's distance = 1 contours.

    "Standardized" residuals here are the internally studentized residuals
    from ``OLSInfluence``; the same quantity is used in panels 3 and 4 so the
    Cook's distance contours (which are defined in terms of it) line up with
    the plotted points.

    Parameters
    ----------
    results : fitted statsmodels regression results
        Must provide ``fittedvalues``, ``resid``, ``resid_pearson`` and
        ``params`` and be accepted by ``OLSInfluence``.
    X, y : unused
        Not read by this function; kept so existing three-argument calls
        continue to work.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Build the influence measures once and reuse them in every panel
    influence = OLSInfluence(results)
    fitted = results.fittedvalues
    leverage = influence.hat_matrix_diag
    standardized_resid = influence.resid_studentized_internal
    sqrt_abs_resid = np.sqrt(np.abs(standardized_resid))

    fig, axes = plt.subplots(2, 2, figsize=(10, 10))

    # 1. Residuals vs Fitted
    ax = axes[0, 0]
    ax.scatter(fitted, results.resid, edgecolors='k', facecolors='none')
    ax.set_xlabel('Fitted values')
    ax.set_ylabel('Residuals')
    ax.set_title('Residuals vs Fitted')
    # lowess returns rows of (x, smoothed y) sorted by x
    smooth_resid = lowess(results.resid, fitted)
    ax.plot(smooth_resid[:, 0], smooth_resid[:, 1], color='r', lw=2)

    # 2. Normal Q-Q
    sm.qqplot(results.resid_pearson, line='45', fit=True, ax=axes[0, 1])
    axes[0, 1].set_title('Normal Q-Q')

    # 3. Scale-Location
    ax = axes[1, 0]
    ax.scatter(fitted, sqrt_abs_resid, edgecolors='k', facecolors='none')
    ax.set_xlabel('Fitted values')
    ax.set_ylabel(r'$\sqrt{|Standardized residuals|}$')
    ax.set_title('Scale-Location')
    smooth = lowess(sqrt_abs_resid, fitted)
    ax.plot(smooth[:, 0], smooth[:, 1], color='r', lw=2)

    # 4. Residuals vs Leverage
    ax = axes[1, 1]
    ax.scatter(leverage, standardized_resid, edgecolors='k', facecolors='none')
    ax.set_xlabel('Leverage')
    ax.set_ylabel('Standardized residuals')
    ax.set_title('Residuals vs Leverage')
    ax.axhline(y=0, color='r', linestyle='--')

    # Remember the data-driven y-range: the contours below diverge as
    # leverage approaches 0 and would otherwise squash the points flat.
    y_limits = ax.get_ylim()

    # Cook's distance D = r^2 / p * h / (1 - h); solving D = 1 for the
    # standardized residual r gives r = +/- sqrt(p * (1 - h) / h).
    cooksx = np.linspace(0.001, max(leverage), 100)
    p = len(results.params)
    poscooks = np.sqrt((p * (1 - cooksx)) / cooksx)
    negcooks = -poscooks

    ax.plot(cooksx, poscooks, 'r--', lw=1, label="Cook's distance = 1")
    ax.plot(cooksx, negcooks, 'r--', lw=1)
    ax.set_ylim(y_limits)
    # A legend stays readable even when the contours fall outside the view
    ax.legend(loc='lower left')

    plt.tight_layout()
    plt.show()

nutrition = pd.read_csv('nutrition.csv')
# Display summary statistics
summary = nutrition.describe()
print(summary)


# --- Model 1: simple linear regression of woh on age (statsmodels OLS) ---
X = nutrition['age']  # Predictor variable
y = nutrition['woh']  # Response variable
# Add the intercept column and give it a descriptive name (rename returns a
# new frame, so re-running this cell is safe)
X_with_intercept = sm.add_constant(X).rename(columns={'const': 'WOH_intercept'})
# Create and fit the model
initial_model = sm.OLS(y, X_with_intercept).fit()

plot_diagnostics(initial_model, X_with_intercept, y)

# --- Model 2: quadratic in age ---
# Create the new variable age^2
nutrition['age2'] = nutrition['age'] ** 2
# Predictors: age and age^2
X_new = nutrition[['age', 'age2']]
# Add a constant to the predictor variables (intercept term)
X_new_with_intercept = sm.add_constant(X_new)
# Create and fit the MLR model using OLS
model_new = sm.OLS(y, X_new_with_intercept).fit()
plot_diagnostics(model_new, X_new_with_intercept, y)

# --- Model 3: separate intercept and slope for the first rows (GROUP) ---
# The indicator is 1 for the first 7 rows and 0 for all remaining rows.
# NOTE(review): this is positional, so it relies on `nutrition` already being
# sorted by age (as the original author states) -- verify against the data.
nutrition['group'] = [1 if i < 7 else 0 for i in range(len(nutrition))]
# Interaction term: lets the age slope differ between the two groups
nutrition['age_group'] = nutrition['age'] * nutrition['group']
# Predictors: age, group indicator, and their interaction
X_with_interaction = nutrition[['age', 'group', 'age_group']]
# Add a constant to the predictor variables (intercept term)
X_interaction_with_intercept = sm.add_constant(X_with_interaction)
# Create and fit the interaction model using OLS
model_interaction = sm.OLS(y, X_interaction_with_intercept).fit()
# Diagnostic plots for the interaction model
plot_diagnostics(model_interaction, X_interaction_with_intercept, y)

In [None]:
#Problem 3 -----------------------------------------------------------------------------------------------------------------

#Import libraries

import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [None]:
# Problem 3 data: read the cheese data set, preview it, and summarise it
cheese = pd.read_csv('cheese.csv')
summary = cheese.describe()

print(cheese.head())  # first five rows
print(summary)        # descriptive statistics from describe()

In [None]:
# Part A
# Define predictor and response variables
display = cheese['disp']   # Predictor variable (in-store display indicator)
sales = cheese['vol']      # Response variable (sales volume)
log_sales = np.log(sales)  # Log-transformed sales volume

# Design matrix: intercept plus the display indicator as a numeric column.
# sm.OLS does not expand a pandas `category` column into dummies, so the
# indicator is passed as plain floats; `rename` returns a new frame instead of
# mutating in place, which keeps this cell safe to re-run.
X_with_intercept = (
    sm.add_constant(display.astype(float))
    .rename(columns={'const': 'sales_intercept', 'disp': 'display_in_store'})
)

# Create and fit the model
model = sm.OLS(log_sales, X_with_intercept).fit()

# Print the summary of the model
print(model.summary())

# Plot residuals to check for patterns
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(model.fittedvalues, model.resid)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Fitted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

# Box plot comparing log sales for the two groups (with and without in-store displays)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=display, y=log_sales, ax=ax)
ax.set_xlabel('In-Store Display (0 = No, 1 = Yes)')
ax.set_ylabel('Log Sales Volume')
ax.set_title('Box Plot of Log Sales Volume by In-Store Display')
plt.show()

In [None]:
# Part b
# Define predictor and response variables
display = cheese['disp']  # Predictor variable (in-store display indicator)
sales = cheese['vol']     # Response variable (sales volume)
ln_sales = np.log(sales)  # Log-transformed sales volume
price = cheese['price']
ln_price = np.log(price)  # Log-transformed price

# Interaction term: lets the log-price slope differ when a display is present
price_display_interaction = display * ln_price

# New predictor variables
X = pd.DataFrame({
    'ln_price': ln_price,
    'display': display,
    'price_display_interaction': price_display_interaction
})

# Add a constant to the predictor variables (intercept term)
X_with_intercept = sm.add_constant(X)

# Create and fit the model. The response is the `ln_sales` defined in this
# cell, so the cell no longer depends on a variable left over from Part A.
model = sm.OLS(ln_sales, X_with_intercept).fit()

# Print the summary of the model to see the results
print(model.summary())

In [None]:
#Problem 4-----------------------------------------------------------------------------------------------------------------

#Import libraries

import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import os
from statsmodels.stats.diagnostic import het_breuschpagan, linear_rainbow
from statsmodels.stats.stattools import durbin_watson
from scipy.stats import shapiro

In [None]:
# Read the mutual-fund returns file and preview it
stock_data = pd.read_csv("mfunds.csv")
preview = stock_data.head()
print(preview)

In [None]:
# Part (a): pull out the series, form excess returns, and fit the CAPM regression
data = stock_data
windsor, valmrkt, tbill = data['windsor'], data['valmrkt'], data['tbill']

# Excess return = raw return minus the T-bill (risk-free) rate
windsor_excess = windsor - tbill
valmrkt_excess = valmrkt - tbill

# CAPM: regress the fund's excess return on the market's excess return
y = windsor_excess
X = sm.add_constant(valmrkt_excess)  # intercept column + market excess return
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Four-panel diagnostic figure for the CAPM fit
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_fitted, ax_qq), (ax_scale, ax_leverage) = axes

# Internally studentized residuals, shared by the Q-Q and scale-location panels
std_resid = model.get_influence().resid_studentized_internal

# Top-left: residuals vs fitted, with a red LOWESS curve
sns.residplot(
    x=model.fittedvalues,
    y=model.resid,
    lowess=True,
    ax=ax_fitted,
    line_kws={'color': 'red'},
)
ax_fitted.set(title='Residuals vs Fitted', xlabel='Fitted values', ylabel='Residuals')

# Top-right: normal Q-Q plot against the 45-degree line
sm.qqplot(std_resid, line='45', ax=ax_qq)
ax_qq.set(title='Normal Q-Q', ylabel='Standardized Residuals')

# Bottom-left: scale-location
sns.scatterplot(x=model.fittedvalues, y=np.sqrt(np.abs(std_resid)), ax=ax_scale)
ax_scale.set(
    title='Scale-Location',
    xlabel='Fitted values',
    ylabel='Sqrt(|Standardized Residuals|)',
)

# Bottom-right: statsmodels' leverage vs squared-residual plot
sm.graphics.plot_leverage_resid2(model, ax=ax_leverage)
ax_leverage.set_title('Residuals vs Leverage')

plt.tight_layout()
plt.show()

In [None]:
# Influence plot for the CAPM fit, using the Cook's distance criterion,
# to help spot influential observations
fig, ax = plt.subplots(figsize=(10, 6))
sm.graphics.influence_plot(model, criterion="cooks", ax=ax)
ax.set_title('Influence Plot')
fig.tight_layout()
plt.show()

In [None]:
# (c) Execute the Shapiro-Wilk test for normality
# H0: the residuals are drawn from a normal distribution.
shapiro_test_stat, shapiro_p_value = shapiro(model.resid)
print(f"Shapiro-Wilk test statistic: {shapiro_test_stat}, p-value: {shapiro_p_value}")

# Decision at the 5% level. Failing to reject H0 only means the test found no
# evidence against normality; it does not show that the residuals are normal.
if shapiro_p_value < 0.05:
    print("Reject the null hypothesis: Residuals are not normally distributed.")
else:
    print("Fail to reject the null hypothesis: No evidence that residuals deviate from normality.")

In [None]:
# (d) Breusch-Pagan test for heteroskedasticity on the CAPM residuals
# Per the statsmodels docs, het_breuschpagan returns
# (LM statistic, LM p-value, F statistic, F p-value); the F p-value is used.
bp_test = het_breuschpagan(model.resid, X)
lm_stat, lm_p_value, f_stat, f_p_value = bp_test
bp_p_value = f_p_value

print(f"Breusch-Pagan test p-value: {bp_p_value}")

# Decision at the 5% significance level
bp_verdict = (
    "Reject the null hypothesis: There is heteroskedasticity."
    if bp_p_value < 0.05
    else "Fail to reject the null hypothesis: No evidence of heteroskedasticity."
)
print(bp_verdict)

# Same coefficient estimates, re-reported with statsmodels' default robust
# covariance estimator
robust_model = model.get_robustcov_results()
print("Robust Standard Errors:")
print(robust_model.summary())

In [None]:
# (e) Fit the CAPM regression fund by fund and record each fund's most
# influential observation by Cook's distance
# Every column other than the market and T-bill series is treated as a fund
fund_columns = data.columns.drop(['valmrkt', 'tbill'])
cooks_distances = {}

for fund in fund_columns:
    # Fund excess return regressed on the same design matrix X as in part (a)
    fund_fit = sm.OLS(data[fund] - tbill, X).fit()

    # cooks_distance is a pair; its first element holds the distances
    cooks_d, _ = fund_fit.get_influence().cooks_distance
    max_cooks_idx = np.argmax(cooks_d)
    cooks_distances[fund] = (max_cooks_idx, cooks_d[max_cooks_idx])

    print(f"{fund}: Highest Cook's distance at index {max_cooks_idx} with value {cooks_d[max_cooks_idx]:.4f}")

In [None]:
# Libraries for Problem 5
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read the trade data, then report its size and a short preview
trades = pd.read_csv("trade.csv")
n_observations = len(trades)
print('Number of observations:', n_observations)
print(trades.head())

In [None]:
# Reshape to long form so every row is (lvalue, country): each trade record
# then counts once for ccode1 and once for ccode2.
melted_data = trades.melt(id_vars=['lvalue'], value_vars=['ccode1', 'ccode2'], value_name='country')

# Sum lvalue per country, largest first
# NOTE(review): `lvalue` looks like a log of trade value; if so, this is a sum
# of logs rather than a true total volume -- confirm against the codebook.
trade_volume_by_country = melted_data.groupby('country')['lvalue'].sum().sort_values(ascending=False)

country_highest_volume = trade_volume_by_country.idxmax()
max_trade_volume = trade_volume_by_country.max()
print(trade_volume_by_country.head(5))
print("Country with the highest total trade volume:", country_highest_volume)
# Report the winning total as well, so the computed maximum is actually shown
print("Highest total trade volume (sum of lvalue):", max_trade_volume)

In [None]:
# Dependent variable: lvalue. Controls: currency union, distance, GDP terms,
# and the regional / common-language / border indicators.
predictor_cols = ['cu', 'ldist', 'lrgdpcc', 'lrgdp', 'regional', 'comlang', 'border']

# Cast to float so integer or boolean 0/1 indicators enter as plain numbers;
# a genuinely non-numeric column raises here instead of silently becoming NaN.
X = trades[predictor_cols].astype(float)
X = sm.add_constant(X)  # Add a constant term for the intercept
y = trades['lvalue']

# Fit the specification once (column order does not affect OLS estimates);
# `X` and `model` are reused by the later VIF, clustering, and leverage cells.
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Variance inflation factor for every column of the design matrix X
# (this includes the intercept column added by add_constant)
design_values = X.values
vif_values = [
    variance_inflation_factor(design_values, col_idx)
    for col_idx in range(design_values.shape[1])
]
vif_data = pd.DataFrame({'Variable': X.columns, 'VIF': vif_values})

print(vif_data)

In [None]:
# Cluster identifier: one label per (ccode1, ccode2) pair, e.g. "12_34"
first_code = trades['ccode1'].astype(str)
second_code = trades['ccode2'].astype(str)
trades['group'] = first_code + "_" + second_code

# Re-report the fitted model with standard errors clustered on that label
clustered_fit = model.get_robustcov_results(
    cov_type='cluster',
    groups=trades['group'],
)

# Display the summary of the regression results
print(clustered_fit.summary())

In [None]:
# Leverage threshold used both for the reference line and for the filter below
LEVERAGE_CUTOFF = 0.04

# Get leverage and (externally) studentized residuals for the fitted model
influence = model.get_influence()
leverage = influence.hat_matrix_diag
residuals = influence.resid_studentized_external

# Create a leverage plot
plt.figure(figsize=(10, 6))
plt.scatter(leverage, residuals, alpha=0.5)
plt.axhline(0, linestyle='--', color='red', linewidth=2)
plt.axvline(LEVERAGE_CUTOFF, linestyle='--', color='green', linewidth=2,
            label=f'High Leverage Cutoff ({LEVERAGE_CUTOFF})')
plt.xlabel('Leverage')
plt.ylabel('Studentized Residuals')
plt.title('Leverage vs. Studentized Residuals')
plt.legend()
plt.grid()
plt.show()

# Select the high-leverage rows with a boolean mask. `leverage` has one entry
# per row of `trades` in row order, so the mask is positional and works for
# any index; feeding index *labels* to .iloc is only right for a default
# RangeIndex.
high_leverage_mask = leverage > LEVERAGE_CUTOFF
high_leverage_indices = trades.index[high_leverage_mask]
high_leverage_countries = trades.loc[high_leverage_mask]
print(high_leverage_countries)
