# Predictive Modeling for Business Forecasting
  27 February 2024

# Load the dataset and perform data preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

In [5]:
# Read the dataset from 'Financial Distress.csv' in the working directory
# into `distress` and preview its first ten rows.
distress = pd.read_csv('Financial Distress.csv')
distress.head(10)

Unnamed: 0,Company,Time,Financial Distress,x1,x2,x3,x4,x5,x6,x7,...,x74,x75,x76,x77,x78,x79,x80,x81,x82,x83
0,1,1,0.010636,1.281,0.022934,0.87454,1.2164,0.06094,0.18827,0.5251,...,85.437,27.07,26.102,16.0,16.0,0.2,22,0.06039,30,49
1,1,2,-0.45597,1.27,0.006454,0.82067,1.0049,-0.01408,0.18104,0.62288,...,107.09,31.31,30.194,17.0,16.0,0.4,22,0.010636,31,50
2,1,3,-0.32539,1.0529,-0.059379,0.92242,0.72926,0.020476,0.044865,0.43292,...,120.87,36.07,35.273,17.0,15.0,-0.2,22,-0.45597,32,51
3,1,4,-0.56657,1.1131,-0.015229,0.85888,0.80974,0.076037,0.091033,0.67546,...,54.806,39.8,38.377,17.167,16.0,5.6,22,-0.32539,33,52
4,2,1,1.3573,1.0623,0.10702,0.8146,0.83593,0.19996,0.0478,0.742,...,85.437,27.07,26.102,16.0,16.0,0.2,29,1.251,7,27
5,2,2,0.007188,1.0558,0.081916,0.87949,0.68673,0.14263,0.043102,0.77198,...,107.09,31.31,30.194,17.0,16.0,0.4,29,1.3573,8,28
6,2,3,1.2002,0.97059,0.076064,0.90677,0.8098,0.16592,-0.024649,0.7366,...,120.87,36.07,35.273,17.0,15.0,-0.2,29,0.007188,9,29
7,2,4,2.2348,1.059,0.1302,0.81811,0.87599,0.23445,0.045576,0.78727,...,54.806,39.8,38.377,17.167,16.0,5.6,29,1.2002,10,30
8,2,5,1.3405,1.1245,0.14784,0.75871,1.0799,0.27644,0.089408,0.80356,...,59.806,44.53,42.822,15.5,14.0,2.1,29,2.2348,11,31
9,2,6,2.0474,1.5998,0.26246,0.54615,1.3127,0.36948,0.29664,0.85364,...,66.262,52.74,49.206,15.5,12.0,-6.4,29,1.3405,12,32


In [6]:
# Count the null entries in every column and print the per-column totals
null_counts = distress.isnull().sum()
print(null_counts)

Company               0
Time                  0
Financial Distress    0
x1                    0
x2                    0
                     ..
x79                   0
x80                   0
x81                   0
x82                   0
x83                   0
Length: 86, dtype: int64


# Feature Selection and Engineering

In [None]:
# Correlation matrix to identify highly correlated features.
# For every pair of columns whose absolute correlation exceeds 0.95, the
# later column of the pair is removed from `distress` in place.
correlation_matrix = distress.corr().abs()
# Keep only the strict upper triangle (k=1) so each pair is inspected once and
# the diagonal of self-correlations is ignored. The mask uses the builtin
# `bool`: the `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_tri = correlation_matrix.where(upper_mask)
to_drop = [
    column
    for column in upper_tri.columns
    # The target is selected by name right after this step, so it must never
    # be a drop candidate.
    if column != 'Financial Distress' and (upper_tri[column] > 0.95).any()
]
distress.drop(to_drop, axis=1, inplace=True)

In [None]:
# Separate the target column from the remaining columns:
# `y` is the 'Financial Distress' series, `x` is every other column.
y = distress['Financial Distress']
x = distress.drop('Financial Distress', axis=1)

In [11]:
# Replace missing entries with the mean of their column.
# The imputer learns the column means from `x` and then applies them to `x`.
imputer = SimpleImputer(strategy='mean')
imputer.fit(x)
x_imputed = imputer.transform(x)

In [None]:
# Expand the imputed features to degree-2 polynomial terms, without the
# constant bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(x_imputed)
x_poly = poly.transform(x_imputed)

In [None]:
# Create additional time-related features on `distress`.
# NOTE(review): the data preview shows `Time` as small integers (1, 2, 3, ...).
# pd.to_datetime treats bare integers as nanoseconds since the Unix epoch by
# default, so month/day/hour are presumably the same value on every row —
# confirm what `Time` encodes before relying on these columns.
# NOTE(review): these columns are added to `distress` after `x` was derived
# from it, so as written they do not reach the model.
time_stamp = pd.to_datetime(distress['Time'])  # parse once, reuse three times
distress['month'] = time_stamp.dt.month
distress['day'] = time_stamp.dt.day
distress['hour'] = time_stamp.dt.hour
# Lag x1 inside each company: a plain shift(1) over the whole frame would give
# the first row of every company the last x1 value of the previous company.
# Assumes rows are ordered by Time within each Company, as in the preview.
distress['x1_lag1'] = distress.groupby('Company')['x1'].shift(1)

# Model Selection

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_poly,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Mean-impute the split feature matrices.
# Column means are learned from the training rows only and then applied to
# both the training and the test rows.
imputer = SimpleImputer(strategy='mean')
imputer.fit(x_train)
x_train_imputed = imputer.transform(x_train)
x_test_imputed = imputer.transform(x_test)

In [None]:
# Define models to evaluate: display name -> unfitted estimator.
# The random forest is seeded so repeated runs give the same result.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
}

In [None]:
# Evaluate models: fit every candidate on the training split and report its
# mean squared error on the held-out split.
# The imputed matrices are used so this comparison sees the same inputs as the
# final model, which is trained on x_train_imputed.
# `model`, `y_predict` and `mse` stay bound to the last candidate after the loop.
for name, model in models.items():
    model.fit(x_train_imputed, y_train)
    y_predict = model.predict(x_test_imputed)
    mse = mean_squared_error(y_test, y_predict)
    print(f"{name}: Mean Squared Error = {mse:.2f}")

# Model Evaluation

In [None]:
# Model Definition
# Random forest regressor used for the evaluation below; random_state=42 fixes
# the estimator's random seed so results are repeatable.
model = RandomForestRegressor(random_state=42)

In [None]:
# Model Training
# Fit the model on the imputed training features and the training targets.
model.fit(x_train_imputed, y_train)

In [None]:
# Predictions on test data
# The imputed test matrix is named `x_test_imputed`; the previous
# `x_tested_imputed` was never defined and raised NameError.
y_prediction = model.predict(x_test_imputed)
# The MSE, residual and high-error cells that follow read `y_predict`. Bind it
# to this model's predictions so they no longer depend on whichever model the
# comparison loop happened to fit last.
y_predict = y_prediction

In [None]:
# Evaluate model: MSE
# Mean squared error between the held-out targets and the predictions stored
# in `y_predict`.
mse = mean_squared_error(y_test, y_predict)
# The space in the format spec reserves a sign column, so a non-negative value
# is printed with a leading blank and two decimals.
print(f"Mean Squared Error: {mse: .2f}")

# Predictive Insights

In [None]:
# Get feature importance from the Random Forest model
# One score per input column the model was fitted on.
feature_importance = model.feature_importances_

In [None]:
# Column positions ordered from the most to the least important feature:
# rank ascending, then reverse.
ascending_idx = feature_importance.argsort()
sorted_idx = ascending_idx[::-1]

In [None]:
# Print feature names and their importance scores, most important first.
# The model was fitted on the degree-2 polynomial expansion, so the importance
# vector is indexed over the expanded columns, not over `x.columns`. Ask the
# fitted transformer for the expanded names; indexing `x.columns` directly
# mislabels scores and runs past the end of the original column list.
feature_names = poly.get_feature_names_out(x.columns)
for i in sorted_idx:
    print(f"{feature_names[i]}: {feature_importance[i]}")

In [None]:
# Residual = actual minus predicted for each test row; keep the positions of
# the ten rows with the largest absolute residual, largest first.
residuals = y_test - y_predict
error_ranking = np.argsort(np.abs(residuals))
high_error_indices = error_ranking[::-1][:10]

In [None]:
# For every high-error test row, report its position, the actual target, the
# predicted target and the feature vector, one item per line.
for idx in high_error_indices:
    print(
        f"Instance {idx}:",
        f"Actual Value: {y_test.iloc[idx]}",
        f"Predicted Value: {y_predict[idx]}",
        f"Features: {x_test[idx]}",
        sep="\n",
    )