In [82]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [83]:
# Load the cleaned data set from disk into the working frame
df = pd.read_csv(filepath_or_buffer='output/df_task1.csv')

In [84]:
# Keep only the rows belonging to Q4 2019, Q1 2020 or Q2 2020.
# Each period gets its own boolean mask; the masks are OR-ed together.
is_2019_q4 = (df['Year'] == 2019) & (df['Quarter'] == 4)
is_2020_q1 = (df['Year'] == 2020) & (df['Quarter'] == 1)
is_2020_q2 = (df['Year'] == 2020) & (df['Quarter'] == 2)

# Renumber the surviving rows from 0 so positions and labels agree again
df = df[is_2019_q4 | is_2020_q1 | is_2020_q2].reset_index(drop=True)

In [85]:
# Remove the descriptive (sector / country) columns; rebinding `df` instead of
# mutating it in place keeps the lineage of the frame explicit.
df = df.drop(columns=['code_sector', 'description_sector', 'Country'])

In [86]:
# Naming note: despite the letters, `Y_cols` holds the explanatory time
# features and `X_cols` the columns that are regressed on them (one model per
# column).  The names are kept because the regression cell refers to `X_cols`.
Y_cols = df[['Year', 'sin_quarter', 'cos_quarter']].columns

# Everything that is not a time column is treated as a dependent variable.
# 'Quarter' must be excluded as well: it is a time bookkeeping column (the
# sin/cos features encode it), not a financial ratio, and regressing it on
# functions of itself would report it as a "significant" column.
X_cols = df.columns.drop(['Year', 'Quarter', 'sin_quarter', 'cos_quarter'])

# The regression cell appends 'Predicted <col>' columns to df.  Filter them out
# so that re-running this cell afterwards does not feed predictions back in
# as if they were financial columns.
X_cols = X_cols[~X_cols.str.startswith('Predicted ', na=False)]

In [87]:
# Show the frame after the quarter filter and the column removal above
# (pandas truncates the rendering to the first/last rows and columns).
df

Unnamed: 0,Net profit/total assets,Total liabilities/total assets,Working capital/total assets,Current assets/short-term liabilities,Retained earnings/total assets,Gross profit/total assets,Book value of equity/total liabilities,Net sales revenue/total assets,Equity/total assets,(Gross profit + financial expenses)/total assets,...,Net profit (n)/net profit (n−1),Inventory (n)/inventory (n−1),Receivables (n)/receivables (n−1),short-term liabilities (n)/short-term liabilities (n−1),Net cash flow from (used in) operating activities (n)/Net cash flow from (used in) operating activities (n−1),Net cash flow(n)/net cash flow (n−1),Year,Quarter,sin_quarter,cos_quarter
0,0.00,0.00,0.00,0.00,0.000,0.0,0.00,0.00,0.00,0.000,...,0.00,0.00,0.00,0.000,0.00,0.000,2019.0,4.0,-2.449294e-16,1.0
1,-0.01,0.83,-0.08,0.83,0.000,0.0,0.21,0.28,0.17,-0.010,...,0.78,0.96,0.97,1.080,0.00,-1.015,2019.0,4.0,-2.449294e-16,1.0
2,0.00,0.73,0.04,1.06,0.100,0.0,0.37,0.45,0.27,0.000,...,-0.06,1.00,1.02,1.000,0.00,-2.030,2019.0,4.0,-2.449294e-16,1.0
3,0.00,0.00,0.00,0.00,0.000,0.0,0.00,0.00,0.00,0.000,...,0.00,0.00,0.00,0.000,0.00,0.000,2019.0,4.0,-2.449294e-16,1.0
4,0.00,0.00,0.00,0.00,0.000,0.0,0.00,0.00,0.00,0.000,...,0.00,0.00,0.00,0.000,0.00,0.000,2019.0,4.0,-2.449294e-16,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,0.00,1.68,-0.24,0.36,-0.210,0.0,-0.40,0.01,-0.68,0.000,...,-0.42,0.16,2.66,0.470,0.00,0.000,2020.0,2.0,1.224647e-16,-1.0
1049,0.00,0.88,-0.28,0.42,-0.370,0.0,0.13,0.06,0.12,0.010,...,-1.03,1.08,0.82,0.795,1.15,1.840,2020.0,2.0,1.224647e-16,-1.0
1050,0.00,1.57,-0.57,0.64,-0.185,0.0,-0.36,0.07,-0.57,0.005,...,-1.64,1.04,0.84,1.120,-1.99,0.920,2020.0,2.0,1.224647e-16,-1.0
1051,0.00,0.00,0.00,0.00,0.000,0.0,0.00,0.00,0.00,0.000,...,0.00,0.00,0.00,0.000,0.00,0.000,2020.0,2.0,1.224647e-16,-1.0


In [77]:
def plot_lin_reg_scatter(df, col, coeff, pvalue):
    '''Save an actual-vs-predicted plot for one regressed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'Quarter', ``col`` and
        ``f'Predicted {col}'``.
    col : str
        Name of the column with the observed values.  It is used as the
        y-axis label and, with every "/" replaced by "_", as the file name.
    coeff : float
        Value shown as "Coeff" in the annotation box (two decimals).
    pvalue : float
        Value shown as "P-value" in the annotation box (two decimals).

    Returns
    -------
    None
        The figure is written to ``output/plots/task1_a/<col>.png`` and then
        closed; the directory is created if it does not exist yet.
    '''
    # Position on the time axis as a fractional year:
    # Q1 -> Year + 0.00, Q2 -> Year + 0.25, Q3 -> Year + 0.50, Q4 -> Year + 0.75
    period = df['Year'] + (df['Quarter'] - 1) / 4
    predicted = df[f'Predicted {col}']

    # Use an explicit figure/axes pair instead of the implicit pyplot state so
    # the figure can be closed reliably and no empty figure is left behind.
    fig, ax = plt.subplots()
    try:
        # Observed values (blue) and fitted values (red)
        ax.scatter(period, df[col], color='blue', label='Actual')
        ax.scatter(period, predicted, color='red', label='Predicted')

        # Connect the fitted values with a line (red)
        ax.plot(period, predicted,
                color='red', linestyle='-', label='Predicted Line')

        ax.set_xlabel('Year and Quarter')
        ax.set_ylabel(col)
        ax.set_title('Actual (blue) vs Predicted (red) Values')

        # Suppress offset / scientific notation on the tick labels
        ax.ticklabel_format(useOffset=False, style='plain')

        # Text box with coefficient and p-value in the top-left corner
        text = f'Coeff: {coeff:.2f}\nP-value: {pvalue:.2f}'
        ax.text(0.05, 0.95, text, transform=ax.transAxes,
                fontsize=10, verticalalignment='top')

        # Column names contain "/" which would be read as path separators
        col_no_slashes = col.replace("/", "_")

        # savefig does not create missing directories, so make sure it exists
        out_dir = 'output/plots/task1_a'
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(f'{out_dir}/{col_no_slashes}.png')
    finally:
        # Release the figure even if plotting or saving failed; this function
        # is called once per column inside a loop.
        plt.close(fig)
    

In [81]:
# Collect the statistically significant columns together with the numbers
# that are shown on their plots
cols_list = []
coefficients_list = []
pvalues_list = []

# Threshold below which a regression is considered significant
SIGNIFICANCE_LEVEL = 0.05

# The design matrix (constant + Year + sin/cos of the quarter) is identical for
# every financial column, so it is built once instead of on every iteration.
X = sm.add_constant(df[['Year', 'sin_quarter', 'cos_quarter']].to_numpy())

# Fit a multiple linear regression model to each financial col
for financial_col in X_cols:
    model = sm.OLS(df[financial_col], X).fit()

    # Slope estimates and their individual p-values (constant term excluded)
    coefficients = model.params[1:]
    p_values = model.pvalues[1:]

    # Decide significance with the overall F-test of the regression rather
    # than the arithmetic mean of the three per-coefficient p-values: an
    # average of p-values is not itself a valid p-value.
    # NOTE(review): the quarter filter earlier in the notebook keeps only three
    # distinct periods, so a four-parameter design presumably cannot be full
    # rank and the individual slopes are not separately interpretable; the
    # F-test only depends on the fitted values.  Confirm this is the intended
    # criterion.
    overall_pvalue = model.f_pvalue

    # NaN (e.g. for an all-zero column) compares False and is skipped
    if overall_pvalue <= SIGNIFICANCE_LEVEL:
        # Fitted values from the model, stored next to the observed column so
        # the plot helper can read them from df
        predicted_values = model.predict(X)
        df[f'Predicted {financial_col}'] = predicted_values

        # NOTE(review): the mean of the three slopes is kept only because the
        # plot annotation expects a single number; it is not an effect size.
        mean_coefficient = coefficients.mean()

        cols_list.append(financial_col)
        coefficients_list.append(mean_coefficient)
        pvalues_list.append(overall_pvalue)

        plot_lin_reg_scatter(df, financial_col, mean_coefficient, overall_pvalue)
        
    


<Figure size 640x480 with 0 Axes>

In [79]:
# Inspect a leftover loop variable: `predicted_values` is only assigned inside
# the significance branch of the regression loop, so this shows the fitted
# values of the last column that passed the test (NameError if none did).
predicted_values

array([0.01842857, 0.01842857, 0.01842857, ..., 0.03605634, 0.03605634,
       0.03605634])

In [None]:
# Same inspection of the leftover `predicted_values` loop variable as in the
# earlier cell; this cell has no execution count, i.e. it was not run.
predicted_values

In [None]:
# Inspect a leftover loop variable: `p_values` is reassigned on every iteration
# of the regression loop, so this holds the per-coefficient p-values of the
# last column in X_cols, whether or not that column was significant.
p_values