In [79]:
import pandas as pd

import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np

In [80]:
# Load the cleaned data from disk
df = pd.read_csv('output/df_task1.csv')

In [81]:
def get_year_quarter_combos(start_year,start_quarter,
                            end_year,end_quarter):
    '''Build every (Year, Quarter) pair of a period, both ends inclusive.

    The result is meant to be inner joined with df so that df is filtered
    down to the requested years and quarters.

    Parameters
    ----------
    start_year, start_quarter : int
        First year and first quarter of the period.
    end_year, end_quarter : int
        Last year and last quarter of the period.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' and 'Quarter', one row per pair in chronological
        order. Empty (but with both columns) when start_year > end_year.
    '''
    all_combos = []
    for year in range(start_year, end_year + 1):
        # Only the boundary years are cut short; every year in between is
        # complete. Both bounds are checked independently so that a period
        # that starts and ends in the same year honours end_quarter too.
        first_quarter = start_quarter if year == start_year else 1
        last_quarter = end_quarter if year == end_year else 4
        all_combos.extend(
            (year, quarter)
            for quarter in range(first_quarter, last_quarter + 1)
        )

    # Convert the list of tuples to a DataFrame
    return pd.DataFrame(all_combos, columns=['Year', 'Quarter'])

# Analysis window: 2019 Q4 through 2022 Q2
filter_df = get_year_quarter_combos(start_year=2019, start_quarter=4,
                                    end_year=2022, end_quarter=2)


In [82]:
# Keep only the rows whose (Year, Quarter) pair is listed in filter_df;
# the inner join discards every row outside that window.
df = df.merge(filter_df, how='inner', on=['Year', 'Quarter'])

In [83]:
# Remove the sector / country columns; reassigning (rather than dropping
# in place) keeps the step a plain expression.
df = df.drop(columns=['code_sector', 'description_sector', 'Country'])

In [84]:
# Explanatory variables of the regression
X_cols = df.loc[:, ['Year', 'sin_quarter', 'cos_quarter']].columns
# Every remaining column except the time columns is a regression target
Y_cols = df.columns.drop(['Year', 'sin_quarter', 'cos_quarter', 'Quarter'])

In [85]:
# Linear regression comes next, so standardize the target columns first
scaler = StandardScaler()
scaler.fit(df[Y_cols])
# Overwrite the targets with their standardized values
df[Y_cols] = scaler.transform(df[Y_cols])

In [86]:
def plot_lin_reg_scatter(df,col,coeff,pvalue):
    '''Save a scatter plot of one column over time with a red trend line.

    The regression uses 3 x variables, so its predictions do not form a
    line when drawn against time only (a 4d space mapped to a 2d space).
    The red line is therefore a straight line (degree-1 polynomial)
    fitted through the predicted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'Quarter', `col` and
        f'Predicted {col}'.
    col : str
        Column drawn on the y axis; also used as axis label, title and
        file name (with "/" replaced by "_").
    coeff, pvalue : float
        Values printed, rounded to 2 decimals, in the text box.

    Returns
    -------
    None. The figure is written to output/plots/task1_a/<col>.png and then
    closed, so nothing is left open or displayed.
    '''
    import os  # local import: only needed to create the output folder

    # Position on the time axis: quarter q of year y is drawn at y + (q-1)/4
    x_values = df['Year'] + (df['Quarter'] - 1) / 4
    y_values = df[f'Predicted {col}']

    # Use a dedicated figure instead of pyplot's implicit current figure so
    # that it can be closed explicitly once it has been saved.
    fig, ax = plt.subplots()

    # Plot the points from y variables
    ax.scatter(x_values, df[col], color='blue')
    ax.set_xlabel('Year and Quarter')
    ax.set_ylabel(col)
    ax.set_title(col)

    # Line of best fit (red): fit a straight line through the predictions
    # and evaluate it on 100 evenly spaced points across the x range.
    coefficients = np.polyfit(x_values, y_values, 1)
    poly_function = np.poly1d(coefficients)
    x_fit = np.linspace(min(x_values), max(x_values), 100)
    y_fit = poly_function(x_fit)
    ax.plot(x_fit, y_fit, color='red', linestyle='-', label='Predicted Line')

    # Suppress offset / scientific notation on the axis tick labels
    ax.ticklabel_format(useOffset=False, style='plain')

    # Add a text box with coefficient and p-value (axes coordinates)
    text = f'Coeff: {coeff:.2f}\nP-value: {pvalue:.2f}'
    ax.text(0.05, 0.95, text, transform=ax.transAxes, fontsize=10, verticalalignment='top')

    # Save the plot to a file; create the folder first so that a fresh
    # checkout does not fail here.
    os.makedirs('output/plots/task1_a', exist_ok=True)
    col_no_slashes=col.replace("/", "_") # Slashes would be read as folders
    fig.savefig(f'output/plots/task1_a/{col_no_slashes}.png')

    # Release the figure: avoids an empty figure being left behind and
    # memory growing when the function is called in a loop.
    plt.close(fig)
    

In [87]:

# Results for the statistically significant columns; the three lists are
# filled in parallel (same position = same column).
cols_list=[]
coefficients_list=[]
pvalues_list=[]

# Design matrix: a constant term plus the 3 x variables combined into one
# array. It does not depend on the target column, so build it once.
X = sm.add_constant(list(zip(df['Year'], df['sin_quarter'], df['cos_quarter'])))

# Fit a multiple linear regression model to each financial col
for financial_col in Y_cols:
    model = sm.OLS(df[financial_col], X).fit()

    # Get model characteristics; index 0 is skipped because add_constant
    # puts the constant term first by default.
    coefficients = model.params[1:]
    p_values = model.pvalues[1:]

    # NOTE(review): the "slope" and "p-value" reported below are plain
    # averages over the Year, sin_quarter and cos_quarter terms. An average
    # of p-values is not itself a p-value - confirm this is the intended
    # significance criterion (alternatives: the Year term alone, or the
    # model's overall F-test).
    mean_coefficient = coefficients.mean()
    mean_p_value = p_values.mean()

    # Save the column if the pvalue is significant
    if mean_p_value<=0.05:
        # Predicted values from the model
        predicted_values = model.predict(X)
        # Add predictions to df (read back by plot_lin_reg_scatter)
        df[f'Predicted {financial_col}']=predicted_values

        cols_list.append(financial_col)
        coefficients_list.append(mean_coefficient)
        pvalues_list.append(mean_p_value)

        # Save all the plots of statistical significant cols in filesystem
        plot_lin_reg_scatter(df,financial_col,mean_coefficient,
                             mean_p_value)
        

<Figure size 640x480 with 0 Axes>

In [88]:
# Summary table: one row per significant indicator, largest slope first
task1_a_answer = (
    pd.DataFrame({
        'Statistical Significant Financial Indicator': cols_list,
        'p-value': pvalues_list,
        'slope': coefficients_list,
    })
    .sort_values(by='slope', ascending=False)
    .reset_index(drop=True)
)

In [89]:
# Display the summary table as the cell output
task1_a_answer
# The plots for each column are in revo/output/plots/task1_a

Unnamed: 0,Statistical Significant Financial Indicator,p-value,slope
0,Net profit (n)/net profit (n−1),0.042271,0.048739
1,Market capitalization/EBITDA,0.002783,0.031731
2,Net cash flow from (used in) operating activit...,0.046557,0.03136
3,Market capitalization to EBIT,0.015089,0.014052
4,EBIT/total operating costs,0.042104,-0.023117
