In [1]:
import pandas as pd

import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the cleaned dataset from the output/0_clean_arff folder
df = pd.read_csv(filepath_or_buffer='output/0_clean_arff/df_task1.csv')

In [3]:
# Keep only the rows that have a sector description
df = df.dropna(subset=['description_sector'])

In [4]:
def get_year_quarter_combos(start_year,start_quarter,
                            end_year,end_quarter):
    ''' Build every (Year, Quarter) pair in an inclusive time window.

    The result is meant to be inner-joined onto another dataframe on
    ['Year', 'Quarter'] so that only rows inside the window are kept.

    Parameters
    ----------
    start_year, start_quarter : int
        First year/quarter of the window (inclusive).
    end_year, end_quarter : int
        Last year/quarter of the window (inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns 'Year' and 'Quarter', one row per pair in chronological
        order. Empty (but with both columns) when the window is empty.
    '''
    all_combos=[]
    for year in range(start_year,end_year+1):
        # Clamp the quarter range on the boundary years. Both bounds are
        # evaluated independently so a window that starts and ends in the
        # same year honours start_quarter AND end_quarter.
        first_quarter = start_quarter if year==start_year else 1
        last_quarter = end_quarter if year==end_year else 4
        for quarter in range(first_quarter,last_quarter+1):
            all_combos.append((year,quarter))

    # Convert the list of tuples to a DataFrame
    filter_df = pd.DataFrame(all_combos, columns=['Year', 'Quarter'])

    return filter_df

# Analysis window: 2019 Q4 through 2022 Q2 (both inclusive)
filter_df = get_year_quarter_combos(start_year=2019, start_quarter=4,
                                    end_year=2022, end_quarter=2)

# Inner join: keep only the rows of df whose (Year, Quarter) is in the window
df = df.merge(filter_df, how='inner', on=['Year', 'Quarter'])


In [5]:
# Remove the sector code and country columns
df = df.drop(columns=['code_sector', 'Country'])

In [6]:
# Explanatory variables of the regression
X_cols = df[['Year', 'sin_quarter', 'cos_quarter']].columns
# Candidate response columns: everything except the sector label and the time columns
Y_cols = df.drop(
    columns=['description_sector', 'Year', 'sin_quarter', 'cos_quarter', 'Quarter']
).columns

In [7]:
# Get a df for each sector, same analysis as task1_a.
# For every sector, regress each financial column on (Year, sin_quarter,
# cos_quarter) and keep the columns whose mean coefficient p-value is <= 0.05.
all_dfs_list=[]

for sector in df['description_sector'].unique():
    sector_df=df[df['description_sector']==sector].copy()

    # Store the statistically significant columns in a df
    cols_list=[]
    coefficients_list=[]
    pvalues_list=[]

    # Add a constant term and combine the 3 x variables into one array.
    # The design matrix only depends on the sector's rows, so build it once
    # per sector instead of once per financial column.
    # NOTE(review): add_constant defaults to has_constant='skip', so if one of
    # the three regressors is constant within a sector no intercept column is
    # prepended and the [1:] slices below would drop a real coefficient -
    # confirm this cannot happen for the sectors in the data.
    X = sm.add_constant(list(zip(sector_df['Year'], sector_df['sin_quarter'], sector_df['cos_quarter'])))

    # Fit a multiple linear regression model to each financial col
    for financial_col in Y_cols:
        if financial_col.startswith("MI_"):
            continue  # Skip to the next iteration if the col is a missing indicator
        model = sm.OLS(sector_df[financial_col], X).fit()

        # Get model characteristics (position 0 is the intercept, skip it)
        coefficients = model.params.iloc[1:]
        p_values = model.pvalues.iloc[1:]

        # Save the column if the pvalue is significant
        # NOTE(review): significance is judged on the MEAN of the three
        # coefficient p-values and 'slope' is the MEAN of the three
        # coefficients; kept as in the original analysis.
        if p_values.mean()<=0.05:
            # Predicted values from the model
            predicted_values = model.predict(X)
            predicted_col = f'Predicted {financial_col}'

            # Add predictions to sector_df and df
            sector_df[predicted_col]=predicted_values
            # Initialize the column only the first time it is needed.
            # Re-initializing it for every sector would erase the predictions
            # already stored for earlier sectors.
            if predicted_col not in df.columns:
                df[predicted_col]=None
            df.loc[sector_df.index,predicted_col]=predicted_values

            cols_list.append(financial_col)
            coefficients_list.append(coefficients.mean())
            pvalues_list.append(p_values.mean())

    # One result row per significant indicator of this sector, largest slope first
    sector_df_results = pd.DataFrame({'Statistical Significant Financial Indicator':cols_list,
                                'p-value':pvalues_list,
                                'slope':coefficients_list,
                                'description_sector':sector}).sort_values(['slope'],
                                                                          ascending=False).reset_index(drop=True)
    all_dfs_list.append(sector_df_results)

# Stack the per-sector results into a single frame
all_sectors_df=pd.concat(all_dfs_list).reset_index(drop=True)

In [8]:
# For each indicator, count the sectors in which it was statistically significant
significant_indicators = all_sectors_df['Statistical Significant Financial Indicator']
task1_b_answer = significant_indicators.value_counts()

In [9]:
# Turn the per-indicator counts into a two-column frame.
# The counts are recomputed from all_sectors_df so this cell can be re-run
# safely (it no longer depends on task1_b_answer still being a Series), and
# the indicator column is named explicitly instead of relying on the name
# value_counts() gives its index, which is None on pandas < 2.0.
indicator_col = 'Statistical Significant Financial Indicator'
indicator_counts = all_sectors_df[indicator_col].value_counts()
# 'Rank' holds the number of sectors in which the indicator was significant
task1_b_answer = pd.DataFrame({indicator_col: indicator_counts.index.values,
                               'Rank': indicator_counts.values})

In [10]:
# Write the task1_b answer table to disk (without the index column)
task1_b_answer.to_csv(path_or_buf='output/3_task1_b/df_task1_b_result.csv', index=False)
# The plots for each column are saved as PNG files in output/3_task1_b

In [11]:
def plot_lin_reg_scatter(df,col,coeff,pvalue,dir):
    '''Scatter one column against time, overlay a straight trend line and save it.

    df     : dataframe with 'Year', 'Quarter', `col` and f'Predicted {col}' columns
    col    : name of the column to plot (also used as title and y label)
    coeff  : coefficient shown in the text box (2 decimals)
    pvalue : p-value shown in the text box (2 decimals)
    dir    : sub-folder of output/ in which the PNG is written

    The figure is saved as output/{dir}/{col}.png with any "/" in col replaced
    by "_", and the current figure is cleared afterwards. Returns None.
    '''
    # Time axis as a fractional year: Q1 -> .00, Q2 -> .25, Q3 -> .50, Q4 -> .75
    time_axis = df['Year'] + (df['Quarter'] - 1) / 4

    # Observed values (blue points)
    plt.scatter(time_axis, df[col], color='blue')
    plt.xlabel('Year and Quarter')
    plt.ylabel(col)
    plt.title(col)

    # Red line: degree-1 polynomial fitted through the model's predicted values
    x_values = time_axis.astype('float')
    y_values = df[f'Predicted {col}'].astype('float')
    trend_line = np.poly1d(np.polyfit(x_values, y_values, 1).astype('float'))
    # Evaluate the line on 100 evenly spaced points spanning the time axis
    x_fit = np.linspace(min(x_values), max(x_values), 100)
    plt.plot(x_fit, trend_line(x_fit), color='red', linestyle='-', label='Predicted Line')

    # Plain tick labels: no offset and no scientific notation
    plt.ticklabel_format(useOffset=False, style='plain')

    # Text box with coefficient and p-value in the top-left corner of the axes
    annotation = f'Coeff: {coeff:.2f}\nP-value: {pvalue:.2f}'
    plt.text(0.05, 0.95, annotation, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top')

    # Slashes in the column name would be read as path separators
    safe_name = col.replace("/", "_")
    plt.savefig(f'output/{dir}/{safe_name}.png')
    # Reset the current figure so the next plot starts clean
    plt.clf()
    

In [12]:
# Produce one plot per (indicator, sector) row of all_sectors_df.
# Each (sector, indicator) pair appears once in all_sectors_df, so the slope
# and p-value are read straight from the row being iterated.
result_rows = zip(all_sectors_df['Statistical Significant Financial Indicator'],
                  all_sectors_df['description_sector'],
                  all_sectors_df['slope'],
                  all_sectors_df['p-value'])

for financial_col, sector, coeff, pvalue in result_rows:
    if financial_col.startswith("MI_"):
        continue  # Missing-indicator columns are not plotted

    # Rows of this sector, restricted to the columns the plot needs
    plot_columns = [financial_col, f'Predicted {financial_col}', 'Year', 'Quarter']
    filtered_df = df[df['description_sector']==sector].copy()[plot_columns]
    filtered_df['description_sector'] = sector

    plot_lin_reg_scatter(filtered_df, financial_col, coeff, pvalue, '3_task1_b')

<Figure size 640x480 with 0 Axes>