In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/financial-performance-prediction/sample_submission.csv
/kaggle/input/financial-performance-prediction/data_dictionary.txt
/kaggle/input/financial-performance-prediction/train.csv
/kaggle/input/financial-performance-prediction/test.csv


## Importing Libraries

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import KNNImputer


from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (r2_score, mean_absolute_percentage_error)
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

## Fucntions

In [3]:
def basic_info(df):

    """
    Takes a DataFrame as input, and gives the basic info like shape, missing values count, duplicated rows, unique values and dtypes of the features
    
    Args:
        df (pandas.DataFrame): The DataFrame for which you want the details of

    Returns: 
        None 
    """
    print(f"shape of the date : \n\trows = {df.shape[0]}, columns = {df.shape[1]}\n")
    missing_val_count = df.isna().sum().sum()
    print(f"missing values: \n\tcount = {missing_val_count}")
    if missing_val_count != 0:
        missing_data = df.isna().sum().reset_index().rename({"index" : "feature", 0 : "missing_val_count"}, axis = 1)
        missing_data =  missing_data[missing_data.missing_val_count > 0]
        missing_data["missing_val_percentage"] = np.round((missing_data["missing_val_count"] / df.shape[0]) * 100, 2)
        missing_data = missing_data.sort_values(by = "missing_val_count", ascending = False)
        display(missing_data)

    print(f"duplicated records: \n\tcount = {df.duplicated().sum()}\n")
    print(f"Unique Values : ")
    nunique_vals = df.nunique().reset_index().rename({"index" : "feature", 0 : "nunique_vals"}, axis = 1)
    display(nunique_vals)

    display(df.dtypes.reset_index().rename(columns = {"index" : "fetaure", 0 : "data type"}))

    return missing_data

In [4]:
def num_fillna(df, given_info, feature_median, col):
    required_data = given_info[(given_info.industry == df.industry) & (given_info.sector == df.sector)][col]
    if required_data.shape[0] == 0:
      required_data = given_info[(given_info.industry == df.industry)][col]
    elif required_data.shape[0] == 0:
      required_data = given_info[(given_info.sector == df.sector)][col]
    else:
         return feature_median
    return required_data.iloc[0]   

## reading data and finding basic info

In [5]:
train = pd.read_csv('/kaggle/input/financial-performance-prediction/train.csv')
test = pd.read_csv('/kaggle/input/financial-performance-prediction/test.csv')

In [6]:
train.head()

Unnamed: 0,Id,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
0,196,Personal Services,Consumer Cyclical,1174.0,5.0,10.0,9.0,4.0,8.0,12.549223,...,240502000.0,,23171000.0,54319000.0,40732000.0,2150000.0,5946000.0,17225000.0,15075000.0,0.0
1,1568,Building Products & Equipment,Industrials,3600.0,4.0,4.0,3.0,4.0,3.0,1222.0,...,326538000.0,-71929000.0,410574000.0,692688000.0,1103262000.0,-39287000.0,333229000.0,77345000.0,116632000.0,0.0
2,1218,,Unknown,,,,,,,,...,40663000000.0,603000000.0,1649000000.0,4587000000.0,6236000000.0,663000000.0,214000000.0,1435000000.0,772000000.0,0.0
3,23,Scientific & Technical Instruments,Technology,143.0,,,,,,,...,2758.0,-9715.0,0.0,,,-9683.0,9683.0,-9683.0,0.0,0.0
4,783,Drug Manufacturers - Specialty & Generic,Healthcare,36.0,,,,,,1.231544,...,25924000.0,-4676000.0,12050000.0,0.0,12050000.0,-1594000.0,13644000.0,-1594000.0,0.0,0.0


In [7]:
test.head()

Unnamed: 0,Id,industry,sector,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
0,480,Drug Manufacturers - Specialty & Generic,Healthcare,,,,,,,,...,,,,,,,,,,0.0
1,573,Drug Manufacturers - Specialty & Generic,Healthcare,7600.0,6.0,7.0,6.0,4.0,5.0,90.5,...,323015000.0,,191562000.0,301543000.0,493105000.0,45907000.0,90726000.0,100836000.0,54929000.0,0.0
2,1967,Software - Application,Technology,,,,,,,,...,,,,,,,,,,0.0
3,57,Marine Shipping,Industrials,5200.0,1.0,5.0,4.0,5.0,3.0,23.500002,...,3085267000.0,-3375000.0,72793000.0,228158000.0,300951000.0,3164000.0,69629000.0,3164000.0,0.0,0.0
4,131,Metal Fabrication,Industrials,6700.0,1.0,4.0,4.0,6.0,3.0,20.748793,...,482900000.0,-7900000.0,85800000.0,606700000.0,692500000.0,31800000.0,54000000.0,31800000.0,0.0,0.0


In [8]:
missing_vals_train = basic_info(train)

shape of the date : 
	rows = 1624, columns = 212

missing values: 
	count = 19283


Unnamed: 0,feature,missing_val_count,missing_val_percentage
9,trailingPE,902,55.54
4,auditRisk,493,30.36
5,boardRisk,493,30.36
6,compensationRisk,493,30.36
7,shareHolderRightsRisk,493,30.36
...,...,...,...
59,Q2_TOTAL_ASSETS,1,0.06
65,Q2_TOTAL_LIABILITIES_AND_EQUITY,1,0.06
66,Q2_TOTAL_STOCKHOLDERS_EQUITY,1,0.06
96,Q4_TOTAL_LIABILITIES,1,0.06


duplicated records: 
	count = 0

Unique Values : 


Unnamed: 0,feature,nunique_vals
0,Id,1624
1,industry,113
2,sector,10
3,fullTimeEmployees,971
4,auditRisk,10
...,...,...
207,Q10_OPERATING_INCOME,1397
208,Q10_OPERATING_EXPENSES,1402
209,Q10_EBITDA,1398
210,Q10_DEPRECIATION_AND_AMORTIZATION,559


Unnamed: 0,fetaure,data type
0,Id,int64
1,industry,object
2,sector,object
3,fullTimeEmployees,float64
4,auditRisk,float64
...,...,...
207,Q10_OPERATING_INCOME,float64
208,Q10_OPERATING_EXPENSES,float64
209,Q10_EBITDA,float64
210,Q10_DEPRECIATION_AND_AMORTIZATION,float64


In [9]:
missing_vals_test = basic_info(test)

shape of the date : 
	rows = 406, columns = 203

missing values: 
	count = 5791


Unnamed: 0,feature,missing_val_count,missing_val_percentage
9,trailingPE,217,53.45
4,auditRisk,132,32.51
5,boardRisk,132,32.51
6,compensationRisk,132,32.51
7,shareHolderRightsRisk,132,32.51
...,...,...,...
107,Q5_TOTAL_LIABILITIES_AND_EQUITY,1,0.25
91,Q4_TOTAL_STOCKHOLDERS_EQUITY,1,0.25
74,Q3_TOTAL_STOCKHOLDERS_EQUITY,1,0.25
125,Q6_TOTAL_STOCKHOLDERS_EQUITY,1,0.25


duplicated records: 
	count = 0

Unique Values : 


Unnamed: 0,feature,nunique_vals
0,Id,406
1,industry,88
2,sector,10
3,fullTimeEmployees,311
4,auditRisk,10
...,...,...
198,Q10_OPERATING_INCOME,333
199,Q10_OPERATING_EXPENSES,334
200,Q10_EBITDA,330
201,Q10_DEPRECIATION_AND_AMORTIZATION,140


Unnamed: 0,fetaure,data type
0,Id,int64
1,industry,object
2,sector,object
3,fullTimeEmployees,float64
4,auditRisk,float64
...,...,...
198,Q10_OPERATING_INCOME,float64
199,Q10_OPERATING_EXPENSES,float64
200,Q10_EBITDA,float64
201,Q10_DEPRECIATION_AND_AMORTIZATION,float64


In [10]:
train.describe()

Unnamed: 0,Id,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,forwardPE,floatShares,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
count,1624.0,1544.0,1131.0,1131.0,1131.0,1131.0,1131.0,722.0,1436.0,1598.0,...,1449.0,1305.0,1423.0,1321.0,1325.0,1427.0,1430.0,1421.0,1419.0,1624.0
mean,1012.267857,9345.583549,5.564103,5.654288,5.797524,5.58267,5.733864,inf,inf,226069400.0,...,2165070000.0,74836230.0,454767600.0,892565300.0,1355588000.0,119598900.0,383976500.0,68864420.0,-51369510.0,0.004926
std,585.242103,34273.510554,2.85026,2.857969,2.839455,2.769783,2.879112,,,3579918000.0,...,8504373000.0,517313400.0,2242473000.0,4225508000.0,5776379000.0,678874900.0,1677897000.0,1775192000.0,1728854000.0,0.070035
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,8.6e-05,-2085.0,103469.0,...,-18058000000.0,-3944000000.0,-21708000000.0,-1880579000.0,-4500000.0,-4475000000.0,-1439000.0,-50905000000.0,-54182000000.0,0.0
25%,511.75,141.0,3.0,3.0,3.0,3.0,3.0,12.575714,-2.843448,15007460.0,...,22063000.0,-10888000.0,1615874.0,1202000.0,9223996.0,-7866176.0,7696162.0,-10612000.0,0.0,0.0
50%,1009.5,878.0,6.0,6.0,6.0,6.0,6.0,21.599812,9.378077,41034220.0,...,236537000.0,-203624.0,34371000.0,29869000.0,105079000.0,-54815.0,38214000.0,-286224.0,0.0,0.0
75%,1519.25,5040.75,8.0,8.0,8.0,8.0,8.0,37.157093,19.998548,106294900.0,...,997618000.0,24113000.0,210978500.0,311639000.0,603531000.0,42511500.0,174040200.0,37600000.0,0.0,0.0
max,2029.0,500000.0,10.0,10.0,10.0,10.0,10.0,inf,inf,142135700000.0,...,162282900000.0,9497000000.0,29572000000.0,66177000000.0,92400000000.0,11378000000.0,29197000000.0,26568000000.0,25514000000.0,1.0


In [11]:
test.describe()

Unnamed: 0,Id,fullTimeEmployees,auditRisk,boardRisk,compensationRisk,shareHolderRightsRisk,overallRisk,trailingPE,forwardPE,floatShares,...,Q10_TOTAL_STOCKHOLDERS_EQUITY,Q10_NET_INCOME,Q10_GROSS_PROFIT,Q10_COST_OF_REVENUES,Q10_REVENUES,Q10_OPERATING_INCOME,Q10_OPERATING_EXPENSES,Q10_EBITDA,Q10_DEPRECIATION_AND_AMORTIZATION,Q10_fiscal_year_end
count,406.0,384.0,274.0,274.0,274.0,274.0,274.0,189.0,352.0,402.0,...,350.0,318.0,338.0,321.0,321.0,336.0,339.0,336.0,332.0,406.0
mean,1023.428571,8560.729167,5.540146,5.624088,5.923358,5.518248,5.759124,inf,inf,163002000.0,...,3278283000.0,78592570.0,651039200.0,852732000.0,1530937000.0,138191800.0,365195300.0,286412300.0,144951600.0,0.012315
std,590.432542,27004.049445,2.837516,2.835077,2.890803,2.820911,2.884912,,,559985600.0,...,11633430000.0,472926600.0,3871916000.0,3293652000.0,6249743000.0,636069900.0,1096210000.0,3092154000.0,2686686000.0,0.110425
min,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1e-05,-844.5,658244.0,...,-2309400000.0,-1973000000.0,-1181000000.0,-907000000.0,0.0,-1558000000.0,1100.0,-3024000000.0,-4564000000.0,0.0
25%,482.75,114.0,3.0,3.0,3.25,3.0,3.0,11.153847,-2.025302,15648190.0,...,13209930.0,-10989830.0,1317500.0,800000.0,5049000.0,-8700250.0,7633714.0,-12539500.0,0.0,0.0
50%,1045.0,838.0,6.0,6.0,6.0,6.0,6.0,20.783356,9.615789,44199330.0,...,271299000.0,-221000.0,32335000.0,25040000.0,87710000.0,275515.5,46392000.0,-230693.0,0.0,0.0
75%,1538.75,5550.0,8.0,8.0,8.0,8.0,8.0,36.75806,17.707155,111594000.0,...,1357392000.0,32536750.0,281015000.0,384200000.0,699054000.0,62493500.0,181520500.0,50553250.0,0.0,0.0
max,2026.0,339341.0,10.0,10.0,10.0,10.0,10.0,inf,inf,8945318000.0,...,127487100000.0,5105000000.0,62624000000.0,40894000000.0,70196000000.0,6739000000.0,10223000000.0,52401000000.0,45662000000.0,1.0


- there are some outlers,
- and there are some inf values

In [12]:
target_cols = [col for col in train.columns if col not in test.columns]

In [13]:
train.replace([np.inf, -np.inf], np.nan, inplace=True)
test.replace([np.inf, -np.inf], np.nan, inplace=True)

## Data Preprocessing

In [14]:
meta_data_cols = [col for col in train.columns if train[col].dtype != "O" and col[0] != "Q"]
meta_data = train[meta_data_cols]
all_nan_metadata = train[meta_data.isna().all(axis=1)]

train = train.drop(all_nan_metadata.index, axis = 0)
train.shape

(1624, 212)

In [15]:
missing_cat_cols = []
for col in missing_vals_train.feature:
    if train[col].dtype == "O":
        missing_cat_cols.append(col)
        display(missing_vals_train[missing_vals_train.feature == col])

Unnamed: 0,feature,missing_val_count,missing_val_percentage
1,industry,22,1.35


Unnamed: 0,feature,missing_val_count,missing_val_percentage
20,recommendationKey,20,1.23


Unnamed: 0,feature,missing_val_count,missing_val_percentage
31,financialCurrency,20,1.23


In [16]:
for col in [col for col in train.columns if train[col].dtype == "O"]:        
    train[col] = train[col].fillna("Unknown")
    test[col] = test[col].fillna("Unknown")

In [17]:
num_cols = train.columns[train.isna().sum() > 0]
given_info = train.groupby(by = ["industry", "sector"])[num_cols].apply("median").reset_index()
# given_info.head(10)

In [18]:
quarters = ["Q"+str(i) for i in range (0,11)]

quater_wise_cols = {}
for col in [col for col in train.columns if train[col].dtype != "O"]:
    if col[0] == "Q":
        if col[1] + col[2] != "10" and col[1] != "0":
            if col[0:2] in quater_wise_cols.keys():
                quater_wise_cols[col[0:2]].append(col)
            else:
                quater_wise_cols[col[0:2]] = [col] 
        else:
            if "Q10" in col:
                if col[0:3] in quater_wise_cols.keys():
                    quater_wise_cols[col[0:3]].append(col)
                else:
                    quater_wise_cols[col[0:3]] = [col]
            else:
                if col[0:2] in quater_wise_cols.keys():
                    quater_wise_cols[col[0:2]].append(col)
                else:
                    quater_wise_cols[col[0:2]] = [col]  

for quarter in quater_wise_cols.keys():
    print(f"{quarter} : {len(quater_wise_cols[quarter])}")

Q0 : 10
Q1 : 17
Q2 : 17
Q3 : 17
Q4 : 17
Q5 : 17
Q6 : 17
Q7 : 17
Q8 : 17
Q9 : 17
Q10 : 17


In [19]:
neighbour_quarters = {
        "Q2" : ["Q1", "Q3"],
        "Q3" : ["Q2", "Q4"],
        "Q4" : ["Q3", "Q5"],
        "Q5" : ["Q4", "Q6"],
        "Q6" : ["Q5", "Q7"],
        "Q7" : ["Q6", "Q8"],
        "Q8" : ["Q7", "Q8"],
        "Q9" : ["Q8", "Q9"]
}


In [20]:
for col in [col for col in train.columns if train[col].dtype != "O" and col[0] == "Q"]:
    if "Q0" not in col and "Q10" not in col and "Q1" not in col:
        seeing_quarter = col[0:2]
        neighbour_quarter = neighbour_quarters[seeing_quarter]
        feature = col[2:]

        left_neighbour = train.copy()
        right_neighbour = train.copy()

        train_median_left = left_neighbour[neighbour_quarter[0] + feature].median()
        train_median_right = right_neighbour[neighbour_quarter[1] + feature].median()


        left_neighbour[neighbour_quarter[0] + feature] = left_neighbour[neighbour_quarter[0] + feature].fillna(train_median_left)
        right_neighbour[neighbour_quarter[1] + feature] = right_neighbour[neighbour_quarter[1] + feature].fillna(train_median_right)
    
        left_neighbour = left_neighbour.loc[train[col].isna(), neighbour_quarter[0] + feature]
        right_neighbour = right_neighbour.loc[train[col].isna(), neighbour_quarter[1] + feature]

        train.loc[train[col].isna(), col] = (right_neighbour + 2*(left_neighbour))/3

        left_neighbour = test.copy()
        right_neighbour = test.copy()

        left_neighbour[neighbour_quarter[0] + feature] = left_neighbour[neighbour_quarter[0] + feature].fillna(train_median_left)
        right_neighbour[neighbour_quarter[1] + feature] = right_neighbour[neighbour_quarter[1] + feature].fillna(train_median_right)
    
        left_neighbour = left_neighbour.loc[test[col].isna(), neighbour_quarter[0] + feature]
        right_neighbour = right_neighbour.loc[test[col].isna(), neighbour_quarter[1] + feature]

        test.loc[test[col].isna(), col] = (right_neighbour + (2 *left_neighbour))/3
       
        

In [21]:
num_cols = [col for col in train.columns if train[col].dtype != "O"]

for col in num_cols:
    if train[col].isna().sum() > 0:
        print(f"{col} = {train[col].isna().sum()}")

fullTimeEmployees = 80
auditRisk = 493
boardRisk = 493
compensationRisk = 493
shareHolderRightsRisk = 493
overallRisk = 493
trailingPE = 909
forwardPE = 194
floatShares = 26
sharesOutstanding = 23
trailingEps = 47
forwardEps = 194
targetHighPrice = 307
targetLowPrice = 307
targetMeanPrice = 307
targetMedianPrice = 307
recommendationMean = 224
numberOfAnalystOpinions = 307
totalCash = 27
totalCashPerShare = 57
ebitda = 54
totalDebt = 64
totalRevenue = 50
revenuePerShare = 63
freeCashflow = 33
operatingCashflow = 29
revenueGrowth = 112
Q1_TOTAL_ASSETS = 1
Q1_TOTAL_CURRENT_ASSETS = 10
Q1_TOTAL_NONCURRENT_ASSETS = 11
Q1_TOTAL_CURRENT_LIABILITIES = 13
Q1_TOTAL_NONCURRENT_LIABILITIES = 13
Q1_NET_INCOME = 230
Q1_GROSS_PROFIT = 7
Q1_COST_OF_REVENUES = 19
Q1_REVENUES = 19
Q1_OPERATING_INCOME = 7
Q1_OPERATING_EXPENSES = 8
Q1_EBITDA = 8
Q1_DEPRECIATION_AND_AMORTIZATION = 8
Q10_TOTAL_ASSETS = 178
Q10_TOTAL_CURRENT_ASSETS = 186
Q10_TOTAL_NONCURRENT_ASSETS = 189
Q10_TOTAL_LIABILITIES = 175
Q10_TOTAL

In [22]:
for col in train.columns[train.isna().sum() > 0]:
    feature_median = train[col].median()
    train[col] = train[col].fillna(feature_median)
    test[col] = test[col].fillna(feature_median)

In [23]:
print(f"misisng values in train data = {train.columns[train.isna().sum() > 0]}")
print(f"misisng values in test data = {test.columns[test.isna().sum() > 0]}")

misisng values in train data = Index([], dtype='object')
misisng values in test data = Index(['Q1_TOTAL_LIABILITIES', 'Q1_TOTAL_LIABILITIES_AND_EQUITY',
       'Q1_TOTAL_STOCKHOLDERS_EQUITY'],
      dtype='object')


In [24]:
for col in test.columns[test.isna().sum() > 0]:
    feature_median = train[col].median()
    # train[col] = train[col].fillna(feature_median)
    test[col] = test[col].fillna(feature_median)

In [25]:
print(f"misisng values in train data = {train.columns[train.isna().sum() > 0]}")
print(f"misisng values in test data = {test.columns[test.isna().sum() > 0]}")

misisng values in train data = Index([], dtype='object')
misisng values in test data = Index([], dtype='object')


In [26]:
cat_cols = [col for col in train.columns if train[col].dtype == "O" or train[col].nunique() <= 30]

## Model

In [27]:
X = train.drop(target_cols, axis = 1).copy()
y = train[target_cols].copy()

In [None]:
xgb = XGBRegressor(n_jobs = -1)
rf = RandomForestRegressor(n_jobs= -1)
train_preds = {}
test_preds = {}
print('Cross validation R2 scores for each target:\n')
cross_val_score_results = {}
for target in y:

    encoder = TargetEncoder(cols = cat_cols)
    encoder.fit(X, y[target])
    X_target = encoder.transform(X)
    test_target = encoder.transform(test)

    scaler = StandardScaler()
    scaler.fit(X_target)

    X_train_sca = scaler.transform(X_target)
    test_target = scaler.transform(test_target)

    if target in ["Q0_TOTAL_LIABILITIES", "Q0_EBITDA" ,"Q0_GROSS_PROFIT", "Q0_COST_OF_REVENUES", "Q0_OPERATING_EXPENSES"]:
        score = np.round(np.mean(cross_val_score(rf, X_train_sca, y[target], cv=3, scoring='r2')),2)
    else:
        score = np.round(np.mean(cross_val_score(xgb, X_train_sca, y[target], cv=3, scoring='r2')),2)
    cross_val_score_results[target] = score
    
    if target in ["Q0_TOTAL_LIABILITIES", "Q0_EBITDA" ,"Q0_GROSS_PROFIT", "Q0_COST_OF_REVENUES", "Q0_OPERATING_EXPENSES"]:
        rf = RandomForestRegressor(n_jobs = -1)
        rf.fit(X_train_sca, y[target])
        train_preds[target] = rf.predict(X_train_sca)
        test_preds[target] = rf.predict(test_target)
    else:
        xgb = XGBRegressor(n_jobs = -1)
        xgb.fit(X_train_sca, y[target])
        train_preds[target] = xgb.predict(X_train_sca)
        test_preds[target] = xgb.predict(test_target)
    print(f'{target} r2_score -> {score}')
    
print(f'\nMean R2 score across all targets: {np.mean(list(cross_val_score_results.values()))}')

Cross validation R2 scores for each target:

Q0_TOTAL_ASSETS r2_score -> 0.75
Q0_TOTAL_LIABILITIES r2_score -> 0.67
Q0_TOTAL_STOCKHOLDERS_EQUITY r2_score -> 0.9


In [None]:
submission = pd.read_csv("/kaggle/input/financial-performance-prediction/sample_submission.csv")

for target in test_preds:
    submission[target] = test_preds[target]    

submission.to_csv('submission.csv', index = False)