**Import all the required libraries**

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import statistics

**The data consists of data points such as GrossMargin, WorkingCapitalRatio and EarningPerShare for each company. From these data points we calculate a variety of ratios using mathematical formulae. Finally, each ratio has a label (0 or 1) that depends on how its actual value compares with the ideal value it should have.**

**Import the data, and create a column 'aggregate' which stores the mean of all the label columns**

In [2]:
# Columns holding the 0/1 label attached to each calculated ratio.
label_cols = [
    'wce_label', 'eps_label', 'de_label', 'pe_label', 'roe_label',
    'growth_rate_label', 'profitm_label', 'grossm_label', 'ro40_label',
    'churnrate_label', 'EVbyEbidta_label', 'marketCap_label', 'magicNum_label',
]

# Load the records and key every row by its 'cik_date' value.
data = pd.read_csv("company_data.csv").set_index('cik_date')

# A missing label is counted as 0 before the labels are averaged.
data[label_cols] = data[label_cols].fillna(0)

# 'aggregate' is the row-wise mean of the label columns.
data['aggregate'] = data[label_cols].mean(axis=1, skipna=True)

**We fill every missing data point with the average value of its column**

In [3]:
# Mean of every column with NaNs skipped; a mean that is itself NaN becomes 0.
column_means = data.mean(axis=0, skipna=True).fillna(0)
avg_values = column_means.to_dict()

# Fill the gaps in each column with that column's mean.
for column_name in data.columns:
  fill_value = avg_values[column_name]
  data[column_name] = data[column_name].fillna(fill_value)

***For training our model, the input features are the data points and calculated ratios, while the output feature is the 'aggregate' column, which is the mean of all the label columns***

In [4]:
# Model inputs: the raw data points together with the ratios calculated from them.
feature_columns = [
    'GrossProfit', 'GrossMargin', 'WorkingCapitalRatio', 'EarningPerShare',
    'DebtToEquityRatio', 'PEratio', 'ReturnOfEquity', 'EBIDTAratio',
    'EvRatio', 'EVbyEbidta', 'ChurnRate', 'GrowthRate',
    'ProfitMargin', 'RuleOf40', 'MarketCap', 'MagicNumber',
]
X = data[feature_columns]

# Regression target: the per-row mean of the label columns.
y = data['aggregate']

In [5]:
# Average 'aggregate' score over every row of the dataset.
mean_aggregate = statistics.mean(list(y))
print(mean_aggregate)

0.29213483146067415


***Split the dataset into train and test***

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((124, 16), (54, 16), (124,), (54,))

**We perform regression on the 'aggregate' value**

In [8]:
from xgboost import XGBRegressor

# Hyper-parameters of the boosted-tree regressor, gathered in one place.
regressor_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
xgb_regressor = XGBRegressor(**regressor_params)

# Fit on the training split, then predict 'aggregate' for the held-out rows.
xgb_regressor.fit(X_train, y_train)
xgb_test_preds = xgb_regressor.predict(X_test)



In [9]:
from sklearn.metrics import mean_squared_error

# Mean squared error between actual and predicted 'aggregate' on the test split.
actual_scores = list(y_test)
error = mean_squared_error(actual_scores, xgb_test_preds)
print(error)

0.002971415952212202


In [10]:
# Predict for every row of X, which includes the rows the model was trained on.
final_preds = xgb_regressor.predict(X)

# Attach the predictions as a new column next to the inputs.
data = data.assign(predicted_aggregate=final_preds)

# Write the frame, including 'predicted_aggregate', to disk.
data.to_csv('predicted_company_scores.csv')

In [11]:
import pickle

# Serialise the fitted regressor to disk using pickle protocol 2.
with open('xgbmodel_regression.pkl', 'wb') as model_file:
    pickle.dump(xgb_regressor, model_file, protocol=2)

**Based on the output of the regression model, we generate labels 0,1,2 using some threshold values**

In [19]:
def findLabel(x, low=0.1538, high=0.3556):
  """Bucket an aggregate score into label 0, 1 or 2.

  Args:
    x: Score to classify (an actual or a predicted 'aggregate' value).
    low: Inclusive upper bound of label 0.
    high: Inclusive upper bound of label 1.

  Returns:
    0 when x <= low, 1 when low < x <= high, and 2 otherwise.
  """
  # The lowest bucket has no lower bound: a regression output that dips
  # slightly below zero must stay with the lowest scores instead of falling
  # through to the top bucket.
  if x <= low:
    return 0
  if x <= high:
    return 1
  return 2

In [20]:
# Empty containers for the bucketed actual and predicted scores.
test_labels, pred_labels = [], []

In [21]:
# Bucket the actual and the predicted test scores with findLabel.
# Both lists are rebuilt from scratch so that re-running this cell cannot
# append a second copy of the labels.
# The actual scores are read by position (to_numpy) rather than with
# y_test[ind]: y_test keeps the index of the original frame, so an integer
# key is not a reliable way to address "the ind-th row".
test_labels = [findLabel(test_score) for test_score in y_test.to_numpy()]
pred_labels = [findLabel(pred_score) for pred_score in xgb_test_preds]

# Number of test rows that were labelled.
count = len(test_labels)
print(count)

54


In [22]:
# Labels obtained by bucketing the actual 'aggregate' values of the test split.
print(test_labels)

[0, 1, 2, 0, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 2, 1, 0, 2, 0, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 0, 2, 0, 1, 1, 2, 1]


In [23]:
# Labels obtained by bucketing the regressor's predictions for the test split.
print(pred_labels)

[0, 1, 2, 0, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 2, 1, 0, 2, 0, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 0, 2, 0, 0, 1, 2, 1]


In [24]:
# Sizes of the two label lists, printed side by side.
label_counts = (len(test_labels), len(pred_labels))
print(*label_counts)

54 54


**Using the labels derived from the regression outputs and the threshold values, we calculate the accuracy and the per-class precision, recall and F1 score**

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall share of test rows whose predicted label matches the actual label.
accuracy = accuracy_score(test_labels, pred_labels)

# average=None yields one score per label instead of a single averaged number.
precision, recall, f1 = (
    metric(test_labels, pred_labels, average=None)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"accuracy: {accuracy}")
print(f"precision_score: {precision}")
print(f"recall_score: {recall}")
print(f"f1_score: {f1}")

accuracy: 0.8888888888888888
precision_score: [0.92857143 0.81481481 1.        ]
recall_score: [1.         0.95652174 0.72222222]
f1_score: [0.96296296 0.88       0.83870968]
