In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime

In [2]:
# import seaborn and matplotlib for charting and data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [3]:
#import plotly and cufflinks for interactive visualisations
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

In [4]:
# To import csv file containing fundamental data downloaded from S&P Capital IQ, as well as other features and the labels for machine learning.
stockdata = pd.read_csv('cleaned stockdata.csv', index_col=0)

In [5]:
stockdata=pd.get_dummies(data=stockdata, columns=['Sector', 'Country.1','IQ_AUDITOR_OPINION'], drop_first=True, dummy_na=True).drop(['Country.1_nan','IQ_AUDITOR_OPINION_nan'],axis=1)

In [6]:
#To create a categorical variable based on effective annual return.
stockdata['Positive or negative returns'] = [1 if x >= 0 else 0 for x in stockdata['Effective annual return']]

# Decision Tree

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [88]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [89]:
predictions = dtree.predict(X_test)

In [90]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.42      0.46      0.44       205
           1       0.82      0.80      0.81       647

    accuracy                           0.72       852
   macro avg       0.62      0.63      0.63       852
weighted avg       0.73      0.72      0.72       852



In [91]:
print(confusion_matrix(y_test,predictions))

[[ 94 111]
 [128 519]]


In [97]:
#Remove the dummy variables for Country.1 except for Country.1_United States
X_train, X_test, y_train, y_test = train_test_split(pd.concat([stockdata.iloc[:,8:48],stockdata['Country.1_United States'],stockdata.iloc[:,95:99]],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [98]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [99]:
predictions = dtree.predict(X_test)

In [100]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.41      0.44      0.42       205
           1       0.82      0.80      0.81       647

    accuracy                           0.71       852
   macro avg       0.61      0.62      0.62       852
weighted avg       0.72      0.71      0.71       852



In [101]:
print(confusion_matrix(y_test,predictions))

[[ 90 115]
 [131 516]]


# Random Forest

In [107]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [108]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [109]:
rfc_pred = rfc.predict(X_test)

In [110]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.67      0.35      0.46       205
           1       0.82      0.95      0.88       647

    accuracy                           0.80       852
   macro avg       0.75      0.65      0.67       852
weighted avg       0.78      0.80      0.78       852



In [111]:
print(confusion_matrix(y_test,rfc_pred))

[[ 71 134]
 [ 35 612]]


In [126]:
#Remove the dummy variables for Country.1 except for Country.1_United States
X_train, X_test, y_train, y_test = train_test_split(pd.concat([stockdata.iloc[:,8:48],stockdata['Country.1_United States'],stockdata.iloc[:,95:99]],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [127]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [128]:
rfc_pred = rfc.predict(X_test)

In [129]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.66      0.35      0.46       205
           1       0.82      0.94      0.88       647

    accuracy                           0.80       852
   macro avg       0.74      0.65      0.67       852
weighted avg       0.78      0.80      0.78       852



In [130]:
print(confusion_matrix(y_test,rfc_pred))

[[ 72 133]
 [ 37 610]]


**Recursive Feature Elimination**

In [139]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [140]:
# Half of the features will be selected by default, by eliminating the least important feature after recursively training.
from sklearn.feature_selection import RFE

rfe_selector = RFE(estimator=RandomForestClassifier(), step = 1)
rfe_selector.fit(X_train, y_train)
pd.DataFrame(X_train,columns=stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1).columns).columns[rfe_selector.get_support()]

Index(['IQ_TOTAL_REV', 'IQ_RETURN_ASSETS', 'IQ_RETURN_EQUITY',
       'IQ_GROSS_MARGIN', 'IQ_EBITDA_MARGIN', 'IQ_NI_MARGIN', 'IQ_ASSET_TURNS',
       'IQ_CURRENT_RATIO', 'IQ_DAYS_SALES_OUT', 'IQ_DAYS_INVENTORY_OUT',
       'IQ_DAYS_PAYABLE_OUT', 'IQ_TOTAL_DEBT_EQUITY',
       'IQ_TOTAL_REV_1YR_ANN_GROWTH', 'IQ_NI_1YR_ANN_GROWTH',
       'IQ_CFO_1YR_ANN_GROWTH', 'IQ_TOTAL_ASSETS_1YR_ANN_GROWTH', 'IQ_EBITDA',
       'IQ_PAYOUT_RATIO', 'IQ_CASH_ST_INVEST', 'IQ_NPPE', 'IQ_GW', 'IQ_RE',
       'IQ_TBV', 'IQ_NET_DEBT', 'IQ_CONTINGENT_LIABILITIES', 'IQ_CASH_OPER',
       'IQ_CASH_INVEST', 'IQ_NET_CHANGE', 'Sector_Consumer Discretionary',
       'Sector_Energy', 'Sector_Finance', 'Sector_Health Care',
       'Sector_Industrials', 'Sector_Real Estate', 'Sector_Technology',
       'Sector_Telecommunications', 'Sector_Utilities', 'Sector_nan',
       'Country.1_Canada', 'Country.1_China', 'Country.1_Greece',
       'Country.1_United States', 'IQ_AUDITOR_OPINION_Unqualified',
       'IQ_AUDITOR_OP

In [141]:
X_train, X_test, y_train, y_test = train_test_split(stockdata[['IQ_TOTAL_REV', 'IQ_RETURN_ASSETS', 'IQ_RETURN_EQUITY',
       'IQ_GROSS_MARGIN', 'IQ_EBITDA_MARGIN', 'IQ_NI_MARGIN', 'IQ_ASSET_TURNS',
       'IQ_CURRENT_RATIO', 'IQ_DAYS_SALES_OUT', 'IQ_DAYS_INVENTORY_OUT',
       'IQ_DAYS_PAYABLE_OUT', 'IQ_TOTAL_DEBT_EQUITY',
       'IQ_TOTAL_REV_1YR_ANN_GROWTH', 'IQ_NI_1YR_ANN_GROWTH',
       'IQ_CFO_1YR_ANN_GROWTH', 'IQ_TOTAL_ASSETS_1YR_ANN_GROWTH', 'IQ_EBITDA',
       'IQ_PAYOUT_RATIO', 'IQ_CASH_ST_INVEST', 'IQ_NPPE', 'IQ_GW', 'IQ_RE',
       'IQ_TBV', 'IQ_NET_DEBT', 'IQ_CONTINGENT_LIABILITIES', 'IQ_CASH_OPER',
       'IQ_CASH_INVEST', 'IQ_NET_CHANGE', 'Sector_Consumer Discretionary',
       'Sector_Energy', 'Sector_Finance', 'Sector_Health Care',
       'Sector_Industrials', 'Sector_Real Estate', 'Sector_Technology',
       'Sector_Telecommunications', 'Sector_Utilities', 'Sector_nan',
       'Country.1_Canada', 'Country.1_Greece', 'Country.1_Israel',
       'Country.1_United States', 'IQ_AUDITOR_OPINION_Unqualified',
       'IQ_AUDITOR_OPINION_Unqualified Going Concern',
       'IQ_AUDITOR_OPINION_Unqualified with Additional Language']], 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [142]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [143]:
rfc_pred = rfc.predict(X_test)

In [144]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.65      0.35      0.46       205
           1       0.82      0.94      0.88       647

    accuracy                           0.80       852
   macro avg       0.74      0.65      0.67       852
weighted avg       0.78      0.80      0.78       852



In [145]:
print(confusion_matrix(y_test,rfc_pred))

[[ 72 133]
 [ 38 609]]


In [147]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1), 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [148]:
# 10 features to be selected.
from sklearn.feature_selection import RFE

rfe_selector = RFE(estimator=RandomForestClassifier(), step = 1, n_features_to_select=10)
rfe_selector.fit(X_train, y_train)
pd.DataFrame(X_train,columns=stockdata.drop(['Symbol','longName','Name','IPO Year','Effective annual return','CIQ ID', 'Ticker', 'Short Business Description','Positive or negative returns'],axis=1).columns).columns[rfe_selector.get_support()]

Index(['IQ_TOTAL_REV', 'IQ_RETURN_ASSETS', 'IQ_EBITDA_MARGIN',
       'IQ_TOTAL_REV_1YR_ANN_GROWTH', 'IQ_TOTAL_ASSETS_1YR_ANN_GROWTH',
       'IQ_EBITDA', 'IQ_RE', 'IQ_TBV', 'IQ_NET_DEBT', 'IQ_CASH_OPER'],
      dtype='object')

In [150]:
X_train, X_test, y_train, y_test = train_test_split(stockdata[['IQ_TOTAL_REV', 'IQ_RETURN_ASSETS', 'IQ_EBITDA_MARGIN',
       'IQ_TOTAL_REV_1YR_ANN_GROWTH', 'IQ_TOTAL_ASSETS_1YR_ANN_GROWTH',
       'IQ_EBITDA', 'IQ_RE', 'IQ_TBV', 'IQ_NET_DEBT', 'IQ_CASH_OPER']], 
                                                    stockdata['Positive or negative returns'], test_size=0.30, 
                                                    random_state=101)

In [151]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [152]:
rfc_pred = rfc.predict(X_test)

In [153]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       0.58      0.25      0.35       205
           1       0.80      0.94      0.86       647

    accuracy                           0.78       852
   macro avg       0.69      0.60      0.61       852
weighted avg       0.75      0.78      0.74       852



In [154]:
print(confusion_matrix(y_test,rfc_pred))

[[ 52 153]
 [ 38 609]]


It seems with less features, the random forest performs worse. No further feature selection is performed.

**Decision Tree and Random Forest model metrics**

|**Features**|**Accuracy**|**Weight avg f1-score**|
|---------------------------|-----|---|
|Decision tree with all 91 features including all dummy variables|0.72|0.72|
|Decision tree With Country.1 dummy variables removed except Country.1_United States (45 features)|0.71|0.71|
|Random forest with all 91 features including all dummy variables|**0.80**|**0.78**|
|Random forest with Country.1 dummy variables removed except Country.1_United States (45 features)|**0.80**|**0.78**|
|Random forest with recursive feature elimination with all dummy variables (45 features)|**0.80**|**0.78**|
|Random forest with recursive feature elimination with all dummy variables (10 features)|0.78|0.74|