# PYTHON FUNDAMENTAL STOCK ANALYSIS USING SUPERVISED MACHINE LEARNING REGRESSION MODEL

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import sqlalchemy
sqlalchemy.__version__
import sys
sys.version

In [None]:
import pandas as pd
import numpy as np

# Read the raw stock data set into a DataFrame.
# NOTE(review): the path is specific to one machine; adjust it before running elsewhere.
SAMPLEDATA = pd.read_csv(
    filepath_or_buffer="G:/IVY/data/MACHINE_LEARNING/stock.csv",
    sep=',',
    encoding='latin-1',
)
print(type(SAMPLEDATA))

# Raise the display limits so wide/long frames are not truncated in the output.
# (The original referenced `pd.set_option` without calling it, which did nothing.)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
print('Shape before deleting duplicate values:', SAMPLEDATA.shape)

# Drop fully duplicated rows and report the new shape
SAMPLEDATA = SAMPLEDATA.drop_duplicates()
print('Shape After deleting duplicate values:', SAMPLEDATA.shape)

SAMPLEDATA.head(10)

# Defining the problem statement: 
* Target Variable: Graham Price
* Predictors: Intrinsic Value, Book Value, EPS, CMP

### Looking at the distribution of Target variable
* If the target variable's distribution is heavily skewed, predictive modelling becomes unreliable.
* A bell curve is desirable, but a slight positive or negative skew is also fine.

In [None]:
%matplotlib inline
# Creating Bar chart as the Target variable is Continuous
SAMPLEDATA['Graham Price'].hist(figsize=(5,4),color='Orange',bins=10)
plt.show()

In [None]:
import io

# Summary statistics of every column. Printed explicitly, because a notebook
# cell only echoes its last expression.
print(SAMPLEDATA.describe(include='all'))
print("No of null Values per Column :- ")
print(SAMPLEDATA.isnull().sum())
print("***************************************************************")
print("No of Unique Values per Column :- ")
print(SAMPLEDATA.nunique())

# DataFrame.info() writes its report to a buffer and returns None, so the
# report is captured as text to give SAMPLEinfo a usable value.
_info_buffer = io.StringIO()
SAMPLEDATA.info(buf=_info_buffer)
SAMPLEinfo = _info_buffer.getvalue()
print(SAMPLEinfo)

SAMPLEDATA.columns

## missing value treatment

In [None]:
# Impute missing values of the continuous columns with the column median.
# The filled column is assigned back to the frame: calling fillna(inplace=True)
# on a column selection is chained assignment, which pandas warns about and
# which leaves the frame unchanged when Copy-on-Write is enabled.
for column in ['Intrinsic Value', 'Graham Price']:
    SAMPLEDATA[column] = SAMPLEDATA[column].fillna(SAMPLEDATA[column].median())
print(SAMPLEDATA.isnull().sum())

### FEATURE ENGINEERING

In [None]:
# Derived ratio columns, given as {new column: (numerator column, denominator column)}
ratio_definitions = {
    'P/B Ratio': ('CMP', 'Book Value'),
    'P/E Ratio': ('CMP', 'EPS '),
    'Debt/Asset Ratio': ('Debt', 'Assets'),
}
for ratio_name, (numerator, denominator) in ratio_definitions.items():
    SAMPLEDATA[ratio_name] = SAMPLEDATA[numerator].div(SAMPLEDATA[denominator])
SAMPLEDATA

# DATA INTERPRETATIONS

In [None]:
# FOR CATEGORICAL VARIABLES
def PlotBarCharts(inpData, colsToPlot):
    """Draw one bar chart of value frequencies per column, side by side.

    Parameters:
        inpData: DataFrame holding the columns to plot.
        colsToPlot: list of column names; each gets its own subplot showing
            how many rows fall into every distinct value of that column.

    Returns:
        None. The charts are drawn on a newly created matplotlib figure,
        which stays the current figure after the call.
    """
    import matplotlib.pyplot as plt

    # squeeze=False always returns a 2-D array of Axes, so the function also
    # works when a single column is requested (plt.subplots would otherwise
    # return a bare Axes object that cannot be indexed).
    fig, subPlot = plt.subplots(nrows=1, ncols=len(colsToPlot), figsize=(20, 5), squeeze=False)
    fig.suptitle('Bar charts of: ' + str(colsToPlot))

    # One subplot per column: group by the column and plot the group sizes
    for colName, axis in zip(colsToPlot, subPlot.ravel()):
        inpData.groupby(colName).size().plot(kind='bar', color='green', ax=axis)

# Calling the function to chart the value frequencies of the two categorical columns
PlotBarCharts(inpData=SAMPLEDATA, colsToPlot=['G Factor', 'Piotski Scrore'])

In [None]:
# Capping 'G Factor': every value of 8 or more is put into the single bucket 8.
# .loc assigns on SAMPLEDATA itself; the chained form df[col][mask] = value
# may write to a temporary copy and leave the frame unchanged.
SAMPLEDATA.loc[SAMPLEDATA['G Factor'] >= 8, 'G Factor'] = 8
PlotBarCharts(inpData=SAMPLEDATA, colsToPlot=['G Factor', 'Piotski Scrore'])

# exporting the image to the local disk
# (saved before plt.show(): once the figure has been shown it is released,
# and a later savefig would write an empty image)
plt.savefig('Column Charts.png')
plt.show()

In [None]:
# FOR CONTINUOUS VARIABLES
# One histogram per continuous column, all drawn on a single figure
SAMPLEDATA.hist(['Altman Z Scr', 'Enterprise Value',
       'Intrinsic Value', 'Sales', 'Debt', 'Assets', 'Working Capital',
       'Book Value', 'Capital Employed', 'Piotski Scrore', 'Leverage',
       'Reserves', 'EPS ','CMP','P/B Ratio', 'P/E Ratio', 'Debt/Asset Ratio'], figsize=(18,15),color='red')

# exporting the image to the local disk
# (saved before plt.show(): once the figure has been shown it is released,
# and a later savefig would write an empty image)
plt.savefig('Histogram Charts.png')
plt.show()

## Visual exploration of relationship between variables
* Continuous Vs Continuous ---- Scatter Plot
* Categorical Vs Continuous---- Box Plot
* Categorical Vs Categorical---- Grouped Bar Plots

## Statistical measurement of relationship strength between variables
* Continuous Vs Continuous ---- Correlation matrix
* Categorical Vs Continuous---- ANOVA test
* Categorical Vs Categorical--- Chi-Square test

In [None]:
ContinuousCols=['Altman Z Scr', 'Enterprise Value',
       'Intrinsic Value', 'Sales', 'Debt', 'Assets', 'Working Capital',
       'Book Value', 'Capital Employed', 'Piotski Scrore', 'Leverage',
       'Reserves', 'EPS ','CMP','P/B Ratio', 'P/E Ratio', 'Debt/Asset Ratio']

# Plotting scatter chart for each predictor vs the target variable.
# All charts share one figure (one 5x2 row per predictor) so that a single
# image file can hold every chart; without an explicit `ax` each call would
# open its own figure.
fig, scatter_axes = plt.subplots(nrows=len(ContinuousCols), ncols=1,
                                 figsize=(5, 2 * len(ContinuousCols)), squeeze=False)
for predictor, axis in zip(ContinuousCols, scatter_axes.ravel()):
    SAMPLEDATA.plot.scatter(x=predictor, y='Graham Price', ax=axis,
                            title=predictor+" VS "+ 'Graham Price')
fig.tight_layout()

# exporting the image to the local disk
# (saved before plt.show(): once the figure has been shown it is released,
# and a later savefig would write an empty image)
fig.savefig('Scatter Charts.png')
plt.show()

In [None]:
# Columns entering the correlation matrix: the target plus the continuous predictors
ContinuousCols1 = [
    'Altman Z Scr', 'Enterprise Value', 'Graham Price',
    'Intrinsic Value', 'Sales', 'Debt', 'Assets', 'Working Capital',
    'Book Value', 'Capital Employed', 'Piotski Scrore', 'Leverage',
    'Reserves', 'EPS ', 'CMP', 'P/B Ratio', 'P/E Ratio', 'Debt/Asset Ratio',
]

# Pairwise correlation between all of the listed columns
CorrelationData = SAMPLEDATA.loc[:, ContinuousCols1].corr()
CorrelationData

In [None]:
# Keep only the variables whose absolute correlation with the target exceeds 0.5
# (lower the 0.5 threshold if no variable is selected)
target_correlation = CorrelationData['Graham Price']
target_correlation[target_correlation.abs() > 0.5]

In [None]:
# True/False per variable: is its absolute correlation with the target above 0.5?
CorrelationData['Graham Price'].abs() > 0.5

In [None]:
# Box plots of a continuous column grouped by each categorical column
# NOTE(review): the plotted column is 'Enterprise Value', not the target
# 'Graham Price' - confirm that this is the intended variable.
CategoricalColsList=['G Factor', 'Piotski Scrore']

import matplotlib.pyplot as plt
fig, PlotCanvas=plt.subplots(nrows=1, ncols=len(CategoricalColsList), figsize=(20,5))

# One box plot per categorical column, each on its own subplot
for i, PredictorCol in enumerate(CategoricalColsList):
    SAMPLEDATA.boxplot(column='Enterprise Value', by=PredictorCol, figsize=(10,10), vert=True, ax=PlotCanvas[i])

# exporting the image to the local disk
# (saved before plt.show(): once the figure has been shown it is released,
# and a later savefig would write an empty image)
fig.savefig('Box Charts.png')
plt.show()

# Statistical Feature Selection (Categorical Vs Continuous) using ANOVA test

In [None]:
# Defining a function to find the statistical relationship with all the categorical variables
def FunctionAnova(inpData, TargetVariable, CategoricalPredictorList):
    """Select the categorical predictors related to a continuous target via one-way ANOVA.

    For every predictor the target values are split into one group per
    category and scipy.stats.f_oneway is run on those groups. A predictor is
    selected when the resulting p-value is below 0.05. One result line per
    predictor is printed.

    Parameters:
        inpData: DataFrame containing the target and the predictor columns.
        TargetVariable: name of the continuous target column.
        CategoricalPredictorList: list of categorical column names to test.

    Returns:
        List of the predictor names whose p-value is below 0.05, in the
        order in which they were given.
    """
    from scipy.stats import f_oneway

    # Creating an empty list of final selected predictors
    SelectedPredictors = []

    # Rows without a target value are left out: a single NaN inside a group
    # makes the p-value NaN, which would silently reject the predictor.
    usableData = inpData.dropna(subset=[TargetVariable])

    print('##### ANOVA Results ##### \n')
    for predictor in CategoricalPredictorList:
        # One array of target values per category of the predictor
        CategoryGroupLists = [
            groupValues.to_numpy()
            for _, groupValues in usableData.groupby(predictor)[TargetVariable]
        ]

        # ANOVA compares groups with each other, so at least two are required
        if len(CategoryGroupLists) < 2:
            print(predictor, 'has fewer than two categories, ANOVA skipped')
            continue

        AnovaResults = f_oneway(*CategoryGroupLists)

        # If the ANOVA P-Value is <0.05, that means we reject H0
        if AnovaResults[1] < 0.05:
            print(predictor, 'is correlated with', TargetVariable, '| P-Value:', AnovaResults[1])
            SelectedPredictors.append(predictor)
        else:
            # Accepting the H0 if the P value is more than 0.05
            print(predictor, 'is NOT correlated with', TargetVariable, '| P-Value:', AnovaResults[1])

    return SelectedPredictors

# Selecting final predictors for Machine Learning
Based on the above tests, selecting the final columns for machine learning

In [None]:
# Final modelling columns: the four predictors followed by the target
SelectedColumns = ['Intrinsic Value', 'Book Value', 'EPS ', 'CMP', 'Graham Price']

# Restrict the data to those columns
DataForML = SAMPLEDATA.loc[:, SelectedColumns]
DataForML.head(n=5)

In [None]:
# Persist the selected modelling data so it can be reloaded at deployment time
pickle_file = 'DataForML.pkl'
DataForML.to_pickle(pickle_file)

In [None]:
# Load the modelling data back from the pickle file
# (pickle files should only be read when they come from a trusted source)
DataForML = pd.read_pickle('DataForML.pkl')
DataForML.head(n=5)

## Data Pre-processing for Machine Learning
List of steps performed on predictor variables before data can be used for machine learning
1. Converting each Ordinal Categorical columns to numeric
2. Converting Binary nominal Categorical columns to numeric using 1/0 mapping
3. Converting all other nominal categorical columns to numeric using pd.get_dummies()
4. Data Transformation (Optional): Standardization/Normalization/log/sqrt. Important if you are using distance based algorithms like KNN, or Neural Networks

In [None]:
# Dummy-encode every nominal column of the modelling data in one call
DataForML_Numeric = pd.get_dummies(data=DataForML)

# Set the target column from the source data
DataForML_Numeric['Graham Price'] = SAMPLEDATA['Graham Price']

# Show the first rows of the result
DataForML_Numeric.head(n=5)

In [None]:
# Names of the target column and of the predictor columns
TargetVariable = 'Graham Price'
Predictors = ['Intrinsic Value', 'Book Value', 'EPS ', 'CMP']

# Simple Linear Regression (only one predictor)
# Predictors=['']

# Predictor matrix and target vector as NumPy arrays
X = DataForML_Numeric[Predictors].to_numpy()
y = DataForML_Numeric[TargetVariable].to_numpy()

# Polynomial Regression
# Uncomment below lines if you want to perform polynomial regression
#from sklearn.preprocessing import PolynomialFeatures
#poly = PolynomialFeatures(degree = 2, include_bias=False)
#X = poly.fit_transform(X)
#Predictors=poly.get_feature_names()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=41)

# Shapes of the training and testing parts, in the order X_train, y_train, X_test, y_test
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Multiple Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn import metrics

RegModel = LinearRegression()

# Show the model and its parameters
print(RegModel)

# Fit the model on the training part
LREG = RegModel.fit(X_train, y_train)

# Goodness of fit (R2) on the training data
train_predictions = LREG.predict(X_train)
print('R2 Value:', metrics.r2_score(y_train, train_predictions))

###########################################################################
print('\n##### Model Validation and Accuracy Calculations ##########')

# Test predictors, actual target and rounded prediction side by side
prediction = LREG.predict(X_test)
PredictedColumn = 'Predicted' + TargetVariable
TestingDataResults = pd.DataFrame(data=X_test, columns=Predictors)
TestingDataResults[TargetVariable] = y_test
TestingDataResults[PredictedColumn] = np.round(prediction)
print(TestingDataResults.head())

# Absolute percentage error (APE) of every test row
absolute_error = abs(TestingDataResults[TargetVariable] - TestingDataResults[PredictedColumn])
TestingDataResults['APE'] = 100 * (absolute_error / TestingDataResults[TargetVariable])

# Sample of actual value, prediction and error
print(TestingDataResults[[TargetVariable, PredictedColumn, 'APE']].head())


# Mean and median of the row errors, reported as 100 minus the error
MAPE = np.mean(TestingDataResults['APE'])
MedianMAPE = np.median(TestingDataResults['APE'])

Accuracy = 100 - MAPE
MedianAccuracy = 100 - MedianMAPE
print('Mean Accuracy on test data:', Accuracy) # Can be negative sometimes due to outlier
print('Median Accuracy on test data:', MedianAccuracy)


# Defining a custom function to calculate accuracy
# Make sure there are no zeros in the Target variable if you are using MAPE
def Accuracy_Score(orig,pred):
    """Return 100 minus the mean absolute percentage error (MAPE) of a prediction.

    Parameters:
        orig: actual target values (array-like of numbers).
        pred: predicted values, same length as `orig`.

    Returns:
        100 - MAPE, where MAPE is the mean of 100 * |orig - pred| / |orig|.
        The result is not finite when `orig` contains zeros, so zero targets
        have to be removed by the caller.
    """
    # Accept lists and pandas objects as well as NumPy arrays
    orig = np.asarray(orig, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # Dividing by |orig| keeps every percentage error non-negative; with the
    # signed value a negative target would lower the error instead of raising it.
    MAPE = np.mean(100 * (np.abs(orig - pred) / np.abs(orig)))
    return 100 - MAPE

# Custom Scoring MAPE calculation
from sklearn.metrics import make_scorer
custom_Scoring=make_scorer(Accuracy_Score, greater_is_better=True)

# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score

# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
# (cv was 5 although the report below announces 10 folds; it now matches the report)
Accuracy_Values=cross_val_score(RegModel, X , y, cv=10, scoring=custom_Scoring)
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))

## MULTIPLE LINEAR REGRESSION

In [None]:
# Visualizing the line of best fit
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(x=TestingDataResults['CMP'] , y=TestingDataResults['Graham Price'])
plt.scatter(TestingDataResults['CMP'] ,TestingDataResults['PredictedGraham Price'], color='red')

In [None]:
SelectedColumns=['Intrinsic Value','Book Value','EPS ','CMP']
# Looking at the coefficients for each column (M Value).
# Printed explicitly, because a notebook cell only echoes its last expression;
# the values are labelled with the predictor names the model was fitted on.
print(pd.Series(LREG.coef_, index=Predictors))
# Looking at the intercept (C Value)
LREG.intercept_

### DECISION TREE

In [None]:
# Decision Trees (Multiple if-else statements!)
from sklearn.tree import DecisionTreeRegressor
RegModel = DecisionTreeRegressor(max_depth=3, criterion='squared_error')
# Good Range of hyper parameter Max_depth = 2 to 20

# Printing all the parameters of Decision Tree
print(RegModel)

# Creating the model on Training Data
DT=RegModel.fit(X_train,y_train)

from sklearn import metrics
# Measuring Goodness of fit in Training data
print('R2 Value:',metrics.r2_score(y_train, DT.predict(X_train)))

# Plotting the feature importance for Top 10 most important columns
%matplotlib inline
feature_importances = pd.Series(DT.feature_importances_, index=Predictors)
feature_importances.nlargest(10).plot(kind='barh')

###########################################################################
print('\n##### Model Validation and Accuracy Calculations ##########')
prediction=DT.predict(X_test)
# Printing some sample values of prediction
TestingDataResults=pd.DataFrame(data=X_test, columns=Predictors)
TestingDataResults[TargetVariable]=y_test
TestingDataResults[('Predicted'+TargetVariable)]=np.round(prediction)

# Printing sample prediction values
print(TestingDataResults[[TargetVariable,'Predicted'+TargetVariable]].head())

# Calculating the error for each row
TestingDataResults['APE']=100 * ((abs(
  TestingDataResults['Graham Price']-TestingDataResults['PredictedGraham Price']))/TestingDataResults['Graham Price'])

MAPE=np.mean(TestingDataResults['APE'])
MedianMAPE=np.median(TestingDataResults['APE'])

Accuracy =100 - MAPE
MedianAccuracy=100- MedianMAPE
print('Mean Accuracy on test data:', Accuracy) # Can be negative sometimes due to outlier
print('Median Accuracy on test data:', MedianAccuracy)


# Defining a custom function to calculate accuracy
# Make sure there are no zeros in the Target variable if you are using MAPE
def Accuracy_Score(orig,pred):
    """Return 100 minus the mean absolute percentage error (MAPE) of a prediction.

    Parameters:
        orig: actual target values (array-like of numbers).
        pred: predicted values, same length as `orig`.

    Returns:
        100 - MAPE, where MAPE is the mean of 100 * |orig - pred| / |orig|.
        The result is not finite when `orig` contains zeros, so zero targets
        have to be removed by the caller.
    """
    # Accept lists and pandas objects as well as NumPy arrays
    orig = np.asarray(orig, dtype=float)
    pred = np.asarray(pred, dtype=float)

    # Dividing by |orig| keeps every percentage error non-negative; with the
    # signed value a negative target would lower the error instead of raising it.
    MAPE = np.mean(100 * (np.abs(orig - pred) / np.abs(orig)))
    return 100 - MAPE

# Custom Scoring MAPE calculation
from sklearn.metrics import make_scorer
custom_Scoring=make_scorer(Accuracy_Score, greater_is_better=True)

# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score

# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(RegModel, X , y, cv=10, scoring=custom_Scoring)
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))


# exporting the image to the local disk
# (saved before plt.show(): once the figure has been shown it is released,
# and a later savefig would write an empty image)
plt.savefig('Bar Charts.png')
plt.show()

In [None]:
# Load libraries
from IPython.display import Image, display
from sklearn import tree
import pydotplus

# Create DOT data
# (class_names is no longer passed: it names the classes of a classifier and
# RegModel is a regressor)
dot_data = tree.export_graphviz(RegModel, out_file=None,
                                feature_names=Predictors)
# printing the rules
print(dot_data)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# exporting the image to the local disk
# (the tree is rendered by graphviz and not by matplotlib, so it is written
# through the graph object; plt.savefig would store an empty figure)
graph.write_png('Decision Tree.png')

# Show graph
# display() is called explicitly so the image is rendered even when this is
# not the last expression of the cell
display(Image(graph.create_png(), width=1200,height=1500))
# Double click on the graph to zoom in

## EXPLORATORY DATA ANALYSIS

In [None]:
# The ten stocks with the highest Graham Price, shown with their name and CMP
SelectGraham = ['Name', 'Graham Price', 'CMP']
Graham = SAMPLEDATA[SelectGraham]
top_n = Graham.nlargest(n=10, columns='Graham Price')
print(top_n)

## FINAL TESTING RESULT 

In [None]:
# Columns shown in the final result
Selectstock = ['Name', 'Graham Price', 'CMP', 'G Factor', 'Book Value']
Stock = SAMPLEDATA[Selectstock]

# Keep the rows whose Graham Price is above the current market price (CMP)
is_undervalued = Stock['Graham Price'].gt(Stock['CMP'])
UndervaluedStock = Stock.loc[is_undervalued]
print("Filtered rows where Graham Price> Market Value :")
print(UndervaluedStock)

## THANK YOU SO MUCH for reading the code           (This code was written by Somnath Banerjee)           GMail: somnathbanerjee342000@gmail.com