-
Notifications
You must be signed in to change notification settings - Fork 7
/
Program.cs~
627 lines (566 loc) · 41.8 KB
/
Program.cs~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
using System;
using System.Collections.Generic;
using System.Linq;
using MathNet.Numerics.LinearRegression;
using MathNet.Numerics.LinearAlgebra;
using MathNet.Numerics.LinearAlgebra.Double;
using MathNet.Numerics.Distributions;
/*
* The Following Nuget Packages must be available:
* - MathNet.Numerics
*/
/*
* This is a c# implementation of (two step) the Engel-Granger Test for Cointegration.
* The test considers two stocks and analyses them for possible cointegration (up to 3rd degree).
* Data needs to be provided. The format expected is the same as from historic price data csv files downloaded from Yahoo finance.
* To use the programm simply copy the path to the respective files in to the console, when asked for it.
* The second Part of this programm simulates a Pairs Trading Strategy, for which the Engel-Granger Test is considered to be a good succes predictor -> cointegration makes profitability more likely
*/
/*
* Some inspirations for pairs to be tested, I can send the according data via email, but it is also available on Yahoo Finance (download historic data):
* - CXW and GEO -> Two american prison stocks
* - ROG.SW and NOVN.SW -> two swiss pharmaceutical giants
*/
/*
* PAIRS TRADING ANALYSIS:
* Motivation: Stock Prices could be cointegrated, that is be a function of another stock price
* A simple relation between two stocks could be to converge back to the same relative valuation
 * Thus, we can try to model the relative valuation of the two stocks by fitting simple polynomials
* to the timeseries of <Stock Price 2>/ <Stock Price 1>, given the model approximates the relation between
* the two stocks relatively well, we can classify those stocks as being relatively overvalued and being relatively undervalued,
* Opening according trade positions, long on the undervalued and short on the overvalued stock,
* the strategy should realize positive retrunrs, to find appropriate entry and exit points for these trades, the deviation of the ratio
* from the models prediction can be used
*/
/*
* Given the RatioTS, we now want to explain it, by using a constant, a linear or a quadratic polinomial,
* To evaluate what is the best model explaining the ratio we want to perform perform an ADF Test on the residuals
* Thus, testing for stationarity, the simple Dickey-Fuller Test is designed to test a Time Series generated by an
* Auto-regressive process of order one, by regressing Y_t on Y_{t-1} and performing a t-test for the significance of the intercept.
* The Adjusted DF (ADF) test, is similar however, the regression includes multiple lags.
* A low test-statistic, and thus a low p-Value indicates that the Null-Hypothesis of the residual being a unit-root process
 * can be rejected, therefore indicating that both stocks are cointegrated.
* Methods for Testing Cointegration: Johanssen, Engel-Granger, Phillips-Ouliaris
* Define Cointegration: test both Series for a unit-root, if unit root cannot be rejected
* test cointegration among components
*
*/
/*
* ADDITIONAL NOTE:
* Against my expectations, there is no free of charge C# package that allows for easy statistical inference, the Augmented Dickey Fuller test requires to have the estimator errors of a regression to calculate test statistics.
* Even though this sounds trivial, packages such as ML.Net do not support standard errors of estimators for multiple regressions. In fact there appear only two packages
* "Extreme Optimization" and "Center Space" that are capable of providing such functionality, in an easy way. As their licenses cost each above USD 1000 I needed to find a different soluton to this problem.
* Therefore I created the below used MultipleRegressionModel class which uses the MathNet Nuget package and its provided regression function, to calculate Multiple regressions, I added several functions to the class,
* to make basic statistics (such as r-squared and regression estimat erorrors,t-statistics and p-values) easy accessible. This was not initially planned to be part of the project and represents a major obstacle that I had to overcome!
* according to wikipedia there is no C# package providing thes functions, in opposite to most languges such as python, R or Java
* Feedback, Improvement suggestions are very welcome
 */
namespace PTA
{
class Program
{
    /// <summary>
    /// Performs the (two-step) Engle-Granger cointegration test on two stock price series read from
    /// Yahoo-Finance style CSV files, then simulates a simple Pairs Trading strategy on the pair.
    /// </summary>
    static void Main(string[] args)
    {
        /* The Program aims to perform an Augmented Dickey-Fuller (ADF) Test on the relation of two time series.
         * To my knowledge, there is no free implementation of the test in C#; the TimeSeriesTests /
         * MultipleRegressionModel classes below provide the regression machinery needed for the test statistic.
         */
        Console.WriteLine("Reading csv data, files in the format as downloaded from yfinance are expected!");
        Console.WriteLine("Please provide the Path to your first csv file:");
        string Path1;
        string Path2;
        //Path1 = Console.ReadLine();
        Console.WriteLine("Please provide the Path to your second csv file:");
        //Path2 = Console.ReadLine();
        // Example of how to hard-code paths: assign values as below and comment out the ReadLine calls above.
        Path1 = "/Users/Tobias/Downloads/CXW.csv";
        Path2 = "/Users/Tobias/Downloads/GEO.csv";
        //Path1 = "/Users/Tobias/Downloads/NOVN.SW.csv";
        //Path2 = "/Users/Tobias/Downloads/ROG.SW.csv";

        // Reads Date (column 0) and Close (column 4) from a csv file; malformed or incomplete rows are
        // skipped with a single warning rather than aborting the whole parse.
        List<ParsedFile> ParseCSV(string path, string seperator = ",", bool header = true, int skip = 0)
        {
            var file = System.IO.File.ReadAllLines(@path);
            bool warning = false;
            if (header) { skip += 1; }
            List<ParsedFile> data = new List<ParsedFile>();
            // BUGFIX: the original shifted the array in place with Array.Copy and then iterated the FULL
            // array, which left the last <skip> rows duplicated at the tail (the final data row was parsed
            // twice). Enumerable.Skip avoids the duplication.
            foreach (string row in file.Skip(skip))
            {
                try
                {
                    string[] observations = row.Split(seperator);
                    data.Add(new ParsedFile { Date = Convert.ToDateTime(observations[0]), ClosingPrice = Convert.ToDouble(observations[4]) });
                }
                catch
                {
                    if (!warning)
                    {
                        Console.WriteLine($"Sometimes these files contain data that violates the expected format or that is missing. While reading in the file: '{path}' some errors occurred, the program will continue with the readable data!");
                        warning = true;
                    }
                }
            }
            return data;
        }

        /*
         * We cannot expect the files to contain observations for exactly the same dates!
         * Therefore, we must select the matching subset of both files.
         */
        var parsed1 = ParseCSV(Path1);
        var parsed2 = ParseCSV(Path2);
        // FIX: reuse the already-parsed lists; the original called ParseCSV twice more, re-reading both files.
        DateTime[] Dates1 = parsed1.Select(r => r.Date).ToArray();
        DateTime[] Dates2 = parsed2.Select(r => r.Date).ToArray();
        DateTime[] DatesInBoth = Dates1.Intersect(Dates2).ToArray(); // Intersection = dates we can compare
        // Using the intersection, a dictionary of closing prices is defined. The key is two-dimensional so
        // that data can be accessed by (Date, column name), e.g. C1/C2 for the two stocks' closing prices.
        Dictionary<(DateTime, string), float> MergedData = new Dictionary<(DateTime, string), float>();
        // PERF: HashSet gives O(1) membership; DatesInBoth.Contains inside Where was O(n^2) overall.
        var dateSet = new HashSet<DateTime>(DatesInBoth);
        foreach ((DateTime Date, float ClosingPrice) in parsed1.Where(r => dateSet.Contains(r.Date)).Select(r => (r.Date, (float)r.ClosingPrice))) { MergedData[(Date, "C1")] = ClosingPrice; }
        foreach ((DateTime Date, float ClosingPrice) in parsed2.Where(r => dateSet.Contains(r.Date)).Select(r => (r.Date, (float)r.ClosingPrice))) { MergedData[(Date, "C2")] = ClosingPrice; }

        // Helper to extract one column of MergedData as a time series aligned with DatesInBoth.
        float[] Column(string colname)
        {
            float[] values = new float[DatesInBoth.Length];
            for (int i = 0; i < values.Length; i++) { values[i] = MergedData[(DatesInBoth[i], colname)]; }
            return values;
        }

        // Note: the TimeSeriesTests class is defined below and is a key component of the project.
        TimeSeriesTests tst = new TimeSeriesTests();
        // Lag for lagged-series tests: at least 10, growing to 1% of the observations for large series
        // (e.g. 8000 data points would yield regressions with 80 lags).
        int lags = Math.Max(10, DatesInBoth.Length / 100);
        // ADF test: a series is classified as likely stationary if its p-value is below 1% (0.01);
        // for p-values >= 0.01 the program assumes non-stationarity.
        float ADFp1 = tst.ADF(Column("C1"), lags);
        float ADFp2 = tst.ADF(Column("C2"), lags);
        bool stationary1 = ADFp1 < 0.01;
        bool stationary2 = ADFp2 < 0.01;
        string text1 = stationary1 ? "stationary" : "non-stationary";
        string text2 = stationary2 ? "stationary" : "non-stationary";
        Console.WriteLine($"\nThe Augmented Dickey-Fuller test yields a p-value of {ADFp1} for stock 1, concluding the Timeseries is {text1}.\nThe Augmented Dickey-Fuller test yields a p-value of {ADFp2} for stock 2, concluding the Timeseries is {text2}.");
        bool simulateTrades = true;

        // Analysing for cointegration.
        // BUGFIX: the original tested (stationary1 | stationary1) — stock 2's result was never consulted.
        if (!(stationary1 || stationary2))
        {
            // Both series are non-stationary; identify their order of cointegration (tested up to 3rd order).
            // Higher degrees could be tested by adjusting the parameter but would be difficult to interpret.
            Console.WriteLine("\nPerforming the Engle-Granger Test to estimate whether the stocks might be cointegrated... ");
            double[] series1 = Column("C1").Select(x => (double)x).ToArray();
            double[] series2 = Column("C2").Select(x => (double)x).ToArray();
            // FIX: pass the series computed above instead of recomputing them inline.
            int DegreeOfCointegration = tst.EngelGrangerDegreeOfCointegration(series1, series2, 3, lags);
            if (DegreeOfCointegration >= 1 && DegreeOfCointegration <= 3)
            {
                Console.WriteLine($"The two Stocks are Cointegrated. Their degree of cointegration is {DegreeOfCointegration}!");
                if (DegreeOfCointegration == 1)
                {
                    Console.WriteLine("This indicates trading the Pairs Trading strategy could yield high returns for those two stocks!");
                }
                else if (DegreeOfCointegration == 2)
                {
                    Console.WriteLine("This indicates trading the Pairs Trading strategy could yield high returns, but there might be more profitable pairs to trade!");
                }
                else
                {
                    Console.WriteLine("This indicates trading the Pairs Trading strategy might be profitable, but the relationship of these two stocks is rather weak!");
                }
            }
            else
            {
                Console.WriteLine("The two timeseries are not stationary, nor are they cointegrated of order 1, 2 or 3. Therfore, it is not advisable to try to apply the Pairs Trading strategy to this pair of Stocks!\nEnter Y to continue with the simulation anyway, otherwise the program will end:");
                // ROBUSTNESS: ReadLine can return null (e.g. redirected/closed stdin) — guard before ToLower.
                string answer = (Console.ReadLine() ?? "").Trim().ToLower();
                simulateTrades = answer == "y"; // Ends the program if the input is not y or Y
            }
        }
        else
        {
            Console.WriteLine("At least one of the time series is already stationary!");
        }

        // Run the simulation, if desired by prompt or deemed reasonable by the program.
        if (simulateTrades)
        {
            Console.WriteLine("\nPairs Trading Strategy: assuming the two stocks have a constant relationship (mean ratio), and constant variance of this relation.");
            Console.WriteLine("Temporary outbreaks of the relationship (measured by standard deviation) are traded by shorting the relatively overvalued stock and by opening a long position on the relatively undervalued stock (equally weighted),");
            Console.WriteLine("the positions are closed when the relative valuation of the two stocks returns to the expected value (here a constant mean).");
            Console.WriteLine("Evaluating the Pairs Trading Strategy...\n");
            // The program assumes the relative valuation follows a distribution with constant (or moving,
            // configurable below) mean and variance. Taking mean/std from the full sample introduces a
            // forward-looking bias (not the case with the moving average); a better approach would fit on a
            // fraction of the data and evaluate out of sample. Note also that profitability depends on which
            // stock is treated as stock 1 vs stock 2 (run with flipped inputs to see this).
            // A high ratio means Stock 1 is relatively overvalued, a low ratio means Stock 2 is relatively
            // overvalued; a ratio close to its average indicates a "fair" relative valuation.
            float[] ratioTS = Column("C1").Zip(Column("C2"), (first, second) => (float)first / second).ToArray();
            float avgRatio = ratioTS.Average();
            // Sample standard deviation of the ratio (divisor n-1).
            float stdRatio = (float)Math.Sqrt(ratioTS.Select(x => Math.Pow((x - avgRatio), 2)).Sum() / (ratioTS.Length - 1));
            // Size of an actionable deviation: under normality 68% of mass lies within 1 std and 95% within
            // 2 std of the mean; we want to trade only a few, hopefully very profitable, deviations, so the
            // action level is set between 1 and 2.
            float actionDeviation = (float)1.8; // Open a trade when the ratio deviates more than 1.8 std from the mean
            // The portfolio starts holding only cash. The program iterates through the dates; any decision
            // made on one day is executed at the NEXT day's closing prices (the only observed prices).
            // Possible actions: short stock 1 / long stock 2, long stock 1 / short stock 2, or close positions.
            // Transaction costs are ignored.
            double CumReturn = 0;            // Tracks the cumulative return of the strategy
            bool tradeIsOpen = false;        // Is there an open position?
            bool DayIsActionDay = false;     // Did the program decide the day before to act today?
            bool StockOneIsShort = false;    // Is an open position short stock 1 (true) or long stock 1 (false)?
            int counter = 1;                 // Day counter, used to force-close open positions on the last day
            // 'z' is the decision criterion; depending on the setup below it uses a static mean or a moving average.
            bool useBackwardLookingMA = true; // Use a moving average?
            int movingAverageBasis = 50;      // How many past observations feed the moving average/std
            List<float> lastRatios = new List<float>(); // Observed ratios, for the moving average
            double lastZ = 0;                // z of the previous day, to detect mean-crossings (close signal)
            // PERF: List instead of repeated Append(...).ToArray(), which reallocated the array per trade.
            List<double> returnSeries = new List<double>(); // Memory of the returns of each trade
            double[] bought = new double[2]; // Prices at which the current trade was opened [C1, C2]
            // Display which average is used.
            if (useBackwardLookingMA)
            {
                Console.WriteLine($"A moving average is used to improve results.");
            }
            else
            {
                Console.WriteLine($"A static average is used, this will likely yield suboptimal trading results.");
            }
            Console.WriteLine();

            // Actual simulation.
            foreach (DateTime date in DatesInBoth)
            {
                float p1 = MergedData[(date, "C1")];
                float p2 = MergedData[(date, "C2")];
                double ratioNow = p1 / p2;
                double z = 0;
                // Define 'z' depending on the chosen average -> static or moving.
                if (!useBackwardLookingMA)
                {
                    z = (ratioNow - avgRatio) / stdRatio; // constant-mean model
                }
                else if (lastRatios.Count >= movingAverageBasis)
                {
                    var lastNRatios = lastRatios.Skip(lastRatios.Count - movingAverageBasis);
                    float movingAverage = lastNRatios.Average();
                    float movingStd = (float)Math.Sqrt(lastNRatios.Select(x => Math.Pow((x - movingAverage), 2)).Sum() / (movingAverageBasis - 1));
                    z = (ratioNow - movingAverage) / movingStd;
                }
                // With the static average decisions can be made instantly; otherwise wait until enough
                // observations exist to compute z properly.
                if (!useBackwardLookingMA || lastRatios.Count >= movingAverageBasis)
                {
                    // If the program decided on the previous day to take an action:
                    if (DayIsActionDay)
                    {
                        if (!tradeIsOpen)
                        {
                            // Open a new position at today's prices.
                            bought[0] = p1;
                            bought[1] = p2;
                            tradeIsOpen = true;
                            if (StockOneIsShort)
                            {
                                Console.WriteLine($"{date}: Opened a short position on Stock 1 at {p1} and a long position on Stock 2 at {p2}, ratio {p1 / p2}");
                            }
                            else
                            {
                                Console.WriteLine($"{date}: Opened a longposition on Stock 1 at {p1} and a short position on Stock 2 at {p2}, ratio {p1 / p2}");
                            }
                        }
                        else
                        {
                            // Close the existing position.
                            double r1; // Return on Stock 1 leg
                            double r2; // Return on Stock 2 leg
                            if (StockOneIsShort)
                            {
                                r1 = (bought[0] - p1) / bought[0]; // close short on stock 1
                                r2 = (p2 - bought[1]) / bought[1]; // close long on stock 2
                            }
                            else
                            {
                                r1 = (p1 - bought[0]) / bought[0]; // close long on stock 1
                                r2 = (bought[1] - p2) / bought[1]; // close short on stock 2
                            }
                            // Equal weights: the trade return is the simple average of both legs.
                            double r = (r1 + r2) / 2;
                            CumReturn = (CumReturn + 1) * (1 + r) - 1; // compound into the cumulative return
                            returnSeries.Add(r);
                            tradeIsOpen = false; // next action will open a fresh trade -> bought[] gets updated
                            Console.WriteLine($"{date}: Closed the open positions at a ratio of {p1 / p2}, the return of this trade was {100 * r}%, the cumulative return of the strategy is {100 * CumReturn}%");
                        }
                        DayIsActionDay = false; // actions done; the logic below decides about tomorrow
                    }
                    if (z > actionDeviation)
                    {
                        // Stock 1 relatively overvalued: short stock 1, long stock 2 (if no open trade).
                        if (!tradeIsOpen)
                        {
                            StockOneIsShort = true;
                            DayIsActionDay = true; // executed next day
                            Console.WriteLine($"{date}: Decision to short Stock 1 ({p1}) and go long on Stock2 ({p2}), Ratio {p1 / p2} ");
                        }
                    }
                    else if (z < -actionDeviation)
                    {
                        // Stock 1 relatively undervalued: long stock 1, short stock 2 (if no open trade).
                        if (!tradeIsOpen)
                        {
                            StockOneIsShort = false;
                            DayIsActionDay = true; // executed next day
                            Console.WriteLine($"{date}: Decision to go long on Stock 1 ({p1}) and short on Stock2 ({p2}), Ratio {p1 / p2} ");
                        }
                    }
                    // Close condition: (second-to-last day, or z changed sign) and a position is open.
                    if (tradeIsOpen && (counter == DatesInBoth.Length - 1 || (lastZ > 0 && z < 0) || (lastZ < 0 && z > 0)))
                    {
                        DayIsActionDay = true;
                        string position = StockOneIsShort ? "short on Stock 1 and long on Stock 2" : "long on Stock 1 and short on Stock 2";
                        Console.WriteLine($"{date}: Decision to close the open position: {position}!");
                    }
                }
                lastRatios.Add((float)ratioNow); // feed the moving average
                lastZ = z;                       // remember z for tomorrow's crossing check
                counter++;                       // count days
            }
            Console.WriteLine($"\nThe Cumulative return of the Pairs Trading Strategy between {DatesInBoth.Min()} and {DatesInBoth.Max()} for the selected stock pair would have been: {100 * CumReturn}%");
        }
        Console.WriteLine("\nEnd of Program!");

        // Unit tests.
        // The EngelGranger function only combines simple regression with ADF on the residuals, so these
        // tests also cover most of the EngelGranger functionality. Possible extensions: tests on generated
        // cointegrated series of different orders.
        bool performUnitTests = false;
        // Test simple linear regression behavior.
        if (performUnitTests)
        {
            MultipleRegressionModel regModel = new MultipleRegressionModel();
            TimeSeriesTests TestTst = new TimeSeriesTests();
            var trendingdata = Enumerable.Range(0, 1000).Select(x => (double)x);
            int slope = 10;
            int offsett = 50;
            DenseVector y = DenseVector.OfEnumerable(trendingdata.Select(x => x + offsett));
            DenseVector x = DenseVector.OfEnumerable(trendingdata.Select(x => x / slope));
            regModel.Fit(y, DenseMatrix.OfColumnVectors(x));
            double[] betaValues = regModel.betas.ToArray();
            if (Math.Abs(betaValues[0] - offsett) > 0.0001 || Math.Abs(betaValues[1] - slope) > 0.0001)
            { throw new Exception("Regression is calculating biased estimators for the simple linear case!"); }
            else { Console.WriteLine("Simple Regression test passed!"); }
        }
        // Test multiple regression and the behavior of the estimators' standard errors.
        if (performUnitTests)
        {
            MultipleRegressionModel regModel = new MultipleRegressionModel();
            TimeSeriesTests TestTst = new TimeSeriesTests();
            var trendingdata = Enumerable.Range(0, 1000).Select(x => (double)x);
            int slope = 10;
            int offsett = 50;
            int quadraticTransform = 4;
            DenseVector y = DenseVector.OfEnumerable(trendingdata.Select(x => x * x * x + offsett - x * x * quadraticTransform));
            DenseVector x1 = DenseVector.OfEnumerable(trendingdata.Select(x => x));
            DenseVector x2 = DenseVector.OfEnumerable(trendingdata.Select(x => x * x));
            DenseVector x3 = DenseVector.OfEnumerable(trendingdata.Select(x => x * x * x / slope));
            regModel.Fit(y, DenseMatrix.OfColumnVectors(x1, x2, x3));
            double[] betaValues = regModel.betas.ToArray();
            if (Math.Abs(betaValues[0] - offsett) > 0.0001 || Math.Abs(betaValues[1]) > 0.0001 || Math.Abs(betaValues[2] + quadraticTransform) > 0.0001 || Math.Abs(betaValues[3] - slope) > 0.0001)
            { throw new Exception("Regression is calculating biased estimators for the multiple non-linear case!"); }
            else { Console.WriteLine("Multiple Regression test for estimator values passed!"); }
            var errorsOFestimatorsWithGoodFit = regModel.BetaStandardDeviation().ToArray();
            // Now compare the standard errors for random inputs.
            Random r = new Random();
            // BUGFIX: Enumerable.Repeat(r.NextDouble() * 100, n) evaluates the draw ONCE and repeats a
            // constant (collinear with the intercept); draw a fresh value per element instead.
            x3 = DenseVector.OfEnumerable(Enumerable.Range(0, y.Count).Select(_ => r.NextDouble() * 100));
            regModel.Fit(y, DenseMatrix.OfColumnVectors(x1, x2, x3));
            var errorsOFestimatorsWithBadFit = regModel.BetaStandardDeviation().ToArray();
            bool goodEstimate(int i)
            {
                return errorsOFestimatorsWithGoodFit[i] * 10 < errorsOFestimatorsWithBadFit[i];
            }
            // After replacing one variable with random noise, the relevant standard errors should blow up.
            if (!goodEstimate(0) || !goodEstimate(1) || !goodEstimate(2))
            { throw new Exception("Regression is calculating suspicious estimates for the standard errors of the regression parameters!"); }
            else { Console.WriteLine("Multiple Regression test for reasonable standard errors passed!"); }
        }
        // Test ADF.
        if (performUnitTests)
        {
            Random r = new Random();
            int testSize = 10000;
            int testlag = 20;
            // BUGFIX: Enumerable.Repeat repeated one constant value; real per-element noise is needed here.
            var noise = Enumerable.Range(0, testSize).Select(_ => (float)(r.NextDouble() * 100)).ToArray();
            var trending = Enumerable.Range(0, testSize).Select(x => (float)x + (float)r.NextDouble()).ToArray(); // noise added so the fit is not degenerate
            if (tst.ADF(noise, testlag) > 0.01) // noise should be stationary -> conservative 1% threshold
            { throw new Exception("The Implementation of ADF cannot correctly identify a stationary process!"); }
            else if (tst.ADF(trending, testlag) < 0.1) // a trending series should yield a high p-value -> conservative 10% threshold
            { throw new Exception("The Implementation of ADF cannot correctly identify a non-stationary process!"); }
            else { Console.WriteLine("The ADF test implementation correctly distingueshes stationary and non-stationary processes!"); }
        }
    }
}
class TimeSeriesTests
{
// Regress one timeseries on the other, test if the residuals of this regression satisfy requirement of stationarity by performing the Augmented Dickey Fuller Test on them and returning its p-Value
public float EngelGrangerRegression(double[] values1, double[] values2, int lag)
{
if (values1.Length != values2.Length) { throw new ArgumentException($"EngelGranger test requires bot inputs to be of same lenght, the provided inputs are of length {values1.Length} and {values2.Length}"); }
// The Engel-Granger Two Step Test for Cointegration, requires a method (like ADF) to test time series data for stationarity
// simple regression of values1 = beta_0 + beta_1 * values2 + error
var regModel = new MultipleRegressionModel();
var y = DenseVector.OfArray(values1);
var x = DenseMatrix.OfColumnArrays(values2);
regModel.Fit(y, x);
float[] u = regModel.errors.Select(x=>(float)x).ToArray();
// return ADF for the residuals
return ADF(u, lag);
}
// Determine the Degree of Cointegration by taking the (lag) differences of two timeseries and performing the EngleGrangerRegression, until p-value is small enough or the maximum is reached
// will test until ADF identifies stationarity, max should be left at 3, if no cointegration of 3rd degree the method returns -1 for no cointegration
// Non-stationary inputs are assumed (this program tested earlier for stationarity of inputs, no unit test)
public int EngelGrangerDegreeOfCointegration(double[] series1, double[] series2, int maxDegree = 3, int lags = 10 ) // lag is only relevant for the ADF, should be used consistently in the programm
{
// performs the EG regression for the differenced inputs, returns the degree of differencing needed to reach stationary residuals in the regression and thus finds the order of cointegration
for (int i = 1; i <= maxDegree; i++)
{
float pValue = EngelGrangerRegression(DifferenceOfDegree(series1, i), DifferenceOfDegree(series2, i), lags);
if (pValue < 0.01)
{
return i;
}
}
return -1;
}
// Perform the Augmented Dickey Fuller Test on a timeseries with a given number of lags
public float ADF(float[] values, int lag = 1)
{
// To keep the regression feasable in terms of the proportion of features and observations, the lag will be restricted to the squareroot of all observations
// This relation was arbitrarly choosen and could be changed -> however a decreasing growth rate for the lag in the number of observations, is desirable for run time efficiency and computational precision
if (lag > Math.Sqrt(values.Length)) { lag = Convert.ToInt32(Math.Sqrt(values.Length)); } // A warning could be added
// The reasoning of the following code follows the ADF as described here: https://nwfsc-timeseries.github.io/atsa-labs/sec-boxjenkins-aug-dickey-fuller.html
// The ADF performs a regresssion where the Dependent variable y_delta is regressed on the lag one of y_t and multiple lags of the differenced timeseries
// First calculate the set of dependent variables
double[] getYDifferences()
{
double[] results = new double[values.Length - 1];
for (int i = 0; i < results.Length; i++)
{
results[i] = (double)values[i + 1] - values[i];
}
return results;
};
double[] YDifferences = getYDifferences();
// Because the ADF requires YDifferences to be regressed on its own lags (p), the number dependent variables for the observation are actually YDifferences.Lenght - lag
int length = YDifferences.Length - lag;
// alpha will be generated by allowing the regression to have an intercept, the other parameters are defined in the lines below
// beta is defined as being the coefficient of t, thus the regression will require a count variable for the observatins
int[] t_values = Enumerable.Range(0, length).ToArray();
// The regression requires the last <length> observations of y_t with lag one, this is the parameter whose coefficient will be used to create the test statistic
float[] y_1 = new float[length];
Array.Copy(values, lag, y_1, 0, length); // lag does not need to be corrected by -1 as the YDifferences are calculated looking forward
// define a function to get the differences in yDifferences with the desired shift -> shift 0 yields the dependent variable
double[] YDifferencesWithLag(int shift)
{
double[] yDiff = new double[length];
Array.Copy(YDifferences, lag - shift, yDiff, 0, length);
return yDiff;
}
// Y_t - Y_{t-1}-> the dependent variable of the regression
var y = DenseVector.OfArray(YDifferencesWithLag(0));
// define a matrix with the lagged YDifferences, as features, a constant to facilitate an intercept in the regresion is added by the regressionmodel
var x_values = new Vector<double>[lag + 2]; // plus 2 for t_values and y_1
x_values[0] = new DenseVector(t_values.Select(x => Convert.ToDouble(x)).ToArray());
x_values[1] = new DenseVector(y_1.Select(x => (double)x).ToArray());
// Fill the columns of the matrix with the lagged timeseries values (lag is increasing -> i)
for (int i = 0; i < lag; i++)
{
float[] ydiff_t_i = new float[length];
Array.Copy(values, lag - (i + 1), ydiff_t_i, 0, length);
x_values[i + 2] = new DenseVector(Array.ConvertAll(ydiff_t_i, x => (double)x));
}
var x = DenseMatrix.OfColumns(x_values);
// Perform a regression
var rm = new MultipleRegressionModel();
rm.Fit(y, x);
// return the p-Value for the Intercept
float testStatistic = (float)(((float)rm.betas.ToArray()[2]) / Math.Sqrt(rm.VarSEbyJ(2))); // cf. for ADF statistic with mutliple regressors (not shown in the link above) https://en.wikipedia.org/wiki/Augmented_Dickey%E2%80%93Fuller_test
return (float)(StudentT.CDF(0, 1, length - x.ColumnCount, testStatistic)); // The test is a onesided, testing for t-values beyond the critical threshold -> a sufficiently low CDF value here implies stationarity
}
// Computes the first-order difference of the series: result[i] = values[i + 1] - values[i].
// Returns an array one element shorter than the input; an empty or single-element input
// yields an empty array. (Previously an empty input threw an OverflowException because
// of the implicit new double[-1].)
public static double[] Difference(double[] values)
{
    if (values.Length == 0) { return new double[0]; }
    double[] result = new double[values.Length - 1];
    for (int i = 0; i < result.Length; i++) { result[i] = values[i + 1] - values[i]; }
    return result;
}
// Applies the first-order Difference operator <degree> times in succession.
// A degree of zero (or negative) returns the input array unchanged.
public double[] DifferenceOfDegree(double[] values, int degree)
{
    var result = values;
    while (degree-- > 0)
    {
        result = Difference(result);
    }
    return result;
}
}
class MultipleRegressionModel
{
    // Ordinary-least-squares regression fitted via QR decomposition (MathNet.Numerics).
    // Public fields intentionally mirror the original API: callers read betas / predictions / errors after Fit().
    public DenseMatrix x;               // design matrix used in the fit (first column is all ones when an intercept is requested)
    public DenseVector y;               // dependent variable
    public Vector<double> betas;        // estimated coefficients; betas[0] is the intercept when one was requested
    public Vector<double> predictions;  // fitted values x * betas
    public Vector<double> errors;       // residuals y - predictions
    public double r_squared;
    public double r_squared_adjusted;
    private bool intercept;             // whether Fit prepended a constant column
    public int N;                       // number of observations
    public int k;                       // number of regressors, excluding the intercept
    // Fits y_train on x_train; when constant is true a column of ones is prepended so the model estimates an intercept.
    public void Fit(DenseVector y_train, DenseMatrix x_train, bool constant = true)
    {
        intercept = constant;
        y = y_train;
        if (intercept)
        {
            var features = new Vector<double>[x_train.ColumnCount + 1];
            features[0] = DenseVector.OfEnumerable(Enumerable.Repeat<double>(1, y.Count));
            for (int column = 0; column < x_train.ColumnCount; column++) { features[1 + column] = x_train.Column(column); }
            x = DenseMatrix.OfColumns(features);
        }
        else { x = x_train; }
        N = y.Count;
        k = x_train.ColumnCount;
        betas = MultipleRegression.QR(x, y);
        predictions = x * betas; // fitted values; equivalent to (but clearer than) the former Transpose().LeftMultiply(betas) form
        errors = y - predictions;
        r_squared = 1 - (Variance(errors) / Variance(y)); // R^2 = 1 - RSS/TSS (the (n-1) denominators cancel; exact when an intercept is present, since the residual mean is then 0)
        r_squared_adjusted = 1 - ((1 - r_squared) * (N - 1) / (N - k - 1)); // R^2-Adjusted = 1 - (1-R^2)(N-1) / (N-k-1)
    }
    // Sample variance (divides by n - 1).
    public static double Variance(Vector<double> values)
    {
        return values.Subtract(values.Average()).PointwisePower(2).Sum() / (values.Count - 1);
    }
    // Standard errors of the coefficient estimates: sqrt of the diagonal of s^2 (X'X)^-1.
    public Vector<double> BetaStandardDeviation()
    {
        double s2 = ErrorVariance();
        // Var(beta_hat) = s^2 * (X'X)^-1; the estimator variances are the DIAGONAL entries of that matrix.
        // Bug fixes versus the earlier version: (1) the old row-vector * matrix product summed whole columns
        // of the inverse instead of extracting the diagonal; (2) PointwiseSqrt was applied twice (once there
        // and once on return), producing the fourth root; (3) the special-case intercept formula used integer
        // division ((1 / N) == 0 for N > 1) and column sums where column means were intended. The [0,0]
        // diagonal entry already IS the exact intercept variance, so no special case is needed at all.
        var variancesOfEstimators = x.TransposeThisAndMultiply(x).Inverse().Diagonal().Multiply(s2);
        // For larger / ill-conditioned matrices the inverse can contain NaN entries; fall back to the
        // auxiliary-regression (VIF-style) estimate for those coefficients.
        for (int i = 0; i < variancesOfEstimators.Count; i++) { if (double.IsNaN(variancesOfEstimators[i])) { variancesOfEstimators[i] = VarSEbyJ(i); } }
        return variancesOfEstimators.PointwiseSqrt();
    }
    // Pointwise t-statistic per coefficient: beta_j / SE(beta_j).
    public Vector<double> TStatistics()
    {
        return betas / BetaStandardDeviation();
    }
    // Two-sided p-values of the t-statistics, with N-k-1 (intercept) or N-k degrees of freedom.
    public Vector<double> PValues()
    {
        Vector<double> result = TStatistics();
        int df = intercept ? N - k - 1 : N - k;
        result.PointwiseAbs().Map(t => 2 * (1 - StudentT.CDF(0, 1, df, t)), result);
        return result;
    }
    // Variance estimate for coefficient j via an auxiliary regression of column j on the remaining
    // columns (variance-inflation-factor formulation). Used as a fallback when the algebraic
    // route in BetaStandardDeviation yields NaN.
    public double VarSEbyJ(int j)
    {
        var new_y = DenseVector.OfVector(x.Column(j));
        var new_x = DenseMatrix.OfMatrix(x.RemoveColumn(j));
        var helpRegression = new MultipleRegressionModel();
        helpRegression.Fit(new_y, new_x, !intercept); // here x already carries a constant column when intercept is true, so the helper must not add another
        return Variance(errors) / (Variance(new_y) * (1 - helpRegression.r_squared));
    }
    // Sample variance of the residuals (the s^2 estimate used for the standard errors).
    public double ErrorVariance() { return Variance(errors); }
}
class ParsedFile // lightweight data holder so parsed file rows stay readable in the rest of the code
{
    public DateTime Date;       // trading day the observation belongs to
    public double ClosingPrice; // closing price recorded for that Date
}
}