In [1]:
import pandas as pd
import csv
import warnings
warnings.filterwarnings('ignore') # Disable warnings for now

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

In [2]:
df_budget = pd.read_csv("../data/Expense_Budget.csv")

In [3]:
df_trans1 = pd.read_csv("../data/Trans1.csv")
df_trans2 = pd.read_csv("../data/Trans2.csv")

df_trans = pd.concat([df_trans1, df_trans2])

print(df_trans.shape)
print(df_trans.head())

(1860788, 5)
       Entity  Account        Date                Description       Amount
0  100-100001    47101  07/01/2006  GL65 import record left b -53618285.92
1  520-710182    46101  07/01/2006  GL65 import record left b -23958030.00
2  520-710132    46101  07/01/2006  GL65 import record left b -23678050.00
3  520-710182    47101  07/01/2006  GL65 import record left b -18127072.02
4  512-822300    47101  07/01/2006  GL65 import record left b -17624027.09


In [4]:
# Remove negative transactions
df_trans = df_trans[df_trans["Amount"]>0]

In [5]:
print(df_trans.shape)
print(df_trans.head())

(1215106, 5)
          Entity  Account        Date                Description  Amount
1660  100-510510    53030  07/01/2006  GL65 import record left b    0.01
1661  411-411320    41107  07/01/2006  GL65 import record left b    0.02
1662  409-409320    41107  07/01/2006  GL65 import record left b    0.08
1663  415-415320    41107  07/01/2006  GL65 import record left b    0.12
1664  407-407320    41107  07/01/2006  GL65 import record left b    0.15


In [6]:
# Convert the dates from strings to datetimes
# TODO: Figure out how to do this faster and without Python crying
df_trans["Date"] = pd.to_datetime(df_trans["Date"])
df_trans["Year"]= df_trans.Date.dt.year
df_trans["Month"]= df_trans.Date.dt.month

# Remove the Year and Description columns since they're strings
# and thus can't be used
df_trans = df_trans.drop(columns=["Date", "Description"])

print(df_trans.shape)
print(df_trans.head())

(1215106, 5)
          Entity  Account  Amount  Year  Month
1660  100-510510    53030    0.01  2006      7
1661  411-411320    41107    0.02  2006      7
1662  409-409320    41107    0.08  2006      7
1663  415-415320    41107    0.12  2006      7
1664  407-407320    41107    0.15  2006      7


In [7]:
# Number of models we'll end up with
len(df_trans["Entity"].unique())

810

In [24]:
# models = {} # Dictionary which maps entity to its model

for entity in sorted(df_trans["Entity"].unique()):
    try:
        df_entity = df_trans[df_trans["Entity"]==entity]
        df_entity = df_entity.drop(columns=["Entity"])

        x = df_entity.drop(columns=["Amount"], axis=1)
        y = df_entity["Amount"]

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

        model = RandomForestRegressor()
        model.fit(x_train, y_train)

        # models[entity] = model

        # Model evaluation
        y_pred = model.predict(x_test)
        y_val = model.predict(x_train)

        print("[{}]".format(entity))
        print("\t{} - {}".format(len(x), len(y)))
        # print("\tTraining set RMSE: {0:.3g}".format(sqrt(mean_squared_error(y_train, y_val))))
        # print("\tTraining set R2: {0:.3g}".format(model.score(x_train, y_train)))
        print("\tTesting set RMSE: {0:.3g}".format(sqrt(mean_squared_error(y_test, y_pred))))
        print("\tTesting set R2: {0:.3g}".format(model.score(x_test, y_test)))
    except Exception:
        pass

[100-100001]
	2574 - 2574
	Testing set RMSE: 3e+05
	Testing set R2: 0.463
[100-101110]
	2753 - 2753
	Testing set RMSE: 1.84e+03
	Testing set R2: 0.717
[100-102110]
	3025 - 3025
	Testing set RMSE: 1.2e+03
	Testing set R2: 0.647
[100-105002]
	5187 - 5187
	Testing set RMSE: 5.14e+03
	Testing set R2: 0.198
[100-105110]
	2 - 2
	Testing set RMSE: 51
	Testing set R2: 0
[100-105210]
	362 - 362
	Testing set RMSE: 1.08e+03
	Testing set R2: 0.882
[100-110110]
	6039 - 6039
	Testing set RMSE: 2.7e+03
	Testing set R2: 0.909
[100-115002]
	6074 - 6074
	Testing set RMSE: 3.54e+03
	Testing set R2: 0.86
[100-115110]
	7 - 7
	Testing set RMSE: 1.22e+04
	Testing set R2: -0.429
[100-115210]
	2048 - 2048
	Testing set RMSE: 6.02e+04
	Testing set R2: 0.717
[100-120110]
	2836 - 2836
	Testing set RMSE: 1.25e+03
	Testing set R2: 0.905
[100-125110]
	2981 - 2981
	Testing set RMSE: 1.02e+03
	Testing set R2: 0.902
[100-125120]
	240 - 240
	Testing set RMSE: 65
	Testing set R2: 0.975
[100-130110]
	5314 - 5314
	Testing s

[100-210311]
	392 - 392
	Testing set RMSE: 2.05e+03
	Testing set R2: -0.771
[100-210315]
	2 - 2
	Testing set RMSE: 3.55e-15
	Testing set R2: 0
[100-210320]
	1709 - 1709
	Testing set RMSE: 1.97e+04
	Testing set R2: -1.05
[100-210325]
	1625 - 1625
	Testing set RMSE: 207
	Testing set R2: 0.961
[100-210330]
	565 - 565
	Testing set RMSE: 872
	Testing set R2: -0.276
[100-210335]
	3011 - 3011
	Testing set RMSE: 2.36e+03
	Testing set R2: 0.749
[100-210340]
	5562 - 5562
	Testing set RMSE: 2.53e+03
	Testing set R2: 0.724
[100-210345]
	1119 - 1119
	Testing set RMSE: 159
	Testing set R2: 0.977
[100-210350]
	1704 - 1704
	Testing set RMSE: 631
	Testing set R2: 0.571
[100-210355]
	5569 - 5569
	Testing set RMSE: 1.05e+04
	Testing set R2: 0.601
[100-210360]
	4101 - 4101
	Testing set RMSE: 7.91e+03
	Testing set R2: 0.0415
[100-210365]
	3691 - 3691
	Testing set RMSE: 4.51e+03
	Testing set R2: 0.486
[100-210366]
	270 - 270
	Testing set RMSE: 4.9e+03
	Testing set R2: -1.38
[100-210370]
	268 - 268
	Testing 

[100-220320]
	158 - 158
	Testing set RMSE: 1.41e+04
	Testing set R2: -0.114
[100-220330]
	18 - 18
	Testing set RMSE: 2.1e+03
	Testing set R2: 0.516
[100-220340]
	66 - 66
	Testing set RMSE: 2.13e+04
	Testing set R2: -26.7
[100-220350]
	401 - 401
	Testing set RMSE: 4.02e+04
	Testing set R2: 0.781
[100-220360]
	260 - 260
	Testing set RMSE: 2.68e+03
	Testing set R2: 0.0446
[100-220410]
	390 - 390
	Testing set RMSE: 8.81e+03
	Testing set R2: -1.46
[100-220510]
	372 - 372
	Testing set RMSE: 1.15e+03
	Testing set R2: 0.923
[100-220520]
	101 - 101
	Testing set RMSE: 6.52e+03
	Testing set R2: -4.56
[100-220530]
	3338 - 3338
	Testing set RMSE: 2.61e+04
	Testing set R2: 0.252
[100-220540]
	671 - 671
	Testing set RMSE: 6.47e+03
	Testing set R2: -0.252
[100-220550]
	290 - 290
	Testing set RMSE: 4.92e+03
	Testing set R2: -0.0407
[100-220610]
	170 - 170
	Testing set RMSE: 1.71e+04
	Testing set R2: -0.849
[100-220620]
	90 - 90
	Testing set RMSE: 1.1e+04
	Testing set R2: 0.0922
[100-220705]
	55 - 55
	T

[100-310010]
	3652 - 3652
	Testing set RMSE: 1.31e+04
	Testing set R2: 0.0962
[100-310014]
	7 - 7
	Testing set RMSE: 1.71e+04
	Testing set R2: -4.67
[100-310020]
	8003 - 8003
	Testing set RMSE: 5.39e+03
	Testing set R2: 0.622
[100-310024]
	90 - 90
	Testing set RMSE: 1.3e+04
	Testing set R2: -12.5
[100-310030]
	2579 - 2579
	Testing set RMSE: 2.19e+03
	Testing set R2: 0.907
[100-310040]
	1659 - 1659
	Testing set RMSE: 2.27e+03
	Testing set R2: 0.468
[100-310050]
	142 - 142
	Testing set RMSE: 634
	Testing set R2: -3.88
[100-310110]
	2849 - 2849
	Testing set RMSE: 3.97e+03
	Testing set R2: 0.978
[100-310113]
	101 - 101
	Testing set RMSE: 8.31e+03
	Testing set R2: 0.113
[100-310114]
	2 - 2
	Testing set RMSE: 5.77e+03
	Testing set R2: 0
[100-310120]
	1886 - 1886
	Testing set RMSE: 243
	Testing set R2: 0.964
[100-310130]
	1687 - 1687
	Testing set RMSE: 1.62e+03
	Testing set R2: 0.942
[100-310200]
	6966 - 6966
	Testing set RMSE: 1.94e+03
	Testing set R2: 0.922
[100-310201]
	1467 - 1467
	Testin

[100-410150]
	1681 - 1681
	Testing set RMSE: 300
	Testing set R2: 0.662
[100-420110]
	3099 - 3099
	Testing set RMSE: 1.28e+03
	Testing set R2: 0.71
[100-510110]
	143 - 143
	Testing set RMSE: 3.63e+04
	Testing set R2: -0.115
[100-510210]
	54 - 54
	Testing set RMSE: 7.02e+05
	Testing set R2: -15.8
[100-510510]
	3695 - 3695
	Testing set RMSE: 1.79e+04
	Testing set R2: 0.145
[100-510520]
	10 - 10
	Testing set RMSE: 8.34e+03
	Testing set R2: -17.1
[100-510521]
	309 - 309
	Testing set RMSE: 5.89e+03
	Testing set R2: 0.794
[100-510522]
	51 - 51
	Testing set RMSE: 5.63e+03
	Testing set R2: 0.931
[100-510523]
	144 - 144
	Testing set RMSE: 501
	Testing set R2: 0.993
[100-510524]
	341 - 341
	Testing set RMSE: 4.31e+03
	Testing set R2: 0.818
[100-510525]
	109 - 109
	Testing set RMSE: 322
	Testing set R2: 0.998
[100-510526]
	229 - 229
	Testing set RMSE: 1.06e+04
	Testing set R2: 0.899
[100-511210]
	28 - 28
	Testing set RMSE: 6.49e+05
	Testing set R2: -0.287
[100-515002]
	2127 - 2127
	Testing set RM

[500-100919]
	27 - 27
	Testing set RMSE: 4.9e+04
	Testing set R2: -0.371
[500-100920]
	7 - 7
	Testing set RMSE: 6.05e+04
	Testing set R2: -0.893
[500-100921]
	19 - 19
	Testing set RMSE: 9.41e+05
	Testing set R2: -689
[500-100922]
	15 - 15
	Testing set RMSE: 1.15e+04
	Testing set R2: 0.933
[500-100923]
	25 - 25
	Testing set RMSE: 9.22e+04
	Testing set R2: -2.42
[500-100924]
	7 - 7
	Testing set RMSE: 3.15e+04
	Testing set R2: 0.432
[500-200110]
	4 - 4
	Testing set RMSE: 3.76e+05
	Testing set R2: 0
[500-200111]
	140 - 140
	Testing set RMSE: 4.25e+05
	Testing set R2: -0.0668
[500-200210]
	42 - 42
	Testing set RMSE: 3.62e+04
	Testing set R2: -37.3
[500-200212]
	31 - 31
	Testing set RMSE: 6.76e+03
	Testing set R2: 0.0905
[500-200215]
	8 - 8
	Testing set RMSE: 1.33e+05
	Testing set R2: -1.42e+09
[500-200320]
	139 - 139
	Testing set RMSE: 8.42e+05
	Testing set R2: -10.5
[500-300115]
	400 - 400
	Testing set RMSE: 9.23e+05
	Testing set R2: -0.334
[500-300125]
	6 - 6
	Testing set RMSE: 3.55e+04
	

[520-710182]
	177 - 177
	Testing set RMSE: 3.97e+06
	Testing set R2: -22.2
[520-710183]
	116 - 116
	Testing set RMSE: 1.96e+06
	Testing set R2: -2.43
[520-710184]
	115 - 115
	Testing set RMSE: 8.07e+05
	Testing set R2: -6.09
[520-710185]
	96 - 96
	Testing set RMSE: 2.34e+05
	Testing set R2: -5
[520-710186]
	114 - 114
	Testing set RMSE: 5.74e+05
	Testing set R2: 0.264
[520-710187]
	64 - 64
	Testing set RMSE: 3.83e+05
	Testing set R2: 0.0143
[520-710188]
	53 - 53
	Testing set RMSE: 5.51e+05
	Testing set R2: 0.0706
[520-710189]
	130 - 130
	Testing set RMSE: 1.21e+05
	Testing set R2: -0.362
[520-710190]
	284 - 284
	Testing set RMSE: 5.58e+05
	Testing set R2: -0.225
[520-710191]
	117 - 117
	Testing set RMSE: 5.95e+05
	Testing set R2: -0.167
[520-710192]
	94 - 94
	Testing set RMSE: 4.8e+05
	Testing set R2: -20.7
[520-710193]
	73 - 73
	Testing set RMSE: 1.08e+06
	Testing set R2: -50.4
[520-710194]
	124 - 124
	Testing set RMSE: 6.53e+05
	Testing set R2: 0.641
[520-710196]
	18 - 18
	Testing set