In [18]:
import pandas as pd
import csv
import warnings
warnings.filterwarnings('ignore') # Disable warnings for now

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

In [19]:
# Load the expense budget CSV into its own DataFrame
df_budget = pd.read_csv(filepath_or_buffer="../data/Expense_Budget.csv")

In [20]:
# The transactions come in two CSV files; read each one, then stack them
# row-wise into a single frame (original row labels are kept as-is).
df_trans1, df_trans2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/Trans1.csv", "../data/Trans2.csv")
)
df_trans = pd.concat([df_trans1, df_trans2])

# Quick sanity check: overall size followed by the first few rows
print(df_trans.shape, df_trans.head(), sep="\n")

(1860788, 5)
       Entity  Account        Date                Description       Amount
0  100-100001    47101  07/01/2006  GL65 import record left b -53618285.92
1  520-710182    46101  07/01/2006  GL65 import record left b -23958030.00
2  520-710132    46101  07/01/2006  GL65 import record left b -23678050.00
3  520-710182    47101  07/01/2006  GL65 import record left b -18127072.02
4  512-822300    47101  07/01/2006  GL65 import record left b -17624027.09


In [21]:
# Keep only strictly positive amounts (this drops negative AND zero-amount rows).
# .copy() turns the filtered result into an independent frame, so adding
# columns to it afterwards does not raise SettingWithCopyWarning.
df_trans = df_trans[df_trans["Amount"] > 0].copy()

In [5]:
# Convert the date strings to datetimes and derive numeric Year/Month features.
# Passing an explicit format lets pandas skip per-row format guessing, which
# is the slow part of an un-hinted pd.to_datetime call on a large column.
# NOTE(review): assumes every Date is MM/DD/YYYY, as in the sample rows
# (e.g. 07/01/2006) — confirm; a non-matching value will raise ValueError.
parsed_dates = pd.to_datetime(df_trans["Date"], format="%m/%d/%Y")

# assign() returns a new frame rather than writing into a filtered slice,
# which is what used to provoke SettingWithCopyWarning.
# Date and Description are dropped because they are not numeric and so
# cannot be used as regression features.
df_trans = (
    df_trans
    .assign(Year=parsed_dates.dt.year, Month=parsed_dates.dt.month)
    .drop(columns=["Date", "Description"])
)

print(df_trans.shape)
print(df_trans.head())

(586957, 5)
       Entity  Account      Amount  Year  Month
2  100-100001    41201  1901125.37  2012      7
3  100-100001    41202   866051.86  2012      7
4  100-100001    41203   575521.51  2012      7
5  100-100001    41204     1378.89  2012      7
6  100-100001    41212   304822.42  2012      7


In [22]:
# One model is fitted per distinct entity, so the number of distinct
# Entity values is the number of models we will end up with.
df_trans["Entity"].unique().size

810

In [17]:
# Fit one linear regression per entity, predicting Amount from the remaining
# columns, and print hold-out metrics for each entity.

# Smallest entity size for which a 25% hold-out still contains 2 rows;
# R2 is undefined on fewer than 2 test samples.
MIN_ROWS_PER_ENTITY = 5
# Fixed seed so the train/test split (and thus the metrics) is reproducible.
SPLIT_SEED = 42

# groupby partitions the frame once and yields entities in sorted key order,
# instead of re-scanning the whole frame for every entity.
for entity, df_entity in df_trans.groupby("Entity"):
    if len(df_entity) < MIN_ROWS_PER_ENTITY:
        print("[{}]".format(entity))
        print("\tSkipped: only {} row(s)".format(len(df_entity)))
        continue

    x = df_entity.drop(columns=["Entity", "Amount"])
    y = df_entity["Amount"]

    try:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.25, random_state=SPLIT_SEED
        )

        # The `normalize` argument was removed from LinearRegression in
        # scikit-learn 1.2. For plain least squares, rescaling the features
        # does not change the fitted predictions (up to floating-point
        # error), so it is simply omitted.
        model = LinearRegression()
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
    except ValueError as err:
        # Data problems (e.g. missing values in the features) are reported
        # per entity instead of being silently swallowed; any other
        # exception is a programming error and is allowed to propagate.
        print("[{}]".format(entity))
        print("\tSkipped: {}".format(err))
        continue

    # Model evaluation on the held-out rows
    print("[{}]".format(entity))
    print("\t{} - {}".format(len(x), len(y)))
    print("\tTesting set RMSE: {0:.3g}".format(sqrt(mean_squared_error(y_test, y_pred))))
    print("\tTesting set R2: {0:.3g}".format(r2_score(y_test, y_pred)))

[100-100001]
	663 - 663
	Training set RMSE: 5.85e+05
	Training set R2: 0.118
	Testing set RMSE: 3.88e+05
	Testing set R2: 0.0718
[100-101110]
	1425 - 1425
	Training set RMSE: 3.03e+03
	Training set R2: 0.0682
	Testing set RMSE: 2.74e+03
	Testing set R2: 0.0537
[100-102110]
	1598 - 1598
	Training set RMSE: 1.51e+03
	Training set R2: 0.0536
	Testing set RMSE: 1.19e+03
	Testing set R2: 0.0296
[100-105002]
	2250 - 2250
	Training set RMSE: 5.97e+03
	Training set R2: 0.00637
	Testing set RMSE: 8.38e+03
	Testing set R2: -0.00831
[100-105210]
	33 - 33
	Training set RMSE: 188
	Training set R2: 0.847
	Testing set RMSE: 201
	Testing set R2: 0.791
[100-110110]
	3385 - 3385
	Training set RMSE: 8.43e+03
	Training set R2: 0.0939
	Testing set RMSE: 8.5e+03
	Testing set R2: 0.1
[100-115002]
	2938 - 2938
	Training set RMSE: 8.12e+03
	Training set R2: 0.0976
	Testing set RMSE: 8.12e+03
	Testing set R2: 0.0927
[100-115210]
	1657 - 1657
	Training set RMSE: 9.08e+04
	Training set R2: 0.0742
	Testing set RMS

ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required.