In [15]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Normalizer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, roc_auc_score


def getData():
    with open('process/process_file_3.csv') as csv_file:
        reader = csv.reader(csv_file)
        data = list(reader)

    return data


def preprocessData(df):
    labelencoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()
    normalizer = Normalizer()

    # create a copy for one hot encoding
    df_ohe = df

    df[:, 0] = labelencoder.fit_transform(df[:, 0])
    df = df.astype(float)
    label_processdf = df[:, 2:6]
    label_processdf = normalizer.fit_transform(label_processdf)
    np.random.shuffle(label_processdf)

    ohe_processed = one_hot_encoder.fit_transform(df_ohe[:, 0:2]).toarray()
    print(ohe_processed.shape)
    ohe_processed = np.append(ohe_processed, df_ohe[:, 2:6], axis = 1)
    ohe_processed = normalizer.fit_transform(ohe_processed)
    np.random.shuffle(ohe_processed)

    return label_processdf, ohe_processed


def learn(data):
    labeldf, ohedf = preprocessData(data)

    X_labeled = labeldf[:, 0:(labeldf.shape[1] - 1)]
    Y_labeled = labeldf[:, -1].reshape(-1, 1)

    X_ohe = ohedf[:, 0:(ohedf.shape[1] - 1)]
    Y_ohe = ohedf[:, -1].reshape(-1, 1)

    # Created 2 sets of data to train with each model, will increase time duration but also give wider results
    x_l_train, x_l_test, y_l_train, y_l_test = train_test_split(X_labeled, Y_labeled, train_size=0.7)
    x_o_train, x_o_test, y_o_train, y_o_test = train_test_split(X_ohe, Y_ohe, train_size=0.7)

    svr_l = SVR(kernel='linear')
    svr_o = SVR(kernel='rbf')
    lr_l = LinearRegression(n_jobs=4)
    lr_o = LinearRegression(n_jobs=4)
    rf_l = RandomForestRegressor(n_jobs=4)
    rf_o = RandomForestRegressor(n_jobs=4)
    gb_l = GradientBoostingRegressor()
    gb_o = GradientBoostingRegressor()

    # fitting for simple label encoded
    svr_l.fit(x_l_train, y_l_train)
    lr_l.fit(x_l_train, y_l_train)
    rf_l.fit(x_l_train, y_l_train)
    gb_l.fit(x_l_train, y_l_train)

    # fitting for one hot encoded
    svr_o.fit(x_o_train, y_o_train)
    lr_o.fit(x_o_train, y_o_train)
    rf_o.fit(x_o_train, y_o_train)
    gb_o.fit(x_o_train, y_o_train)

    # results in both
    svr_l_predict = svr_l.predict(x_l_test)
    lr_l_predict = lr_l.predict(x_l_test)
    rf_l_predict = rf_l.predict(x_l_test)
    gb_l_predict = gb_l.predict(x_l_test)

    svr_o_predict = svr_o.predict(x_o_test)
    lr_o_predict = lr_o.predict(x_o_test)
    rf_o_predict = rf_o.predict(x_o_test)
    gb_o_predict = gb_o.predict(x_o_test)

    # evaluate for label encode
    svr_l_result = np.sqrt(mean_squared_error(y_l_test, svr_l_predict))
    lr_l_result = np.sqrt(mean_squared_error(y_l_test, lr_l_predict))
    rf_l_result = np.sqrt(mean_squared_error(y_l_test, rf_l_predict))
    gb_l_result = np.sqrt(mean_squared_error(y_l_test, gb_l_predict))

    # evaluate for one hot encode
    svr_o_result = np.sqrt(mean_squared_error(y_o_test, svr_o_predict))
    lr_o_result = np.sqrt(mean_squared_error(y_o_test, lr_o_predict))
    rf_o_result = np.sqrt(mean_squared_error(y_o_test, rf_o_predict))
    gb_o_result = np.sqrt(mean_squared_error(y_o_test, gb_o_predict))

    print("Printing for LabelEncoded Data")
    print("Test Error for SVR: ", svr_l_result)
    print("Test Error for LR: ", lr_l_result)
    print("Test Error for RFR: ", rf_l_result)
    print("Test Error for GBR: ", gb_l_result)

    print("Printing for OneHot Encoded")
    print("Test Error for SVR: ", svr_o_result)
    print("Test Error for SVR: ", lr_o_result)
    print("Test Error for SVR: ", rf_o_result)
    print("Test Error for SVR: ", gb_o_result)


def main():
    df = np.array(getData())
    learn(df)


In [16]:
main()

(21286, 756)


  check_array(X, accept_sparse='csr')
  X = check_array(X, accept_sparse='csr')
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Printing for LabelEncoded Data
Test Error for SVR:  0.01081560853703742
Test Error for LR:  0.005118058465412622
Test Error for RFR:  0.00165420276827947
Test Error for GBR:  0.001311941735925352
Printing for OneHot Encoded
Test Error for SVR:  0.01666011888951365
Test Error for SVR:  0.004732460670608759
Test Error for SVR:  0.004023270798094777
Test Error for SVR:  0.004687132741944067


In [1]:
import pickle

In [9]:
file1 = open('Train_Error_KFold_LabelEncode_3.pkl','rb')
train = pickle.load(file1)
file2 = open('Validation_Error_KFold_LabelEncode_3.pkl','rb')
validation = pickle.load(file2)
file3 = open('Test_Error_KFold_LabelEncode_3.pkl', 'rb')
test = pickle.load(file3)
print(train)

{0: [0.79536617, 0.73769647, 0.6874586, 0.6259532, 0.59313345, 0.5588263, 0.5195829, 0.4895177, 0.4485736, 0.42364576, 0.40527818, 0.3779066, 0.3550694, 0.3334509, 0.30561134, 0.2936354, 0.2799237, 0.26437733, 0.24943772, 0.23541363, 0.22180259, 0.21047196, 0.20382454, 0.19184779, 0.18202794, 0.17131786, 0.16717057, 0.15587825, 0.1487691, 0.14139022, 0.13494192, 0.12899196, 0.12073151, 0.11336898, 0.1081357, 0.101350576, 0.09743617, 0.09250263, 0.08674554, 0.0833931, 0.07771971, 0.07322786, 0.069751665, 0.065353304, 0.06172856, 0.057947557, 0.054139525, 0.0512887, 0.04883005, 0.04606863, 0.04238438, 0.039793722, 0.037007194, 0.034954563, 0.032474052, 0.031067073, 0.028309062, 0.026954673, 0.025065111, 0.02326761, 0.021859096, 0.02007185, 0.018636134, 0.01719559, 0.016083805, 0.014681658, 0.013716895, 0.012715042, 0.0118039, 0.010671718, 0.009941591, 0.009029972, 0.008354909, 0.0075644664, 0.006981458, 0.006422981, 0.0058177155, 0.005315907, 0.0048336466, 0.0043554283, 0.003958677, 0.00

In [10]:
print(validation)


{0: [array([[0.9097675 ],
       [0.927833  ],
       [0.9297391 ],
       ...,
       [0.92747337],
       [0.89927375],
       [0.913501  ]], dtype=float32), array([[0.8907375 ],
       [0.9088266 ],
       [0.9106811 ],
       ...,
       [0.90847665],
       [0.8803022 ],
       [0.89463717]], dtype=float32), array([[0.8750908 ],
       [0.89312327],
       [0.89494205],
       ...,
       [0.8927801 ],
       [0.8647299 ],
       [0.8790556 ]], dtype=float32), array([[0.8605258 ],
       [0.8784857 ],
       [0.8802725 ],
       ...,
       [0.87814856],
       [0.8502411 ],
       [0.8645365 ]], dtype=float32), array([[0.8465829 ],
       [0.8644639 ],
       [0.86622024],
       ...,
       [0.86413246],
       [0.83637524],
       [0.8506319 ]], dtype=float32), array([[0.8334834 ],
       [0.8512717 ],
       [0.85300016],
       ...,
       [0.85094553],
       [0.8233564 ],
       [0.83754873]], dtype=float32), array([[0.82102233],
       [0.83871835],
       [0.84042037],
  

In [11]:
print(test)

{0: [array([[0.705932  ],
       [0.70583856],
       [0.70716524],
       ...,
       [0.70718306],
       [0.70580584],
       [0.70705515]], dtype=float32), array([[0.70666987],
       [0.70666987],
       [0.70666987],
       ...,
       [0.70666987],
       [0.70666987],
       [0.70666987]], dtype=float32), array([[0.70693463],
       [0.70693463],
       [0.7066331 ],
       ...,
       [0.7066302 ],
       [0.70693463],
       [0.7068336 ]], dtype=float32), array([[0.70652527],
       [0.70652527],
       [0.70652527],
       ...,
       [0.70652527],
       [0.70652527],
       [0.70652527]], dtype=float32), array([[0.7053949 ],
       [0.704518  ],
       [0.7051528 ],
       ...,
       [0.7051678 ],
       [0.70451665],
       [0.70507485]], dtype=float32)], 1: [array([[0.705932  ],
       [0.70583856],
       [0.70716524],
       ...,
       [0.70718306],
       [0.70580584],
       [0.70705515]], dtype=float32), array([[0.70666987],
       [0.70666987],
       [0.70666987

In [4]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [6]:
for key in train.keys():
    plt
    y = train[key]
    x = range(len(train[key]))

117
155
89
173
188


In [None]:
for 