# Logistical Regression
## JiaoCheng Class Tuner (Brute Force)

In [1]:
import pandas as pd
import os

In [2]:
# create directory to store tuning results
output_relative_dirs = ['../data/curated/tuning']

# check if it exists as it makedir will raise an error if it does exist
for output_relative_dir in output_relative_dirs:
    if not os.path.exists(output_relative_dir):
        os.makedirs(output_relative_dir)

In [3]:
def accuracy_score(pred, obs):

    n = len(pred)
    n_tp = 0

    for i in range(n):
        if pred[i] in obs[i]:
            n_tp += 1

    return n_tp/n

# Import Data

In [4]:
# Import Data
Train_data_X = pd.read_csv('../data/curated/ModelBuilding/Discrete/XTrain_16-1_16-5.csv')
Train_data_y = pd.read_csv('../data/curated/ModelBuilding/Discrete/yTrain_16-1_16-5.csv')
XTrain = Train_data_X
yTrain = Train_data_y['Max_PULocationID']

In [5]:
Val_data_X = pd.read_csv('../data/curated/ModelBuilding/Discrete/XVal_16-5_16-6.csv')
Val_data_y = pd.read_csv('../data/curated/ModelBuilding/Discrete/yVal_16-5_16-6.csv')
XVal = Val_data_X
yVal = Val_data_y['Max_PULocationID']

In [6]:
Test_data_X = pd.read_csv('../data/curated/ModelBuilding/Discrete/XTest_16-6_16-6.csv')
Test_data_y = pd.read_csv('../data/curated/ModelBuilding/Discrete/yTest_16-6_16-6.csv')
XTest = Test_data_X
yTest = Test_data_y['Max_PULocationID']

# Extra One Hot Encoding (On the Fly)

In [7]:
# Extra One Hot Encoding
from sklearn.preprocessing import OneHotEncoder

# OHE for DOLocationID

ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = XTrain[['DOLocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe,
                              columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTrain = Train_data_X.drop(['DOLocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    XTrain[str(col)] = new_col

XTrain = XTrain.drop('datetime', axis=1)

Val_data_to_ohe = XVal[['DOLocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe,
                            columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XVal = XVal.drop(['DOLocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    XVal[str(col)] = new_col

XVal = XVal.drop('datetime', axis=1)

Test_data_to_ohe = XTest[['DOLocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe,
                             columns=list(ohe.get_feature_names_out(['DOLocationID'])))

XTest = XTest.drop(['DOLocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    XTest[str(col)] = new_col

XTest = XTest.drop('datetime', axis=1)


  XTrain[str(col)] = new_col
  XVal[str(col)] = new_col
  XTest[str(col)] = new_col


# Special format for getting data in right format for accuracy evlauation

In [8]:
# Object for evaluation with multiple correct labels
train_obs = [[int(x) for x in Max_IDs.strip('[]').split(',')]
             for Max_IDs in Train_data_y['Max_PULocationIDs']]
val_obs = [[int(x) for x in Max_IDs.strip('[]').split(',')]
           for Max_IDs in Val_data_y['Max_PULocationIDs']]
test_obs = [[int(x) for x in Max_IDs.strip('[]').split(',')]
            for Max_IDs in Test_data_y['Max_PULocationIDs']]

# Run JiaoCheng Tuning Algorithm (brute force train)

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
C = [10, 0.1, 0.00001]
PENALTY = [['liblinear', 'l1'], ['newton-cg', 'l2']]

In [11]:
# use loop to create list of combinations
combinations = list()
for c in C:
    for penalty in PENALTY:
        tmp = [c, penalty]

        combinations.append(tmp)

In [12]:
combinations

[[10, ['liblinear', 'l1']],
 [10, ['newton-cg', 'l2']],
 [0.1, ['liblinear', 'l1']],
 [0.1, ['newton-cg', 'l2']],
 [1e-05, ['liblinear', 'l1']],
 [1e-05, ['newton-cg', 'l2']]]

In [13]:
tuning_results = pd.DataFrame()

In [14]:
# start training and scoring each combination

for i in range(len(combinations)):
    print(i)
    logR = LogisticRegression(C = combinations[i][0],
                              solver = combinations[i][1][0],
                              penalty = combinations[i][1][1],
                              max_iter = 100000)

    logR.fit(XTrain, yTrain)
    train_pred_logR = logR.predict(XTrain)
    train_accu = accuracy_score(train_pred_logR, train_obs)
    val_pred_logR = logR.predict(XVal)
    val_accu = accuracy_score(val_pred_logR, val_obs)
    test_pred_logR = logR.predict(XTest)
    test_accu = accuracy_score(test_pred_logR, test_obs)

    one_result = pd.DataFrame({'C': [combinations[i][0]],
                               'solver': [combinations[i][1][0]],
                               'penalty': [combinations[i][1][1]],
                               'training_accuracy': [train_accu],
                               'validation_accuracy': [val_accu],
                               'testing_accuracy': [test_accu]})

    tuning_results = tuning_results.append(one_result)
    tuning_results.to_csv('../data/curated/tuning/logR_Unified.csv')

0
1
2
3
4
5


In [15]:
C = [1]
PENALTY = [['liblinear', 'l1'], ['newton-cg', 'l2']]
# use loop to create list of combinations
combinations = list()
for c in C:
    for penalty in PENALTY:
        tmp = [c, penalty]

        combinations.append(tmp)

In [16]:
# start training and scoring each combination

for i in range(len(combinations)):
    print(i)
    logR = LogisticRegression(C = combinations[i][0],
                              solver = combinations[i][1][0],
                              penalty = combinations[i][1][1],
                              max_iter = 100000)

    logR.fit(XTrain, yTrain)
    train_pred_logR = logR.predict(XTrain)
    train_accu = accuracy_score(train_pred_logR, train_obs)
    val_pred_logR = logR.predict(XVal)
    val_accu = accuracy_score(val_pred_logR, val_obs)
    test_pred_logR = logR.predict(XTest)
    test_accu = accuracy_score(test_pred_logR, test_obs)

    one_result = pd.DataFrame({'C': [combinations[i][0]],
                               'solver': [combinations[i][1][0]],
                               'penalty': [combinations[i][1][1]],
                               'training_accuracy': [train_accu],
                               'validation_accuracy': [val_accu],
                               'testing_accuracy': [test_accu]})

    tuning_results = tuning_results.append(one_result)
    tuning_results.to_csv('../data/curated/tuning/logR_Unified.csv')

0
1
