In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV
import statsmodels.formula.api as smf
from math import exp
from sklearn import linear_model, cross_validation
import scipy.stats as prr
import numpy as np
from IPython.display import display, HTML
from sklearn.metrics import mean_squared_error
from math import sqrt
import statsmodels.formula.api as sm
import itertools

## Regression Class

In [None]:
class Regression:
    def __init__(self, pathToData):
        self.data = pd.read_csv(pathToData, index_col=0)

    def getDataFrame(self):
        return self.data

    
    
    # get collinearity (r^2 value) of a pair of columns
    def getCollinearityScore(self, dependentCol, independentCol):
        array = self.data.values
        X = array[:, independentCol]
        Y = array[:, dependentCol]

        return prr.pearsonr(X, Y)
        # GET SUBTABLE CONTAINING ONLY THE INDEPENDENT COLUMN AND DEPENDENT COLUMN
        # df2 = self.data.iloc[:, [independentCol, dependentCol]]
        # CREATE FITTED MODEL USING ORDINARY LEAST SQUARES REGRESSION
        # lm = smf.ols(formula='Sales ~ TV', data=df2).fit()
        # RETURN RSQUARED VALUE FOR THE MODEL
        # return lm.rsquared

    # print collinearities (r^2 value) of each pair of columns
    def printAllCollinearities(self):
        array = self.data.values
        length_col = self.data.shape[1]
        while(0 != length_col-1):
            for x in range(0, length_col-1):
                X = array[:, x]
                Y = array[:, length_col-1]

                print("Collinearity between "+self.data.columns.values[x]+" & "+self.data.columns.values[length_col-1]+" = "+str(prr.pearsonr(X, Y)))
            length_col -= 1

    def getSubTable(self, col_names):
        array = self.data
        return array.loc[:, col_names]
            
    def getFittedRegressionModel(self, col_names, dependent_col_name):
        array = self.data
        X = array.loc[:, col_names]
        Y = array.loc[:, dependent_col_name]
        model = linear_model.LinearRegression()
        model.fit(X, Y)
        print("R^2 score: "+str(model.score(X, Y)))
        m = model.coef_
        b = model.intercept_
        # print("Intercept=" + str(b) + " Coefficient=" + str(m))
        return model

    def getBestFeaturesRCEV(self, col_names, dependent_col_name):
        array = self.data
        X = array.loc[:, col_names]
        Y = array.loc[:, dependent_col_name]
        feature_names = self.data.columns.values
        model = linear_model.LinearRegression()
        selector = RFECV(estimator=model, cv=10)
        selector.fit(X, Y)
        print(selector.ranking_)
        print("Optimal number of features3: " + str(selector.n_features_))
        print(sorted(zip(map(lambda x: round(x, 4), selector.ranking_), col_names)))
        print(selector.support_)
        
    def getXVals(self, col_names):
        #array = self.data
        array = self.data.copy()
        X = array.loc[:, col_names]
        return X
    
    def getYVals(self, col_names):
        #array = self.data
        array = self.data.copy()
        X = array.loc[:, col_names]
        return X
        
    def kFoldTest(self, col_names, dependent_col_name, test_row_num):
        #array = self.data
        array = self.data.copy()
        array = array.drop(array.index[test_row_num])
        
        X = array.loc[:, col_names]
        Y = array.loc[:, dependent_col_name]
        model = linear_model.LinearRegression()
        model.fit(X, Y)
        m = model.coef_
        b = model.intercept_
        return model
    
    def kFoldTest2(self, col_names, dependent_col_name, slice_size):
        #array = self.data
        array = self.data.copy()
        original = self.data.copy()
        iterations = len(array.index)/slice_size
        if(len(array.index)%slice_size>0):
            iterations = int(iterations)+1
            
        for j in range(0, iterations):
            for i in range(0,len(array.index)):
                if (i/slice_size==j):
                    array = array.drop(array.index[i])

            X = original.loc[:, col_names]
            Y = original.loc[:, dependent_col_name]
            model = linear_model.LinearRegression()
            model.fit(X, Y)
            m = model.coef_
            b = model.intercept_
            
            predicted_vals = []
            observed_vals = []
            array = self.data.copy()
            size = 0
            for i in range(0,len(array.index)):
                #print(str(i)+" / "+str(slice_size)+" == "+str(j))
                if (int(i/slice_size)==j):
                    #print("WENT IN!")
                    size += 1
                    print("length:"+str(len(X.index))+" index:"+str(i))
                    predicted_vals.append(model.predict(X.iloc[i , :].values.reshape(1,-1))[0])
                    print(Y.iloc[i])
                    observed_vals.append(Y.iloc[i])
            rms = sqrt(mean_squared_error(observed_vals, predicted_vals))
            rms = rms*((100*slice_size)/sum(observed_vals))
            print("Trained using slice "+str(j)+": RMSE = "+str(rms))  
            print("slice size: "+str(size))
            
    def statsmodelsRegressionSummary(self, col_names, dependent_col_name):
        array = self.data.copy()
        X = array.loc[:, col_names]
        Y = array.loc[:, dependent_col_name]
        result = sm.OLS( Y, X ).fit()
        return result

        


## Trip Generation Class

In [None]:
class TripGeneration:

    def __init__(self, pathToData, dependent_col_name):
        self.pathToData = pathToData
        self.dependent_col_name = dependent_col_name
        self.production_col_names = []
        self.production_constant = 0
        self.production_intercepts = []
        self.attraction_col_names = []
        self.attraction_constant = 0
        self.attraction_intercepts = []
        self.production_score = 0
        self.attraction_score = 0
        self.balancing_factor = 0
        
    def printAttributes(self):
        print("Attributes")
        print(self.production_col_names)
        print(self.production_constant)
        print(self.production_intercepts)
        print(self.attraction_col_names)
        print(self.attraction_constant)
        print(self.attraction_intercepts)

    def setProductionParameters(self, production_col_names, production_constant, production_intercepts):
        self.production_col_names = production_col_names
        self.production_constant = production_constant
        self.production_intercepts = production_intercepts
        
    def setAttractionParameters(self, attraction_col_names, attraction_constant, attraction_intercepts):
        self.attraction_col_names = attraction_col_names
        self.attraction_constant = attraction_constant
        self.attraction_intercepts = attraction_intercepts
        
    
    # get trip production score for 'zone'
    def getWholeTripProductionScore(self):
        data = pd.read_csv(self.pathToData, index_col=0)
        # implement specific way to get sub-table(data) just for specific 'zone' i.e: all rows related to zone1
        sub_table = data.loc[:, self.production_col_names]
        length_rows = sub_table.shape[0]
        for x in range(0, length_rows):
            row_values = sub_table.iloc[x, :].values
            self.production_score += self.production_constant
            for j in range(0,len(row_values)):
                self.production_score += int(row_values[j]*self.production_intercepts[j])
            #print("SELFPROD CURR: "+str(self.production_score))
        return int(self.production_score)

    # get trip attraction score for 'zone'
    def getWholeTripAttractionScore(self):
        data = pd.read_csv(self.pathToData, index_col=0)
        # implement specific way to get sub-table(data) just for specific 'zone' i.e: all rows related to zone1
        sub_table = data.loc[:, self.attraction_col_names]
        length_rows = sub_table.shape[0]
        for x in range(0, length_rows):
            row_values = sub_table.iloc[x, :].values
            self.attraction_score += self.attraction_constant
            for j in range(0,len(row_values)):
                self.attraction_score += int(row_values[j]*self.attraction_intercepts[j])
            #print("SELFATTR CURR: "+str(self.attraction_score))
        return int(self.attraction_score)
    
    def getZoneTripProductionScore(self, zone_number):
        self.production_score = 0
        data = pd.read_csv(self.pathToData, index_col=0)
        # implement specific way to get sub-table(data) just for specific 'zone' i.e: all rows related to zone1
        row_values = data.loc[zone_number, self.production_col_names].values
        
        self.production_score += self.production_constant
        for j in range(0,len(row_values)):
            self.production_score += row_values[j]*self.production_intercepts[j]

        return self.production_score
    
    def getZoneTripAttractionScore(self, zone_number):
        self.attraction_score = 0
        data = pd.read_csv(self.pathToData, index_col=0)
        # implement specific way to get sub-table(data) just for specific 'zone' i.e: all rows related to zone1
        row_values = data.loc[zone_number, self.production_col_names].values
        
        self.attraction_score += self.attraction_constant
        for j in range(0,len(row_values)):
            self.attraction_score += row_values[j]*self.attraction_intercepts[j]

        return self.attraction_score

    def doTripBalancing(self):
        self.balancing_factor = self.production_score/self.attraction_score
        self.attraction_score = self.balancing_factor * self.attraction_score
        self.production_score = self.balancing_factor * self.production_score
        # Implement trip balancing here VOID
        
    def getBalancingFactor(self):
        return self.balancing_factor
    
    def printAllZonalTripsProductionAttraction(self):
        productionScores = []
        attractionScores = []
        df = pd.DataFrame(columns=('Trip Production', 'Trip Atraction'))
        total_production = 0
        total_attraction = 0
        data = pd.read_csv(self.pathToData, index_col=0)
        length_rows = data.shape[0]
        for x in range(1, length_rows+1):
            attr_score = 0
            prod_score = 0
            attr_row_values = data.loc[x, self.attraction_col_names].values
            prod_row_values = data.loc[x, self.production_col_names].values
            attr_score += self.attraction_constant
            prod_score += self.production_constant
            for j in range(0,len(attr_row_values)):
                attr_score += attr_row_values[j]*self.attraction_intercepts[j]
            total_attraction += attr_score
            for j in range(0,len(prod_row_values)):
                prod_score += prod_row_values[j]*self.production_intercepts[j]
            total_production += prod_score
            df.loc[x] = [int(prod_score),int(attr_score)]
            productionScores.append(int(prod_score))
            attractionScores.append(int(attr_score))
            #print("Zone "+str(x)+": Production="+str(prod_score)+" , Attraction="+str(attr_score))
        return df, productionScores, attractionScores;
        #print("Total Production="+str(total_production)+" , Total Attraction="+str(total_attraction))
        
    def getTripProductionScores(self):
        productionScores = []
        total_production = 0
        total_attraction = 0
        data = pd.read_csv(self.pathToData, index_col=0)
        length_rows = data.shape[0]
        for x in range(1, length_rows+1):
            prod_score = 0
            prod_row_values = data.loc[x, self.production_col_names].values
            prod_score += self.production_constant
            for j in range(0,len(prod_row_values)):
                prod_score += prod_row_values[j]*self.production_intercepts[j]
            total_production += prod_score
            productionScores.append(prod_score)
        return productionScores
    
    def getProductionSubTable(self):
        data = pd.read_csv(self.pathToData, index_col=0)
        return data.loc[:, self.production_col_names]

        


# DATA
### Expected available parameters from the MMUTIS Update and Enhancement Project (MUCEP) Database

In [None]:
dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()

### Fused Zonal and Amenity data (OpenStreetMap)

In [None]:
dummy_data_amenities = pd.read_csv('DummyDataRegressionAmenitiex.csv', index_col=0)
dummy_data_amenities.head()

# DATA PREPARATION (Example Plan)

## Collinearity Checking using Peasrson Correlation (correlation coefficient, 2-tailed p-value)

In [None]:
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)
array = regression.getDataFrame().values
print("All collinearities:")
regression.printAllCollinearities()

### Checking for Optimal Features using Recursive Feature Elimination with Cross-Validation

In [None]:
col_names = regression.getDataFrame().columns.values[0:11]
print(str(1))
dep_col_name = "trips"
print(str(2))
regression.getBestFeaturesRCEV(col_names, dep_col_name)
print(str(3))
col_names

In [None]:
data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)
array2 = regression2.getDataFrame().values
print("All collinearities:")
regression2.printAllCollinearities()

In [None]:
col_names2 = regression2.getDataFrame().columns.values[1:15]
print(str(1))
dep_col_name2 = "trips"
print(str(2))
regression2.getBestFeaturesRCEV(col_names2, dep_col_name2)
print(str(3))
col_names2

In [None]:
 'util_area',
       'othe_area'
        
\n Correlation coeff=0.9125, P-value=3.31e-07'

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["comm_area"], 'bo')
x = df["trips"]
y = df["comm_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('comm_area')
plt.title('Observed Data (Trips produced & # of commercial area)\n Correlation coeff=0.7024, P-value=0.001667')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["park_area"], 'bo')
x = df["trips"]
y = df["park_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('park_area')
plt.title('Observed Data (Trips produced & # of park area)\n Correlation coeff=0.9400, P-value= 2.10e-08')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["indu_area"], 'bo')
x = df["trips"]
y = df["indu_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('indu_area')
plt.title('Observed Data (Trips produced & # of industrial area)\n Correlation coeff=0.1509, P-value=0.5632')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["agri_area"], 'bo')
x = df["trips"]
y = df["agri_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('agri_area')
plt.title('Observed Data (Trips produced & # of agricultural area)\n Correlation coeff=0.7027, P-value=0.0017')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["resi_area"], 'bo')
x = df["trips"]
y = df["resi_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('resi_area')
plt.title('Observed Data (Trips produced & # of residential area)\n Correlation coeff=0.7443, P-value=0.0006')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["util_area"], 'bo')
x = df["trips"]
y = df["util_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('util_area')
plt.title('Observed Data (Trips produced & # of utility area)\n Correlation coeff=0.8517, P-value=1.43e-05')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

### Verify Collinearity through Scatter Plots

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["othe_amt"], 'bo')
x = df["trips"]
y = df["othe_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('othe_amt')
plt.title('Observed Data (trips x othe_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["sust_amt"], 'bo')
x = df["trips"]
y = df["sust_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('sust_amt')
plt.title('Observed Data (trips x sust_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["educ_amt"], 'bo')
x = df["trips"]
y = df["educ_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('educ_amt')
plt.title('Observed Data (trips x educ_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["tran_amt"], 'bo')
x = df["trips"]
y = df["tran_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('tran_amt')
plt.title('Observed Data (trips x tran_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["heal_amt"], 'bo')
x = df["trips"]
y = df["heal_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('heal_amt')
plt.title('Observed Data (trips x heal_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["fina_amt"], 'bo')
x = df["trips"]
y = df["fina_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('fina_amt')
plt.title('Observed Data (trips x fina_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["ente_amt"], 'bo')
x = df["trips"]
y = df["ente_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('ente_amt')
plt.title('Observed Data (trips x ente_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["comm_amt"], 'bo')
x = df["trips"]
y = df["comm_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('comm_amt')
plt.title('Observed Data (trips x comm_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression2.getDataFrame()
plt.plot(df["trips"], df["comm_amt"], 'bo')
x = df["trips"]
y = df["comm_amt"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('comm_amt')
plt.title('Observed Data (trips x comm_amt)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["agri_area"], 'bo')
x = df["trips"]
y = df["agri_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('agri_area')
plt.title('Observed Data (trips x agri_area)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
Collinearity between no_hh & trips = (0.92914760415527753, 7.1205170410730196e-08)
Collinearity between avg_income & trips = (0.91992035789848337, 1.7354578828442896e-07)
Collinearity between mem_no & trips = (0.91247798270778113, 3.305852610863916e-07)
Collinearity between no_mem_educwork & trips = (0.91764041951966036, 2.1277373172145904e-07)

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["no_hh"], 'bo')
x = df["trips"]
y = df["no_hh"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('no_hh')
plt.title('Observed Data (Trips produced & # of households)\n Correlation coeff=0.9291, P-value=7.12e-08')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["mem_no"], 'bo')
x = df["trips"]
y = df["mem_no"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('mem_no')
plt.title('Observed Data (Trips produced & # of household members)\n Correlation coeff=0.9125, P-value=3.31e-07')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["avg_income"], 'bo')
x = df["trips"]
y = df["avg_income"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('avg_income')
plt.title('Observed Data (Trips produced & Average income)\n Correlation coeff=0.9199, P-value=1.73e-07')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["resi_area"], 'bo')
x = df["trips"]
y = df["resi_area"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('resi_area')
plt.title('Observed Data (trips x resi_area)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["trips"], df["no_mem_educwork"], 'bo')
x = df["trips"]
y = df["no_mem_educwork"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('trips')
plt.ylabel('no_mem_educwork')
plt.title('Observed Data (Trips produced & # of working/studying household members)\n Correlation coeff=0.9176, P-value=2.13e-07')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

In [None]:
df = regression.getDataFrame()
plt.plot(df["Trips"], df["Autos"], 'bo')
x = df["Trips"]
y = df["Autos"]
fit = np.polyfit(x, y, deg=1)
plt.plot(x, fit[0] * x + fit[1], color='red')
plt.xlabel('Trips')
plt.ylabel('Establishments')
plt.title('Observed Data (TRIPS x AUTOS)')
plt.grid(True)
# plt.axis([0, 220, 0, 120])
plt.show()

# Generate table that contains the aggregated information per zone, given the TAZ input file and the Households input file (income, houses, # amenities of each kind)

#### Create TripGenTableModel:
wherein its attributes are the columns in the table example (Out 348, remove trips column, shape coordinate, landuse).

#### Get number of TAZs
#### Get number of landuse per TAZ
#### Create [ ][ ] of TripGenTableModel

#### SAMPLE CODE
TripGenTableModel tgtm = [# of TAZs][ ]
for n in range(0,len(TAZ)):
    LUShapes lus = getImpactingShapes(TAZ[n]) #Returns the whole shape coors of impacting landuses together with its category
    for x in range(0, len(lus)):
        tgtm[n][x] = new TripGenTableModel()
        tgtm[n][x].setShapeCoordinate(TAZ[n], lus[x]) #sets a coor shape for the overlapping of the TAZ shape and landuse shape
        
for cbms_file in cmbs_files:
    for row in cbms_file:
        lat, long = getCoor(row)
        add_info_to_respective_TGTM(lat, long, row) #adds the info of the HH to the correct zone and landuse
        
        
        
        



# Detemine landuse of each zone using the land use file

# Get the corresponding trip productions and attractions per zone given the parameters

# MODEL FITTING

### Retreival of intercepts and coefficients
**Features for Trip production**: # of HH, # of HH members, # of employed, # of enrolled, Income, Landuse and Amenity Data

**Features for Trip attraction**: # of Establishments, # of working in area, # of enrolled in area, Landuse and Amenity Data

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegressionHHTripsOnly2.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)

print("Prod Intercept:"+str(fitted_production_model.intercept_)+" Coef:"+str(fitted_production_model.coef_))
print("Attr Intercept:"+str(fitted_production_model_attr.intercept_)+" Coef:"+str(fitted_production_model_attr.coef_))

dummy_data = pd.read_csv('DummyDataRegressionHHTripsOnly.csv', index_col=0)
dummy_data.head()

In [None]:
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)

for x in range(len(X_vals.index)):
    Predicted_vals = []
    print("Test row: "+str(x))
    fitted_production_model2 = regression.kFoldTest(selected_feature_names, dep_col_name, x);
    #testdframe.head()
    #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
    for z in range(len(X_vals.index)):
        #print("Actual Zone "+str(z)+": "+str(Y_vals.iloc[z]))
        #print("Predicted Zone "+str(z)+": "+str(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0]))
        Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0])
    rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
    print("Trained using index "+str(x)+": RMS = "+str(rms))

In [None]:
rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
rms

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork","comm_area","park_area", "indu_area","agri_area",
                         "resi_area", "util_area","othe_area"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)

print("Prod Intercept:"+str(fitted_production_model.intercept_)+" Coef:"+str(fitted_production_model.coef_))
print("Attr Intercept:"+str(fitted_production_model_attr.intercept_)+" Coef:"+str(fitted_production_model_attr.coef_))

dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()

In [None]:
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)

for x in range(len(X_vals.index)):
    Predicted_vals = []
    print("Test row: "+str(x))
    fitted_production_model2 = regression.kFoldTest(selected_feature_names, dep_col_name, x);
    #testdframe.head()
    #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
    for z in range(len(X_vals.index)):
        #print("Actual Zone "+str(z)+": "+str(Y_vals.iloc[z]))
        #print("Predicted Zone "+str(z)+": "+str(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0]))
        Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0])
    rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
    print("Trained using index "+str(x)+": RMS = "+str(rms))

In [None]:
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)
Predicted_vals = []
for x in range(len(X_vals.index)):
    
    print("Test row: "+str(x))
    fitted_production_model2 = regression.kFoldTest(selected_feature_names, dep_col_name, x);
    #testdframe.head()
    #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
    #fitted
    print("Actual Zone "+str(x)+": "+str(Y_vals.iloc[x]))
    print("Predicted Zone "+str(x)+": "+str(fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))[0]))
    Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))[0])

In [None]:
rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
rms

In [None]:
fitted_production_model2, testdframe = regression.kFoldTest(selected_feature_names, dep_col_name, 1)
X_vals = regression.getXVals(selected_feature_names)
testdframe.head()
fitted_production_model2.predict(X_vals.loc[1 , :])

### Prediction of Trip Production & Attraction using the intercepts and coefficients

In [None]:
trip_gen = TripGeneration("DummyDataRegressionFused.csv", "trips")
prod_col_names = selected_feature_names
attr_col_names = selected_feature_names_attraction
trip_gen.setProductionParameters(prod_col_names, fitted_production_model.intercept_, fitted_production_model.coef_)
trip_gen.setAttractionParameters(attr_col_names, fitted_production_model_attr.intercept_, fitted_production_model_attr.coef_)
print("Example:")
print("Overall Trip Production: "+str(trip_gen.getWholeTripProductionScore()))
print("Overall Trip Attraction: "+str(trip_gen.getWholeTripAttractionScore()))

### Computing for Trip Balancing Factor

In [None]:
trip_gen.doTripBalancing()
print("Balancing Factor: "+str(trip_gen.getBalancingFactor()))


### Zonal-level Trip Production & Attraction Scores

In [None]:
zonal_df, productions, attractions = trip_gen.printAllZonalTripsProductionAttraction()
display(zonal_df)

# Model Validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
X_digits = regression.getSubTable(selected_feature_names)
Y_digits = regression.getSubTable(dep_col_name)
scores = cross_validation.cross_val_score(fitted_production_model, X_digits, Y_digits, scoring='r2', cv=10,)
#X_digits.shape, Y_digits.shape
scores

# MODEL OUTPUT

# Trip Distribution Class

In [16]:

from IPython.display import HTML, display
import math as math


class TripDistribution:

    def __init__(self, productions, attractions, travelTime, fare, income):
        self.productions = productions
        self.attractions = attractions
        self.travelTime = travelTime
        self.fare = fare
        self.income = income
        self.row = len(productions)
        self.col = len(attractions)
        self.possibleError = sum(productions) * 0.2
        self.error = 0

    def getGeneralizedCost(self, cost):
        return 1.0 / (cost * cost)
    
    def computeCost(self, travelTime, fare, income):
        costMatrix = [[1 for x in range(self.row)] for y in range(self.col)]
        for x in range(self.row):
            for y in range(self.col):
                costMatrix[x][y] = travelTime[x][y] * income[x] + fare[x][y]
        return costMatrix

    def getTripDistribution(self):
        distributions = [[self.attractions[y] for x in range(self.row)] for y in range(self.col)]
        finalDistributions = [[self.attractions[y] for x in range(self.row)] for y in range(self.col)]
        #costMatrix = [[1 for x in range(self.row)] for y in range(self.col)]
        costMatrix = self.computeCost(self.travelTime, self.fare, self.income)
        A = [1 for x in range(self.row)]
        B = [1 for x in range(self.col)]
        A = self.computeA(B, costMatrix)
        B = self.computeB(A, costMatrix)
        
        currentBalancingFactor = 0  # 0 for A, 1 for B
        isConvergent = False
        shit =0
        smallestError = 1000000000

#         while isConvergent == False:
        for x in range(1000):
            if currentBalancingFactor == 0:
                tempA = self.computeA(B, costMatrix)
                A = tempA
                currentBalancingFactor = 1
            elif currentBalancingFactor == 1:
                tempB = self.computeB(A, costMatrix)
                B = tempB
                currentBalancingFactor = 0
            distributions = self.computeDistributions(A, B, costMatrix)
            error = self.getError(distributions)
            if(smallestError > error and error != 0):
                print(error)
                print(distributions)
                smallestError = error
                finalDistributions = distributions
                self.error = error
                shit = x
#             isConvergent = self.checkIfConvergent(distributions)
        print(shit)
        return finalDistributions

    def computeDistributions(self, A, B, costMatrix):
        distributions = [[self.attractions[y] for x in range(self.row)] for y in range(self.col)]
        for x in range(self.row):
            for y in range(self.col):
                distributions[x][y] = A[x] * self.productions[x] * B[y] * self.attractions[y] * self.getGeneralizedCost(costMatrix[x][y])
        return distributions

    def checkIfConvergent(self, distributions):
        error = self.getError(distributions)
        if error <= self.possibleError:
            self.error = error
            return True
        return False

    def getError(self, distributions):
        error = 0
        derivedProductions = [0 for x in range(self.row)]
        derivedAttractions = [0 for x in range(self.col)]

        for x in range(self.row):
            for y in range(self.col):
                derivedProductions[x] += distributions[x][y]
                derivedAttractions[y] += distributions[x][y]

        for x in range(self.row):
            error += abs(derivedProductions[x] - self.productions[x])
            error += abs(derivedAttractions[x] - self.attractions[x])

        return error
    
    
    

    def computeA(self, B, costMatrix):
        A = [1 for x in range(self.row)]
        for x in range(0, self.row):
            sum = 0.0
            for y in range(0, self.col):
                sum += B[y] * self.attractions[y] * self.getGeneralizedCost(costMatrix[x][y])
            A[x] = 1.0 / sum
        return A

    def computeB(self, A, costMatrix):
        B = [1 for x in range(self.col)]
        for x in range(0, self.row):
            sum = 0.0
            for y in range(0, self.col):
                sum += A[y] * self.productions[y] * self.getGeneralizedCost(costMatrix[x][y])
            B[x] = 1.0 / sum
        return B

# Data

## The result from trip generation (trip production and attraction) will be passed to trip distribution

In [17]:
def computeYearlyToHourlyRate(salary):
    return salary/(30.00 * 8)

def deg2rad(deg):
    return deg * (math.pi/180)

def getDistance(lat1, lng1, lat2, lng2):
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = ((math.sin(deg2rad(dlat/2)))*(math.sin(deg2rad(dlat/2)))) + math.cos(deg2rad(lat1)) *math.cos(deg2rad(lat2)) *((math.sin(deg2rad(dlon/2)))*(math.sin(deg2rad(dlon/2))))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d =  6373* c
    return d



def computeNearestZones(lat, lng) :
    nearestZones = [None] * len(lat)
    for currZone in range(len(lat)):
        shortest = 1000
        nearestZoneIndex = 0
        for currPoint in range(len(lat)):
            if(currZone != currPoint): 
                distance = getDistance(lat[currZone], lng[currZone], lat[currPoint], lng[currPoint])
                if(distance < shortest):
                    shortest = distance
                    nearestZoneIndex = currPoint
        nearestZones[currZone] = nearestZoneIndex
        shortest = 1000
        nearestZoneIndex = 0
    return nearestZones

productions = [
    102989, 
    73253,
    92057,
    15821,
    56998, 
    103039, 
    32610, 
    172844
]
attractions = [
    79305.39, 
    81250.91, 
    76218.9, 
    11167.77, 
    38786.66, 
    133188.75, 
    50085.41, 
    179607.21
]

nearestZones = [
    0, 
    0,
    0,
    0,
    0,
    0,
    0,
    0
]

lat = [
    14.711029630751996,
    14.681442858628275,
    14.708693067291241, 
    14.686174778272566, 
    14.660783607519472,
    14.65615255044289,
    14.630416868404804,
    14.637488044822764
]

lng = [
    120.960596201819,
    120.98000062117379,
    121.00151327979542,
    121.00714359391715,
    121.09957600978247,
    121.11920195444038,
    121.080185938878,
    121.09795399889786
]

travelTime = [
        [001.00, 027.00, 053.67, 056.67, 140.67, 142.67, 114.67, 123.67], 
        [028.33, 001.00, 050.67, 051.00, 124.33, 132.00, 098.33, 106.67],
        [052.33, 050.33, 001.00, 037.00, 127.67, 143.00, 109.00, 117.33], 
        [057.00, 051.00, 037.00, 001.00, 122.67, 130.00, 096.33, 104.67],
        [151.33, 133.67, 136.00, 131.33, 001.00, 031.67, 038.67, 026.67],
        [157.00, 139.33, 149.67, 137.33, 032.00, 001.00, 041.67, 030.33],
        [120.00, 102.00, 113.00, 096.67, 037.00, 040.67, 001.00, 015.00], 
        [129.00, 111.00, 121.67, 105.33, 029.00, 029.67, 015.00, 001.00]
        ]

fares = [
        [01.00, 10.67, 17.65, 25.33, 54.35, 53.55, 47.00, 48.65], 
        [12.00, 01.00, 56.65, 16.37, 45.60, 45.14, 38.58, 40.23],
        [17.65, 16.22, 01.00, 10.67, 43.72, 46.08, 40.25, 40.57], 
        [16.67, 18.67, 10.67, 01.00, 42.02, 41.64, 35.08, 36.73],
        [52.00, 43.33, 40.97, 39.41, 01.00, 10.67, 11.31, 08.00], 
        [52.77, 44.10, 45.35, 40.27, 08.00, 01.00, 10.64, 08.03],
        [45.17, 36.50, 37.83, 34.50, 11.01, 10.24, 01.00, 08.00], 
        [47.43, 38.76, 40.09, 36.03, 09.33, 08.00, 08.00, 01.00]
        ]
salary = [computeYearlyToHourlyRate(24570.49), 
          computeYearlyToHourlyRate(16576.02), 
          computeYearlyToHourlyRate(21038.92), 
          computeYearlyToHourlyRate(16985.72), 
          computeYearlyToHourlyRate(29038.00), 
          computeYearlyToHourlyRate(30548.05), 
          computeYearlyToHourlyRate(27276.91), 
          computeYearlyToHourlyRate(39221.55)];
            
        

    
nearestZones = computeNearestZones(lat, lng)

for x in range(len(productions)): 
    fares[x][x] = fares[x][nearestZones[x]] / 2
    travelTime[x][x] = travelTime[x][nearestZones[x]] / 2

td = TripDistribution(productions, attractions, travelTime, fares, salary)

print("Productions: " + str(productions))
print("Attractions: " + str(attractions))

3.8972919033706943
3.8972919033706943
3.8137190579442217
2.9675509846852295
4.409695353064177
3.8137190579442217
2.576917469557242
5.7204275656322325
2.9675509846852295
2.576917469557242
15.963942541851932
13.070287911910077
11.820569829043027
10.3391369702715
2.173848992699761
18.12430372972955
15.240532763981337
13.946521878486973
12.511710142114591
2.173848992699761
15.684105726598903
12.183710760129195
12.143735071696549
10.01221924018903
3.9702638544076576
2.0676624512310244
16.89257423199684
13.601889082590215
13.054326989667869
11.172150066414227
2.5970341357325517
2.0676624512310244
Productions: 592941
Attractions: 612940.9899999999
[[5.335, 10.67, 17.65, 25.33, 54.35, 53.55, 47.0, 48.65], [12.0, 8.185, 56.65, 16.37, 45.6, 45.14, 38.58, 40.23], [17.65, 16.22, 5.335, 10.67, 43.72, 46.08, 40.25, 40.57], [16.67, 18.67, 10.67, 5.335, 42.02, 41.64, 35.08, 36.73], [52.0, 43.33, 40.97, 39.41, 5.335, 10.67, 11.31, 8.0], [52.77, 44.1, 45.35, 40.27, 8.0, 4.0, 10.64, 8.03], [45.17, 36.5, 

In [18]:
distribution = td.getTripDistribution()

display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in distribution)
        )
))

114352.19823434926
[[70247.60194165187, 17836.721207754777, 5177.564512112711, 708.6143610108078, 254.6933718503441, 1315.0642623606677, 387.26335068681584, 1170.4769925720236], [23736.093182243825, 29843.442113731326, 8462.379962851233, 1307.0744610905363, 485.8564548234489, 2290.4495442977154, 784.6063446350653, 2344.09793632684], [7160.260973292083, 7864.446024604624, 66726.54687384782, 2551.3182231600554, 473.42226570015305, 2004.7133857232973, 656.1343787187459, 1991.1578749532155], [1916.3023408660729, 2426.1493864133236, 5291.625275353656, 3237.241098819201, 162.61928987149386, 769.3646502220797, 266.4070552224317, 793.2909032317413], [383.6269028994707, 499.54438025502157, 552.9853331748842, 90.69803214254205, 13791.707489510116, 18308.65946094935, 2339.4193596117966, 17282.35904145682], [375.1736559394566, 483.97094188391424, 480.5255466163973, 87.30670726551925, 3559.303455143602, 75600.29813911818, 2121.61059029445, 14071.810963738464], [141.2557134776913, 198.63381027852913

0,1,2,3,4,5,6,7
68928.45356272021,18153.558723550035,5821.363151743,781.3398338581621,294.9424330091603,1714.5868545477877,356.6171400884119,1047.1383004832478
22721.359465954272,29631.5046840644,9282.174689127269,1406.0099295056748,548.890399664793,2913.3407650643567,704.8645553075121,2045.8555113117227
6384.273359237667,7273.282850110643,68173.15841569126,2556.290688788163,498.1772441321383,2375.091771732585,549.0402688155403,1618.685401492026
1745.050892085087,2291.6132450304863,5521.604438201799,3312.6999166400083,174.77073427707202,930.9401844346642,227.67669122143968,658.6438981094426
339.36180237924145,458.3610420336835,560.5310264373448,90.16017600409896,14398.74111442678,21520.67709486677,1942.185752513499,13938.981991338584
300.0719734362416,401.5060598539057,440.3943335346348,78.47000725940435,3359.7789045365585,80345.59837149204,1592.5297566276347,10261.6505932596
146.03224459641936,212.9980509303399,219.70132550531787,45.00849742957519,714.5098760010629,3538.374546219186,13920.47859616104,11881.896863157055
301.3051469865538,428.80641875762313,451.73373088308034,90.39314787303242,2770.979426238998,15836.6007665557,8303.96067018563,113406.2206925194


# Error

In [19]:
print("Error: " + str(td.error))

Error: 79897.3333981335


# Modal Split Class

In [None]:
import random
class ModalSplit:

    def __init__(self, od_matrix, pathToData, income, fares, travelTimes):
        self.od_matrix = od_matrix
        self.pathToData = pathToData
        self.travel_costs = []
        self.travel_probabilities = []
        self.modes = ['jeep','bus']
        self.income = income
        self.fares = fares
        self.travelTimes = travelTimes
        self.travel_costs = [None] * len(self.modes)
        
    def computeGeneralizedCosts(self, mode_number):
        costMatrix = [[1 for x in range(len(self.od_matrix))] for y in range(len(self.od_matrix))]
        for x in range(len(self.od_matrix)):
            for y in range(len(self.od_matrix)):
                costMatrix[x][y] = (self.travelTimes[mode_number][x][y]/60) * self.income[x] + self.fares[mode_number][x][y]
                if(x == 1 and y == 2):
                    print(self.travelTimes[mode_number][x][y])
                    print(self.income[x])
                    print(self.fares[mode_number][x][y])
                    print(costMatrix[x][y])
            
        return costMatrix
        
#         #data = pd.read_csv(self.pathToData, index_col=0)
            
#         self.travel_costs = [None] * len(self.modes)
#         for x in range(0, len(self.modes)):
#             self.travel_costs[x] = random.randrange(1,4)
#         #Compute for generalized cost for each mode for this specific zone
#         # populate self.travel_costs with the travel costs
#         self.computeModalProbabilities()
        
    def computeModalProbabilities(self, mode_number, beta):
        travel_probabilities = [[1 for x in range(len(self.od_matrix))] for y in range(len(self.od_matrix))]
        sum = 0
        #print(len(self.travel_costs))
        for x in range(len(self.od_matrix)):
            for y in range(len(self.od_matrix)):
                sum = 0
                for k in range(len(self.modes)):
                    sum += math.e ** ((-beta)*self.travel_costs[k][x][y])
                print (sum)
                travel_probabilities[x][y] = math.e ** ((-beta)*self.travel_costs[mode_number][x][y]) / sum
        
        print(travel_probabilities)
        return travel_probabilities
        #print(self.travel_costs)
        #print(self.travel_probabilities)
        
    def getBeta(self, mode_number):
        sum = 0
        for x in range(len(self.od_matrix)):
            for y in range(len(self.od_matrix)):
                sum += self.travel_costs[mode_number][x][y]
        return 1/(sum/(len(self.od_matrix) * len(self.od_matrix)))
    
    def getSplittedTrips(self, mode_number):
        splittedTrips = [[1 for x in range(len(self.od_matrix))] for y in range(len(self.od_matrix))]
        for x in range(len(self.od_matrix)):
            for y in range(len(self.od_matrix)):
                splittedTrips[x][y] = self.od_matrix[x][y] * self.travel_probabilities[mode_number][x][y]
        
        return splittedTrips

    def process_od_matrix(self):
        #print("size:"+str(len(self.od_matrix))+","+str(len(self.od_matrix[0])))
        #df = DataFrame(columns=('lib', 'qty1', 'qty2'))
        #for i in range(5):
            #df.loc[i] = [randint(-1,1) for n in range(3)]
        for x in range(len(self.modes)):
            self.travel_costs[x] = self.computeGeneralizedCosts(x)
        
        beta = [None] * len(self.modes)
        for x in range(len(self.modes)):
            beta[x] = self.getBeta(x)
        
        self.travel_probabilities = [None] * len(self.modes)
        for x in range(len(self.modes)):
            self.travel_probabilities[x] = self.computeModalProbabilities(x, beta[x]);
        
        final_matrices = [None] * len(self.modes)
        for x in range(len(self.modes)):
            final_matrices[x] = self.getSplittedTrips(x)
            
        return final_matrices
                

In [None]:
fareBus = [
    [00.00, 12.00, 12.00, 31.92, 55.08, 56.58, 38.75, 41.75],
    [12.00, 00.00, 25.08, 16.00, 46.83, 48.33, 30.83, 33.83],
    [12.00, 25.00, 00.00, 12.00, 48.42, 49.92, 32.42, 35.42],
    [32.25, 16.00, 12.00, 00.00, 43.17, 44.67, 27.33, 30.33],
    [55.08, 46.50, 47.67, 42.67, 00.00, 00.01, 00.01, 00.01],
    [59.42, 50.83, 52.00, 47.00, 00.01, 00.00, 00.01, 00.01],
    [34.75, 26.00, 27.33, 22.25, 09.13, 08.25, 00.00, 06.25],
    [47.75, 39.00, 40.33, 35.25, 06.00, 06.00, 00.01, 00.00]
]

travelTimeBus = [
    [000.00, 027.00, 062.00, 075.33, 172.33, 189.33, 130.00, 157.67],
    [028.33, 000.00, 071.00, 054.33, 156.00, 173.33, 113.67, 141.67],
    [062.00, 071.00, 000.00 ,037.00, 166.67, 184.33, 124.00, 152.00],
    [076.00, 054.33, 037.00, 000.00, 154.00, 171.33, 111.67, 139.67],
    [177.00, 159.00, 169.33, 157.33, 000.00, 036.00, 070.00, 039.00],
    [198.67, 180.67, 191.00, 178.67, 036.00, 000.00, 079.00, 047.00],
    [136.00, 118.00, 128.33, 116.00, 062.00, 072.50, 000.00, 033.50],
    [165.67, 148.33, 158.33, 146.00, 034.50, 044.50, 031.00, 000.00]
]

fareJeep = [
    [00.00, 08.00, 16.32, 21.33, 45.06, 47.37, 42.31, 42.35],
    [08.00, 00.00, 16.22, 08.37, 41.79, 42.24, 37.18, 37.23],
    [16.32, 16.22, 00.00, 08.00, 39.64, 43.83, 41.15, 38.91],
    [21.33, 08.47, 08.00, 00.00, 32.68, 35.37, 31.51, 31.53],
    [46.88, 43.99, 40.63, 32.87, 00.00, 10.67, 11.31, 08.00],
    [50.86, 46.95, 46.64, 35.79, 08.00, 00.00, 10.64, 08.03],
    [43.81, 37.56, 41.61, 31.76, 11.01, 10.30, 00.00, 08.00],
    [43.88, 39.71, 39.61, 31.78, 08.00, 08.00, 08.00, 00.00]
]

travelTimeJeep = [
    [000.00, 028.33, 053.67, 056.67, 144.67, 152.33, 118.00, 126.67], 
    [029.00, 000.00, 050.67, 047.67, 132.33, 134.67, 100.33, 109.00],
    [053.33, 050.33, 000.00, 037.00, 132.00, 137.33, 114.33, 118.33],
    [057.00, 047.67, 037.00, 000.00, 117.33, 122.33, 088.00, 096.67],
    [147.67, 141.67, 133.33, 118.33, 000.00, 031.67, 038.67, 026.67],
    [157.33, 142.67, 139.00, 124.00, 032.00, 000.00, 041.67, 030.33],
    [121.00, 109.67, 115.00, 088.00, 037.00, 040.67, 000.00, 015.00],
    [130.00, 115.67, 119.33, 096.33, 029.00, 029.67, 015.00, 000.00]
]


for x in range(len(productions)): 
    fareBus[x][x] = fareBus[x][nearestZones[x]] / 2
    travelTimeBus[x][x] = travelTimeBus[x][nearestZones[x]] / 2
    fareJeep[x][x] = fareJeep[x][nearestZones[x]] / 2
    travelTimeJeep[x][x] = travelTimeJeep[x][nearestZones[x]] / 2

fares = [fareJeep, fareBus]
travelTimes = [travelTimeJeep, travelTimeBus]

modal_split = ModalSplit(distribution,"datapath", salary, fares, travelTimes)
list_of_dataframes_by_mode = modal_split.process_od_matrix()

## Mode 1 (Jeep)

In [None]:
display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in list_of_dataframes_by_mode[0])
        )
))

## Mode 2 (Bus)

In [None]:
display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in list_of_dataframes_by_mode[1])
        )
))

## Mode 3

## TRIP GENERATION TESTING #7:
Compare the results of including and excluding the landuse dataset to the travel demand model by cross-checking it with the results of other studies

### 1.) without LANDUSE variables

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegressionHHTripsOnly2.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)

#print("Prod Intercept:"+str(fitted_production_model.intercept_)+" Coef:"+str(fitted_production_model.coef_))
#print("Attr Intercept:"+str(fitted_production_model_attr.intercept_)+" Coef:"+str(fitted_production_model_attr.coef_))

dummy_data = pd.read_csv('DummyDataRegressionHHTripsOnly.csv', index_col=0)
dummy_data.head()

## Additional regression info:

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegressionHHTripsOnly2.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
result = regression.statsmodelsRegressionSummary(selected_feature_names, dep_col_name)
result.summary()

## K-fold validation (6-fold) and measuring by % RMSE values

In [None]:
totalrmse = 0
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)
regression.kFoldTest2(selected_feature_names, dep_col_name, 3)


## K-fold validation (17-fold)

In [None]:
totalrmse = 0
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)

for x in range(len(X_vals.index)):
    Predicted_vals = []
    print("Test row: "+str(x))
    fitted_production_model2 = regression.kFoldTest(selected_feature_names, dep_col_name, x);
    #testdframe.head()
    #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
    for z in range(len(X_vals.index)):
        #print("Actual Zone "+str(z)+": "+str(Y_vals.iloc[z]))
        #print("Predicted Zone "+str(z)+": "+str(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0]))
        Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0])
    rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
    print("raw RMSE: "+str(rms))
    rms = rms*((100*17)/sum(Y_vals.values))
    totalrmse += rms
    print("Trained using index "+str(x)+": %RMSE = "+str(rms))
print("TOTAL RMSE: "+str(totalrmse))

### 2.) with LANDUSE variables

In [None]:
dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
#fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
#fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)
##

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork"]
landuse_names = ["comm_area","park_area", "indu_area","agri_area",
                         "resi_area", "util_area"]
totalrmse = 0
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names+landuse_names)
Y_vals = regression.getYVals(dep_col_name)
best_comb = []
best_score = 0
initialized = False

for L in range(1, len(selected_feature_names+landuse_names)+1):
  for subset in itertools.combinations(selected_feature_names+landuse_names, L):
    curr_cols = list(subset)
    print("current: "+str(curr_cols))
    totalrmse = 0
    for x in range(len(X_vals.index)):
        Predicted_vals = []
        #print("Test row: "+str(x))
        fitted_production_model2 = regression.kFoldTest(curr_cols, dep_col_name, x);
        #testdframe.head()
        #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
        for z in range(len(X_vals.index)):
            #print("Actual Zone "+str(z)+": "+str(Y_vals.iloc[z]))
            #print("Predicted Zone "+str(z)+": "+str(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0]))
            #print("subset"+str(list(subset)))
            Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[z , :].loc[curr_cols].values.reshape(1,-1))[0])
        rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
        totalrmse += rms
        #print("Trained using index "+str(x)+": RMSE = "+str(rms))
    print("TOTAL RMSE: "+str(totalrmse))
    if(initialized == False):
        best_score = totalrmse
        initialized = True 
        best_comb = curr_cols
    else:
        if(totalrmse < best_score):
            best_score = totalrmse
            best_comb = curr_cols
print("\n \nBest Score: "+str(best_score))
print("Best Combination: "+str(best_comb)) 


## TRIP GENERATION TESTING #6:
Examine the effect of increasing/decreasing the number of each of the types of landuse per zone, and verify its effect to the number of trips produced and attracted per zone

## Base/Original data 

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork","comm_area","park_area", "indu_area","agri_area",
                         "resi_area", "util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)

dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()

## Case 1:
Using the 1st row of the original data (zone 1), increase the number of residential areas, and check its effect to the resulting trips produced.

In [None]:
data_filepath2 = "DummyDataRegressionEXP6.csv"
regression2 = Regression(data_filepath2)
dummy_data = pd.read_csv(data_filepath2, index_col=0)
dummy_data.head()

## Case 1 Result:
The predicted number of trips produced from zone 1, given the original number of residential areas.

In [None]:
X_vals = regression.getXVals(selected_feature_names)
fitted_production_model.predict(X_vals.iloc[0 , :].values.reshape(1,-1))[0]

The predicted number of trips produced from zone 1, given that the number of residential areas were increased.

In [None]:
X_vals = regression2.getXVals(selected_feature_names)
fitted_production_model.predict(X_vals.iloc[0 , :].values.reshape(1,-1))[0]

# Scatter plots for the zonal trips produced of other studies

## 1.) HHIS from MUCEP

In [None]:
fig = plt.figure(figsize=(14,8))
ax1 = fig.add_subplot(221)
ax1.grid()
hhisSum = sum(Y_vals.values)
mark1 = ax1.plot(range(0, 17), Y_vals.values, label="DOTR Zonal Trips", ls="dotted", marker="o")
ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

## 2.) A study based on MUCEP

In [None]:
FILONE_MUCEP_TRIP_PROD = [326411,93639,535638,63112,231470,608551,84204,157800,29288,200924,153585
                        ,159682,7882,1313553,37197,134726,120622]
filoneSum = sum(FILONE_MUCEP_TRIP_PROD)
ratio = hhisSum/filoneSum
FILONE_MUCEP_TRIP_PROD = [x * ratio for x in FILONE_MUCEP_TRIP_PROD]
print(str(sum(FILONE_MUCEP_TRIP_PROD)) + " " + str(hhisSum))
fig = plt.figure(figsize=(14,8))
ax1 = fig.add_subplot(221)
ax1.grid()
mark1 = ax1.plot(range(0, 17), FILONE_MUCEP_TRIP_PROD, label="Filone Stury Zonal Trips", ls="dotted", marker="o")
ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
difference = myFormattedList = [ round(elem, 1) for elem in (FILONE_MUCEP_TRIP_PROD - Y_vals.values)]
print(Y_vals.values)
print(FILONE_MUCEP_TRIP_PROD)
print([ round(elem, 1) for elem in FILONE_MUCEP_TRIP_PROD - Y_vals.values ])


# SCRATCH CELLS:

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork","comm_area","park_area", "indu_area","agri_area",
                         "resi_area", "util_area","othe_area"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)

print("Prod Intercept:"+str(fitted_production_model.intercept_)+" Coef:"+str(fitted_production_model.coef_))
print("Attr Intercept:"+str(fitted_production_model_attr.intercept_)+" Coef:"+str(fitted_production_model_attr.coef_))

dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()

In [None]:
totalrmse = 0
print(selected_feature_names)
X_vals = regression.getXVals(selected_feature_names)
Y_vals = regression.getYVals(dep_col_name)

for x in range(len(X_vals.index)):
    Predicted_vals = []
    print("Test row: "+str(x))
    fitted_production_model2 = regression.kFoldTest(selected_feature_names, dep_col_name, x);
    #testdframe.head()
    #fitted_production_model2.predict(X_vals.iloc[x , :].values.reshape(1,-1))
    for z in range(len(X_vals.index)):
        #print("Actual Zone "+str(z)+": "+str(Y_vals.iloc[z]))
        #print("Predicted Zone "+str(z)+": "+str(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0]))
        Predicted_vals.append(fitted_production_model2.predict(X_vals.iloc[z , :].values.reshape(1,-1))[0])
    rms = sqrt(mean_squared_error(Y_vals.values, Predicted_vals))
    totalrmse += rms
    print("Trained using index "+str(x)+": RMSE = "+str(rms))
print("TOTAL RMSE: "+str(totalrmse))

In [None]:
dep_col_name = "trips"
data_filepath = "DummyDataRegression.csv"
regression = Regression(data_filepath)

data_filepath2 = "DummyDataRegressionAmenitiex.csv"
regression2 = Regression(data_filepath2)

selected_feature_names = ["no_hh","avg_income","mem_no","no_mem_educwork","comm_area","park_area", "indu_area","agri_area",
                         "resi_area", "util_area","othe_area"]
selected_feature_names_attraction = ["sust_amt","educ_amt","tran_amt","heal_amt","fina_amt","comm_amt", "ente_amt","othe_amt",
                        "comm_area", "park_area","indu_area","agri_area","resi_area","util_area","othe_area"]
fitted_production_model = regression.getFittedRegressionModel(selected_feature_names, dep_col_name)
fitted_production_model_attr = regression2.getFittedRegressionModel(selected_feature_names_attraction, dep_col_name)

#print("Prod Intercept:"+str(fitted_production_model.intercept_)+" Coef:"+str(fitted_production_model.coef_))
#print("Attr Intercept:"+str(fitted_production_model_attr.intercept_)+" Coef:"+str(fitted_production_model_attr.coef_))

dummy_data = pd.read_csv('DummyDataRegression.csv', index_col=0)
dummy_data.head()