In [8]:
import pandas as pd
import numpy as np
import scipy as sp
from geopy import distance
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon


In [9]:
def taxi_data_light(n=3, path='train.csv'):
    """Load a thinned-out sample of the training data.

    Only every ``n``-th line of the CSV file is read. Line 0 (the header) is
    always kept because ``0 % n == 0``; the data rows kept are file lines
    ``n, 2n, 3n, ...``.

    Parameters
    ----------
    n : int, default 3
        Sampling stride. ``n=1`` reads every row.
    path : str, default 'train.csv'
        Location of the training CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``fare_amount``, ``pickup_datetime`` (timezone-aware, UTC),
        the four pickup/dropoff coordinate columns (float32) and
        ``passenger_count`` (uint8).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    col_datatypes = {'fare_amount': 'float32',
                     'pickup_datetime': 'str',
                     'pickup_longitude': 'float32',
                     'pickup_latitude': 'float32',
                     'dropoff_longitude': 'float32',
                     'dropoff_latitude': 'float32',
                     'passenger_count': 'uint8'}

    X_light = pd.read_csv(path, dtype=col_datatypes, header=0,
                          usecols=list(col_datatypes),
                          skiprows=lambda i: i % n != 0)

    # Keep only the first 16 characters ('YYYY-MM-DD HH:MM') so that one
    # fixed format string can be used for parsing.
    X_light['pickup_datetime'] = X_light['pickup_datetime'].str.slice(0, 16)
    X_light['pickup_datetime'] = pd.to_datetime(X_light['pickup_datetime'], utc=True,
                                                format='%Y-%m-%d %H:%M')

    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    X_light.info()

    return X_light

In [10]:
def taxi_data_testset():
    """Read the test set from ``test.csv`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The ``key`` and ``pickup_datetime`` columns as strings (the timestamp
        is NOT parsed here), the four coordinate columns as float32 and
        ``passenger_count`` as uint8. Other columns in the file are ignored.
    """
    coordinate_columns = ('pickup_longitude', 'pickup_latitude',
                          'dropoff_longitude', 'dropoff_latitude')

    # Build the column -> dtype mapping; its keys double as the column filter.
    schema = {'pickup_datetime': 'str'}
    schema.update({name: 'float32' for name in coordinate_columns})
    schema['passenger_count'] = 'uint8'
    schema['key'] = 'str'

    return pd.read_csv('test.csv', header=0, usecols=list(schema), dtype=schema)

In [11]:
def taxi_data_full():
    """Load every row of ``train.csv`` from the working directory.

    Returns
    -------
    pandas.DataFrame
        ``fare_amount`` and the four coordinate columns as float32,
        ``passenger_count`` as uint8 and ``pickup_datetime`` parsed to a
        timezone-aware (UTC) timestamp with minute resolution.
    """
    float_columns = ['fare_amount', 'pickup_longitude', 'pickup_latitude',
                     'dropoff_longitude', 'dropoff_latitude']
    schema = {name: 'float32' for name in float_columns}
    schema['pickup_datetime'] = 'str'
    schema['passenger_count'] = 'uint8'

    X_full = pd.read_csv('train.csv', header=0, usecols=list(schema), dtype=schema)

    # Trim to 'YYYY-MM-DD HH:MM' before parsing with a fixed format.
    minute_stamps = X_full['pickup_datetime'].str.slice(0, 16)
    X_full['pickup_datetime'] = pd.to_datetime(minute_stamps, format='%Y-%m-%d %H:%M', utc=True)

    return X_full


In [12]:
def nyc_mapping():
    """Build the polygons used to label pickup/dropoff locations.

    All vertices are given as ``[latitude, longitude]``, so in shapely terms
    x is the latitude and y the longitude. Containment checks against these
    polygons must therefore use ``Point(lat, lon)``.

    Returns
    -------
    list of shapely.geometry.Polygon
        ``[Manhattan_polygon, JFK_polygon, Newark_polygon]`` in that order.
    """
    # lat / lon vertices of Manhattan_polygon.
    # v0 is the southernmost vertex; it has to be part of the ring, otherwise
    # the outline closes directly from v12 to v1 and the area south of that
    # line is not covered.
    manhattan_vertices = [
        [40.697509, -74.011890],  # v0
        [40.701999, -74.023220],  # v1
        [40.756686, -74.013898],  # v2
        [40.828630, -73.962295],  # v3
        [40.881317, -73.934646],  # v4
        [40.872590, -73.909950],  # v5
        [40.835231, -73.933841],  # v6
        [40.809128, -73.933438],  # v7
        [40.800289, -73.927667],  # v8
        [40.774376, -73.939880],  # v9
        [40.738207, -73.967730],  # v10
        [40.708912, -73.974576],  # v11
        [40.704944, -73.998735],  # v12
    ]

    # lat / lon vertices of JFK Airport
    jfk_vertices = [
        [40.622718, -73.770487],
        [40.648282, -73.829434],
        [40.666631, -73.833526],
        [40.669141, -73.801063],
        [40.668668, -73.781136],
        [40.640110, -73.736205],
    ]

    # lat / lon vertices of Newark airport
    newark_vertices = [
        [40.663831, -74.179334],
        [40.680684, -74.195557],
        [40.691124, -74.198349],
        [40.712151, -74.181308],
        [40.709275, -74.148121],
        [40.687747, -74.159393],
    ]

    Manhattan_polygon = Polygon(manhattan_vertices)
    JFK_polygon = Polygon(jfk_vertices)
    Newark_polygon = Polygon(newark_vertices)

    return [Manhattan_polygon, JFK_polygon, Newark_polygon]
    

In [13]:
def loc_mapping(lat, lon):
    """Label a coordinate with the named area that contains it.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location.

    Returns
    -------
    str
        ``'Manhattan'``, ``'JFK'`` or ``'Newark'`` (checked in that order),
        or ``'unknown'`` when the point lies in none of the polygons.
    """
    # Build the polygons and the point once per lookup instead of once per
    # test; this function is applied row by row to very large frames.
    manhattan, jfk, newark = nyc_mapping()
    # The polygons are defined in (lat, lon) order, so the point is as well.
    point = Point(lat, lon)

    for name, polygon in (('Manhattan', manhattan), ('JFK', jfk), ('Newark', newark)):
        if polygon.contains(point):
            return name

    return 'unknown'
    

In [None]:
_COORDINATE_COLS = ['pickup_longitude', 'pickup_latitude',
                    'dropoff_longitude', 'dropoff_latitude']


def _round_coordinates(df):
    """Cast the four coordinate columns to float64 rounded to 4 decimals (in place)."""
    for col in _COORDINATE_COLS:
        df[col] = df[col].astype(float).round(4)
    return df


def _drop_rows_outside_bounding_box(df):
    """Remove (in place) rows with missing coordinates or outside lat 35..45 / lon -76..-70."""
    inside = (df['pickup_latitude'].between(35, 45)
              & df['dropoff_latitude'].between(35, 45)
              & df['pickup_longitude'].between(-76, -70)
              & df['dropoff_longitude'].between(-76, -70))
    # ``between`` is inclusive and evaluates to False for NaN, so rows with a
    # missing coordinate are removed by the same mask.
    df.drop(index=df.index[~inside], inplace=True)
    return df


def _add_trip_features(df):
    """Add distance, area and calendar features to ``df`` (in place).

    Expects ``pickup_datetime`` to be a datetime column and
    ``pandarallel.initialize()`` to have been called already, because the
    row-wise features use ``DataFrame.parallel_apply``.
    """
    # |d_lat| + |d_lon| is the cityblock distance of the two (lat, lon)
    # pairs; computed column-wise instead of row by row.
    df['cityblock_dist'] = ((df['pickup_latitude'] - df['dropoff_latitude']).abs()
                            + (df['pickup_longitude'] - df['dropoff_longitude']).abs())
    df['dist'] = df.parallel_apply(
        lambda x: distance.distance((x.pickup_latitude, x.pickup_longitude),
                                    (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
    df['Pickup_area'] = df.parallel_apply(
        lambda x: loc_mapping(x.pickup_latitude, x.pickup_longitude), axis=1)
    df['Dropoff_area'] = df.parallel_apply(
        lambda x: loc_mapping(x.dropoff_latitude, x.dropoff_longitude), axis=1)
    df['weekday'] = df['pickup_datetime'].dt.dayofweek
    df['hour'] = df['pickup_datetime'].dt.hour
    df['year'] = df['pickup_datetime'].dt.year
    return df


def taxi_fare():
    """Train an XGBoost fare model and write predictions for the test set.

    Steps:
      1. Load the test set and the sampled training set.
      2. Apply the same coordinate rounding and feature engineering to both;
         rows with implausible or missing coordinates are removed from the
         training set only (every test row needs a prediction).
      3. Fit a preprocessing + XGBRegressor pipeline on 80% of the training
         rows, using the remaining 20% as the early-stopping evaluation set.
      4. Print the MSE on that 20% hold-out and write ``key, fare_amount``
         for the test set to the file ``answern3_new``.

    Note: the hold-out drives early stopping AND is used for the printed MSE,
    so that number is an optimistic estimate.

    Returns
    -------
    None
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from xgboost import XGBRegressor
    from pandarallel import pandarallel

    pandarallel.initialize()

    # ----- test set -------------------------------------------------------
    X_test_full = taxi_data_testset()
    test_keys = X_test_full['key'].copy()  # needed again for the submission file

    X_test_full['pickup_datetime'] = X_test_full['pickup_datetime'].str.slice(0, 16)
    X_test_full['pickup_datetime'] = pd.to_datetime(X_test_full['pickup_datetime'], utc=True,
                                                    format='%Y-%m-%d %H:%M')
    # Same rounding as the training data so both frames carry identically
    # prepared features.
    _round_coordinates(X_test_full)
    _add_trip_features(X_test_full)

    # ----- training set ---------------------------------------------------
    #X = taxi_data_full()
    X = taxi_data_light()
    _round_coordinates(X)
    # drop rows with deviating values and NaN
    _drop_rows_outside_bounding_box(X)
    _add_trip_features(X)

    print(X)

    cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
    print('Missing value columns:', cols_with_missing)

    y = X['fare_amount']

    # ----- preprocessing --------------------------------------------------
    numerical_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                      'dropoff_latitude', 'dist', 'weekday', 'hour', 'year']
    categorical_cols = ['Pickup_area', 'Dropoff_area']

    # Preprocessing for numerical data
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Keep selected columns only
    my_cols = categorical_cols + numerical_cols
    X_red = X[my_cols].copy()
    X_test_kaggle = X_test_full[my_cols].copy()

    X_train, X_valid, y_train, y_valid = train_test_split(X_red, y, test_size=0.2, random_state=0)

    # ----- model ----------------------------------------------------------
    # NOTE(review): `early_stopping_rounds` as a constructor argument is the
    # xgboost >= 1.6 API - confirm against the installed version.
    model = XGBRegressor(n_estimators=200, learning_rate=0.1, n_jobs=4,
                         objective="reg:squarederror", eval_metric='rmse',
                         early_stopping_rounds=10, predictor='cpu_predictor')

    # Bundle preprocessing and modeling code in a pipeline
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    # `eval_set` is an argument of XGBRegressor.fit, not of the constructor,
    # and the model only ever sees preprocessed features. The validation
    # frame is therefore transformed with a preprocessor fitted on the
    # training split and routed to the model step via the `model__` prefix.
    preprocessor.fit(X_train)
    eval_set = [(preprocessor.transform(X_valid), y_valid)]

    # Preprocessing of training data, fit model
    my_pipeline.fit(X_train, y_train, model__eval_set=eval_set, model__verbose=False)

    y_pred = my_pipeline.predict(X_valid)
    print('MSE:', mean_squared_error(y_valid, y_pred))

    # ----- submission -----------------------------------------------------
    test_pred = my_pipeline.predict(X_test_kaggle)

    answer = pd.DataFrame({'key': test_keys, 'fare_amount': test_pred})
    answer.to_csv('answern3_new', index=False)
    return
taxi_fare()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18474618 entries, 0 to 18474617
Data columns (total 7 columns):
fare_amount          float32
pickup_datetime      datetime64[ns, UTC]
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      uint8
dtypes: datetime64[ns, UTC](1), float32(5), uint8(1)
memory usage: 510.9 MB
None
          fare_amount           pickup_datetime  pickup_longitude  \
0                 5.7 2011-08-18 00:35:00+00:00          -73.9827   
1                12.1 2011-01-06 09:50:00+00:00          -74.0010   
2                 9.0 2012-12-03 13:10:00+00:00          -74.0065   
4                 7.7 2011-04-05 17:11:00+00:00          -74.0018   
5                 5.3 2009-07-22 16:08:00+00:00          -73.9811   
...      

  if getattr(data, 'base', None) is not None and \




In [None]:
# Sanity check of the feature engineering on two hand-picked trips.

# Imported explicitly: `import scipy as sp` alone does not guarantee that the
# `sp.spatial` submodule has been loaded.
from scipy.spatial.distance import cityblock

Xa = pd.DataFrame(np.array([[-73.9885, 40.7584, -73.9838, 40.7301], [-73.9918, 40.7262, -73.9877, 40.7392]]),
                  columns=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])
print(Xa)
# info() prints its own report and returns None, so it is not wrapped in print().
Xa.info()

# Single-pair checks: geodesic distance and cityblock distance of the first trip.
Xadist = distance.distance((40.7584, -73.9885), (40.7301, -73.9838))
print(Xadist)

cityblock_dist = cityblock((40.7584, -73.9885), (40.7301, -73.9838))
print(cityblock_dist)

# The same features computed row-wise, the way the training code does it.
Xa['dist'] = Xa.apply(lambda x: distance.distance((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)).km, axis=1)
Xa['cityblock_dist'] = Xa.apply(lambda x: cityblock((x.pickup_latitude, x.pickup_longitude), (x.dropoff_latitude, x.dropoff_longitude)), axis=1)

Xa['Pickup_area'] = Xa.apply(lambda x: loc_mapping(x.pickup_latitude, x.pickup_longitude), axis=1)
Xa['Dropoff_area'] = Xa.apply(lambda x: loc_mapping(x.dropoff_latitude, x.dropoff_longitude), axis=1)

print(Xa)


In [None]:
# List the scorer names known to scikit-learn.
# `import sklearn.metrics` (rather than a bare `import sklearn`) makes sure
# the submodule is actually loaded.
import sklearn.metrics

# `sklearn.metrics.SCORERS` was deprecated and later removed from
# scikit-learn; `get_scorer_names()` is its replacement. Fall back to the old
# registry only when the new function is not available.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

In [None]:
!pip install pandarallel