In [1]:
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

In [2]:
from sqlalchemy import create_engine, text

In [3]:
database_name = 'Scooters'

connection_string = f"postgresql://postgres:postgres@localhost:5432/Scooters"

In [4]:
engine = create_engine(connection_string)

# trips table
### EDA 
#### row count

In [5]:
query = '''
SELECT COUNT(*)
FROM trips;
'''


In [6]:
with engine.connect() as connection:    
    trips_count = pd.read_sql(text(query), con = connection)
trips_count

Unnamed: 0,count
0,565522


# create_dt
### EDA 
Create_dt ranges from 2019-05-02 to 2019-08-02.
Timestamps start two days after the end of a month period.

#### Q. Is create_dt needed for analysis of neg/pos/0 value duration and distance patterns?
Should it be dropped from cleaned table?

In [7]:
query = '''
SELECT create_dt
FROM trips;
'''

In [8]:
#for EDA on create_dt - what does this column show related to other timestamps?
with engine.connect() as connection:    
    create_dt = pd.read_sql(text(query), con = connection)
create_dt

Unnamed: 0,create_dt
0,2019-07-07 05:32:06.343
1,2019-07-07 05:32:06.590
2,2019-07-07 05:32:06.223
3,2019-07-07 05:32:06.557
4,2019-07-07 09:52:19.980
...,...
565517,2019-07-07 10:42:07.690
565518,2019-07-07 05:32:06.253
565519,2019-07-07 05:32:05.957
565520,2019-07-07 05:32:06.313


# CU3 - EDA
### tripduration
NOTE: Under the 24-hour clock system, the day begins at midnight, 00:00, and the last minute of the day begins at 23:59 and ends at 24:00, which is identical to 00:00 of the following day. 12:00 can only be mid-day. Midnight is called 24:00 and is used to mean the end of the day and 00:00 is used to mean the beginning of the day. https://simple.wikipedia.org/wiki/24-hour_clock

#### tripduration - MAX: 512619.0, MIN: -19.358267

#### Review neg values, 0 values, nulls, relative to other time-related columns to see if there is a pattern
Relates to: Q2, Are scooter companies in compliance? 
Are we able to determine what might be staff servicing and test trips?

#### Summary of negative value EDA on tripduration
There are 8 total negative tripdurations, all with positive trip distance.  They occured on just two dates: 2019-06-21 and 2019-07-18. Did not find any meaningful events happening on those dates in Nashville.  All trips were less than a mile (4540 ft = .86 mile), and the majority of them were initiatiated late at night or just after midnight.  Start date/times and end times do not seem correct on all of them related to trip duration, some are going backwards.z

Q. Are these 8 negative values outliers due to some kind of system error, or refunds, as Dani suggested?

Q. Should our compiled table have negative values removed on trip distance?

In [9]:
#pull date/time columns for analysis with a negative tripduration
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripduration < 0;
'''

In [10]:
with engine.connect() as connection:    
    neg_duration = pd.read_sql(text(query), con = connection)

In [11]:
neg_duration

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,-0.715917,2214.567,2019-07-18,23:50:34.650000,2019-07-18,23:49:51.693333,2019-07-19 10:49:30.810
1,-10.242417,52.49344,2019-07-18,23:59:35.683333,2019-07-18,23:49:21.136666,2019-07-20 10:52:39.020
2,-0.501317,3799.21272,2019-07-18,23:49:45.476666,2019-07-18,23:49:15.396666,2019-07-20 10:52:39.223
3,-4.618833,3061.02372,2019-07-18,23:53:53.926666,2019-07-18,23:49:16.796666,2019-07-20 10:52:39.343
4,-10.9751,3641.7324,2019-07-19,00:00:24.016666,2019-07-18,23:49:25.513333,2019-07-20 10:52:39.657
5,-19.358267,4540.68256,2019-07-19,00:09:04.506666,2019-07-18,23:49:43.013333,2019-07-20 10:52:39.737
6,-8.003717,3484.25208,2019-06-21,21:32:09.170000,2019-06-21,21:24:08.946666,2019-06-22 08:31:51.090
7,-1.359867,3166.0106,2019-06-21,22:23:01.316666,2019-06-21,22:21:39.726666,2019-06-22 08:31:56.090


In [12]:
#count negative tripduration entries
query = '''
SELECT COUNT(tripduration)
FROM trips
WHERE tripduration < 0;
'''

In [13]:
with engine.connect() as connection:    
    count_negative_td = pd.read_sql(text(query), con = connection)
count_negative_td

Unnamed: 0,count
0,8


#### Summary of nulls EDA on tripduration
There are no null values in tripduration.

In [14]:
#count null tripdurations
query = '''
SELECT COUNT(tripduration)
FROM trips
WHERE tripduration IS NULL;
'''

In [15]:
with engine.connect() as connection:    
    count_nulls = pd.read_sql(text(query), con = connection)
count_nulls

Unnamed: 0,count
0,0


#### Summary of 0.00 values in  tripduration
There are 4624 entries with 0.00 in tripduration.

Q. Why would there be zero values in trip duration?  Could these be due to servicing of the scooters? 

In [16]:
#count zero tripdurations
query = '''
SELECT COUNT(tripduration)
FROM trips
WHERE tripduration = 0;
'''

In [17]:
with engine.connect() as connection:    
    count_zero = pd.read_sql(text(query), con = connection)
count_zero

Unnamed: 0,count
0,4624


In [18]:
#pull zero tripdurations
query = '''
SELECT tripduration
FROM trips
WHERE tripduration = 0;
'''

In [19]:
with engine.connect() as connection:    
    pull_zero = pd.read_sql(text(query), con = connection)
pull_zero

Unnamed: 0,tripduration
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
4619,0.0
4620,0.0
4621,0.0
4622,0.0


#### Summary of positive values in  tripduration
There are 560890 entries with positive values in tripduration.

In [20]:
#count tripdurations with positive numbers
query = '''
SELECT COUNT(tripduration)
FROM trips
WHERE tripduration > 0;
'''

In [21]:
with engine.connect() as connection:    
    zero_distance = pd.read_sql(text(query), con = connection)
zero_distance

Unnamed: 0,count
0,560890


#### Summary of positive values in  tripduration
There are 560890 entries with positive values in tripduration.

# CU3 - EDA
### tripdistance

#### tripdistance - MAX: 3.188448e+07, MIN: -20324803.8 
Q. What does the "e+07" etc. indicate in tripduration?   
A. Tripdistance is formatted in scientific notation

RE: Scientific notation: means any number expressed in the power of 10.for example- 340 can be written in scientific notation as 3.4 X102.in pythons, we use str.format() on a number with “{:e}” to format the number to scientific notation. str.format() formats the number as a float, followed by “e+” and the appropriate power of 10. For example- 340 will be displayed as 3.4e+2 https://www.geeksforgeeks.org/display-scientific-notation-as-float-in-python/

#### Summary of negative value EDA on tripduration
There are 32 negative values in tripdistance.  These indicate failure to comply with regulations to only include trips greater than one minute.

They consistently fall within the 1st 5, middle 3, or last 5 days of the month, which could be related to regular staff servicing or testing.  The starttimes mostly fall early in the morning before morning rush hour, or after the evening rush, which again, may indicate staff servicing or testing times. Trip duration are all evenly rounded, which may indicate a tag for tracking staff service or testing time.

There 4305 entries with 0.0 tripdistance and 0.0 tripduration.  We could try grouping these by startdate and starttime to look at any patterns to see if these also may be related to staff service, testing times, or system errors.

There seem to be issues sorting on scientific notation both through SQL code and with Python.

In [46]:
#count tripdistance negative values
query = '''
SELECT COUNT(tripdistance) AS neg_value_td
FROM trips
WHERE tripdistance < 0;
'''

In [47]:
with engine.connect() as connection:    
    cnt_neg_distance = pd.read_sql(text(query), con = connection)
cnt_neg_distance

Unnamed: 0,neg_value_td
0,32


In [24]:
#pull date/time columns for analysis with a negative tripdistance, sorted by startdate
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripdistance < 0
ORDER BY startdate;
'''

In [25]:
with engine.connect() as connection:    
    pull_neg_distance = pd.read_sql(text(query), con = connection)
pull_neg_distance

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,3.0,-1119964.0,2019-05-03,03:36:18.613333,2019-05-03,03:39:28.660000,2019-05-04 05:30:38.683
1,2.0,-1685315.0,2019-05-04,20:40:10.926666,2019-05-04,20:42:18.600000,2019-05-05 05:31:48.947
2,3.0,-1684701.0,2019-05-04,03:23:54.140000,2019-05-04,03:26:29.310000,2019-05-05 05:30:42.633
3,2.0,-1684806.0,2019-05-04,17:44:47.986666,2019-05-04,17:46:26.800000,2019-05-05 05:31:18.497
4,1.0,-1684971.0,2019-05-04,18:20:53.513333,2019-05-04,18:21:46.796666,2019-05-05 05:31:23.903
5,60.0,-2129.265,2019-05-05,15:51:56.323333,2019-05-05,16:51:37.006666,2019-05-06 05:31:09.407
6,3.0,-62969.16,2019-05-14,18:25:49.013333,2019-05-14,18:28:26.620000,2019-05-15 05:31:25.173
7,20.0,-3280.84,2019-05-14,19:42:58.320000,2019-05-14,20:03:13.833333,2019-05-15 05:31:36.630
8,22.0,-3608.924,2019-05-14,19:40:56.146666,2019-05-14,20:03:06.506666,2019-05-15 05:31:36.690
9,3.0,-2624.672,2019-05-14,19:34:54.086666,2019-05-14,19:38:12.580000,2019-05-15 05:31:34.027


In [26]:
#pull date/time columns for analysis with a negative tripdistance, sorted by starttime
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripdistance < 0
ORDER BY starttime;
'''

In [27]:
with engine.connect() as connection:    
    pull_neg_distance = pd.read_sql(text(query), con = connection)
pull_neg_distance

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,13.0,-328.084,2019-06-15,01:14:37.736666,2019-06-15,01:27:57.336666,2019-06-16 05:30:47.750
1,25.0,-9337271.0,2019-07-26,01:55:59.876666,2019-07-26,02:20:39.063333,2019-07-27 05:30:28.807
2,11.0,-328.084,2019-06-15,01:56:10.866666,2019-06-15,02:07:37.036666,2019-06-16 05:30:53.193
3,11.0,-5905.512,2019-06-15,01:56:51.610000,2019-06-15,02:07:32.220000,2019-06-16 05:30:53.127
4,13.0,-1968.504,2019-06-15,01:58:31.310000,2019-06-15,02:11:26.276666,2019-06-16 05:30:53.957
5,11.0,-11811.02,2019-06-15,02:11:03.360000,2019-06-15,02:21:35.666666,2019-06-16 05:30:55.063
6,6.0,-328.084,2019-06-15,02:20:26.330000,2019-06-15,02:26:39.170000,2019-06-16 05:30:55.537
7,2.0,-656.168,2019-06-15,02:24:56.456666,2019-06-15,02:27:05.510000,2019-06-16 05:30:55.443
8,3.0,-1684701.0,2019-05-04,03:23:54.140000,2019-05-04,03:26:29.310000,2019-05-05 05:30:42.633
9,3.0,-1119964.0,2019-05-03,03:36:18.613333,2019-05-03,03:39:28.660000,2019-05-04 05:30:38.683


In [28]:
#pull date/time columns for analysis with a negative tripdistance, sorted by tripduration
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripdistance < 0
ORDER BY tripduration;
'''

In [29]:
with engine.connect() as connection:    
    pull_neg_distance = pd.read_sql(text(query), con = connection)
pull_neg_distance

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,1.0,-1684971.0,2019-05-04,18:20:53.513333,2019-05-04,18:21:46.796666,2019-05-05 05:31:23.903
1,2.0,-1684806.0,2019-05-04,17:44:47.986666,2019-05-04,17:46:26.800000,2019-05-05 05:31:18.497
2,2.0,-178773.0,2019-05-18,22:21:10.570000,2019-05-18,22:22:52.456666,2019-05-19 05:32:05.863
3,2.0,-656.168,2019-06-15,02:24:56.456666,2019-06-15,02:27:05.510000,2019-06-16 05:30:55.443
4,2.0,-2253937.0,2019-05-25,22:17:09.306666,2019-05-25,22:19:32.360000,2019-05-26 05:32:32.830
5,2.0,-1685315.0,2019-05-04,20:40:10.926666,2019-05-04,20:42:18.600000,2019-05-05 05:31:48.947
6,3.0,-984.252,2019-05-14,21:11:02.400000,2019-05-14,21:13:41.736666,2019-05-15 05:31:43.310
7,3.0,-62969.16,2019-05-14,18:25:49.013333,2019-05-14,18:28:26.620000,2019-05-15 05:31:25.173
8,3.0,-2624.672,2019-05-14,19:34:54.086666,2019-05-14,19:38:12.580000,2019-05-15 05:31:34.027
9,3.0,-1119964.0,2019-05-03,03:36:18.613333,2019-05-03,03:39:28.660000,2019-05-04 05:30:38.683


In [30]:
#pull date/time columns for analysis with a negative tripdistance, sorted by tripdistance in SQL
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripdistance < 0
ORDER BY tripdistance;
'''

In [31]:
with engine.connect() as connection:    
    pull_neg_distance = pd.read_sql(text(query), con = connection)
pull_neg_distance.head(10)

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,44.0,-20324800.0,2019-07-27,23:14:23.683333,2019-07-27,23:58:23.820000,2019-07-28 05:32:05.610
1,9.0,-19900920.0,2019-07-25,16:31:14.886666,2019-07-25,16:40:29.046666,2019-07-26 05:31:03.980
2,25.0,-9337271.0,2019-07-26,01:55:59.876666,2019-07-26,02:20:39.063333,2019-07-27 05:30:28.807
3,5.0,-2758530.0,2019-05-17,21:01:20.166666,2019-05-17,21:06:33.026666,2019-05-18 05:31:26.960
4,2.0,-2253937.0,2019-05-25,22:17:09.306666,2019-05-25,22:19:32.360000,2019-05-26 05:32:32.830
5,2.0,-1685315.0,2019-05-04,20:40:10.926666,2019-05-04,20:42:18.600000,2019-05-05 05:31:48.947
6,1.0,-1684971.0,2019-05-04,18:20:53.513333,2019-05-04,18:21:46.796666,2019-05-05 05:31:23.903
7,2.0,-1684806.0,2019-05-04,17:44:47.986666,2019-05-04,17:46:26.800000,2019-05-05 05:31:18.497
8,3.0,-1684701.0,2019-05-04,03:23:54.140000,2019-05-04,03:26:29.310000,2019-05-05 05:30:42.633
9,3.0,-1119964.0,2019-05-03,03:36:18.613333,2019-05-03,03:39:28.660000,2019-05-04 05:30:38.683


In [32]:
#pull zero tripdurations relative to other time and datestamps, sorted by tripdistance in Python
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripdistance < 0;
'''

In [33]:
with engine.connect() as connection:    
    pull_neg_distance = pd.read_sql(text(query), con = connection)
pull_neg_distance.sort_values(by = 'tripdistance').head(10)

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
2,44.0,-20324800.0,2019-07-27,23:14:23.683333,2019-07-27,23:58:23.820000,2019-07-28 05:32:05.610
0,9.0,-19900920.0,2019-07-25,16:31:14.886666,2019-07-25,16:40:29.046666,2019-07-26 05:31:03.980
1,25.0,-9337271.0,2019-07-26,01:55:59.876666,2019-07-26,02:20:39.063333,2019-07-27 05:30:28.807
26,5.0,-2758530.0,2019-05-17,21:01:20.166666,2019-05-17,21:06:33.026666,2019-05-18 05:31:26.960
17,2.0,-2253937.0,2019-05-25,22:17:09.306666,2019-05-25,22:19:32.360000,2019-05-26 05:32:32.830
8,2.0,-1685315.0,2019-05-04,20:40:10.926666,2019-05-04,20:42:18.600000,2019-05-05 05:31:48.947
7,1.0,-1684971.0,2019-05-04,18:20:53.513333,2019-05-04,18:21:46.796666,2019-05-05 05:31:23.903
6,2.0,-1684806.0,2019-05-04,17:44:47.986666,2019-05-04,17:46:26.800000,2019-05-05 05:31:18.497
5,3.0,-1684701.0,2019-05-04,03:23:54.140000,2019-05-04,03:26:29.310000,2019-05-05 05:30:42.633
4,3.0,-1119964.0,2019-05-03,03:36:18.613333,2019-05-03,03:39:28.660000,2019-05-04 05:30:38.683


In [34]:
#pull zero tripdurations relative to zero tripdistance
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripduration = 0.0 AND tripdistance = 0;
'''

In [35]:
with engine.connect() as connection:    
    zero_td_df = pd.read_sql(text(query), con = connection)
zero_td_df.sort_values(by = ['startdate' , 'starttime'])

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
1044,0.0,0.0,2019-05-01,00:26:58.376666,2019-05-01,00:27:07.540000,2019-05-02 05:30:26.497
1048,0.0,0.0,2019-05-01,01:11:40.936666,2019-05-01,01:11:52.916666,2019-05-02 05:30:32.827
1043,0.0,0.0,2019-05-01,01:17:28.820000,2019-05-01,01:17:45.083333,2019-05-02 05:30:33.110
1049,0.0,0.0,2019-05-01,02:27:12.186666,2019-05-01,02:27:21.420000,2019-05-02 05:30:40.340
1051,0.0,0.0,2019-05-01,12:46:28.766666,2019-05-01,12:46:39.456666,2019-05-02 05:30:57.173
...,...,...,...,...,...,...,...
1037,0.0,0.0,2019-07-31,23:10:01.343333,2019-07-31,23:10:13.460000,2019-08-01 05:31:20.890
1038,0.0,0.0,2019-07-31,23:10:14.190000,2019-07-31,23:10:30.180000,2019-08-01 05:31:20.860
1032,0.0,0.0,2019-07-31,23:31:15.916666,2019-07-31,23:31:26.800000,2019-08-01 05:31:21.993
1039,0.0,0.0,2019-07-31,23:35:27.006666,2019-07-31,23:35:48.646666,2019-08-01 05:31:22.200


# EDA

### tripduration

#### FOR Q2: remove all trips less than one minute and greater than 24 hours to create a compliant trips table
There are a total of 9154 entries out of compliance with less than 1 minute
There are 6938 rows with trip durations longer than 24 hours  (60*24 = 1440 minutes)

Q. Do we remove all rows that go beyond 24 hours, or do we need to create a recalculated trip distance column with tripduration cappped at 24 hrs?

In [36]:
#for Q2 non-compliance tripduration less than 1 min
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripduration < 1;
'''

In [37]:
with engine.connect() as connection:    
    non_compliant_td_under = pd.read_sql(text(query), con = connection)
non_compliant_td_under

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,0.000000,0.0,2019-07-06,21:30:45.690000,2019-07-06,21:30:55.320000,2019-07-07 05:32:06.680
1,0.183617,0.0,2019-07-06,21:28:25.900000,2019-07-06,21:28:36.916666,2019-07-07 09:52:20.070
2,0.000000,0.0,2019-07-06,21:30:46.720000,2019-07-06,21:31:01.406666,2019-07-07 05:32:06.650
3,0.000000,0.0,2019-07-06,21:29:47.076666,2019-07-06,21:29:56.113333,2019-07-07 05:32:06.710
4,0.280800,0.0,2019-07-06,21:29:36.473333,2019-07-06,21:29:53.323333,2019-07-07 09:52:20.353
...,...,...,...,...,...,...,...
9149,0.000000,0.0,2019-07-06,20:54:45.846666,2019-07-06,20:54:59.346666,2019-07-07 05:31:59.940
9150,0.000000,0.0,2019-07-06,21:04:49.316666,2019-07-06,21:04:59.233333,2019-07-07 05:32:01.673
9151,0.249500,0.0,2019-07-06,21:08:36.233333,2019-07-06,21:08:51.203333,2019-07-07 09:52:18.383
9152,0.000000,0.0,2019-07-06,21:10:10.203333,2019-07-06,21:10:20.500000,2019-07-07 05:32:02.923


In [38]:
#for Q2 non-compliance tripduration greater than 24 hrs
query = '''
SELECT tripduration, tripdistance, startdate, starttime, enddate, endtime, create_dt 
FROM trips
WHERE tripduration > 1440.00;
'''

In [39]:
with engine.connect() as connection:    
    non_compliant_td_over = pd.read_sql(text(query), con = connection)
non_compliant_td_over

Unnamed: 0,tripduration,tripdistance,startdate,starttime,enddate,endtime,create_dt
0,3043.0,18110.24,2019-07-06,20:53:40,2019-07-06,21:44:23,2019-07-07 06:21:18.407
1,9246.0,21830.71,2019-07-06,19:05:41,2019-07-06,21:39:47,2019-07-07 06:21:18.377
2,1972.0,9045.28,2019-07-06,21:14:04,2019-07-06,21:46:56,2019-07-07 06:21:18.590
3,1605.0,5685.70,2019-07-06,21:19:15,2019-07-06,21:46:00,2019-07-07 06:21:18.530
4,3177.0,17260.50,2019-07-06,20:51:39,2019-07-06,21:44:36,2019-07-07 06:21:18.437
...,...,...,...,...,...,...,...
6933,4004.0,38316.93,2019-07-06,19:53:44,2019-07-06,21:00:28,2019-07-07 06:21:17.733
6934,4028.0,38234.91,2019-07-06,19:52:54,2019-07-06,21:00:02,2019-07-07 06:21:17.703
6935,2195.0,15229.66,2019-07-06,20:35:37,2019-07-06,21:12:12,2019-07-07 06:21:17.893
6936,2474.0,16017.06,2019-07-06,20:33:39,2019-07-06,21:14:53,2019-07-07 06:21:17.953


## For Q2 - Compliant Trips

### removed 2b. trips below one minute
### removed 2c. lengths capped at 24 hrs (if we go with removing all)
### create_dt or pubtimestamp dropped

NOTE: We might need geomtery on start/long/lat and end/long/lat for Q4

In [44]:
#for trips_compliant, trips > 1, trips < 1400, no create_dt, no pubdatetime 
query = '''
SELECT *
FROM    (SELECT companyname,  
        triprecordnum, 
        sumdid,
        tripduration,
        tripdistance,
        startdate,
        starttime,
        enddate,
        endtime,
        startlatitude,
        startlongitude,
        endlatitude,
        endlongitude,
        triproute
        FROM trips
        WHERE tripduration < 1440.00) AS compliant_under
WHERE tripduration > 1.00;
'''

In [45]:
with engine.connect() as connection:    
    trips_compliant = pd.read_sql(text(query), con = connection)
trips_compliant

Unnamed: 0,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute
0,Bird,BRD5716,PoweredCZP2N,12.000000,0.000000,2019-07-06,21:13:38.430000,2019-07-06,21:25:40.516666,36.162700,-86.775800,36.162200,-86.774500,"[(36.162757, -86.775783), (36.162845, -86.7757..."
1,Bird,BRD5724,PoweredR4SI9,27.000000,3937.008000,2019-07-06,20:58:54.313333,2019-07-06,21:25:36.560000,36.160500,-86.778400,36.164000,-86.796700,"[(36.160533, -86.77833), (36.160399, -86.77825..."
2,Bird,BRD5712,PoweredCS92L,11.000000,984.252000,2019-07-06,21:15:14.356666,2019-07-06,21:25:59.736666,36.151100,-86.796600,36.153100,-86.789800,"[(36.151158, -86.796506), (36.151081, -86.7964..."
3,Bird,BRD5723,Powered9VWF8,25.000000,3608.924000,2019-07-06,20:59:30.983333,2019-07-06,21:24:35.270000,36.164600,-86.776100,36.165200,-86.777500,"[(36.164699, -86.775999), (36.164743, -86.7758..."
4,Lyft,LFT1122,Powered305599,5.328917,1794.619480,2019-07-06,21:21:49.033333,2019-07-06,21:27:08.766666,36.159970,-86.772820,36.161930,-86.774820,"[(36.15997, -86.77282), (36.16001, -86.77291),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542003,SPIN,SPI763,Powered2298346,12.000000,1223.753320,2019-07-06,21:11:34,2019-07-06,21:22:46,36.164656,-86.795682,36.163811,-86.791664,"[(36.163811064535764, -86.79166419164731), (36..."
542004,SPIN,SPI777,Powered7701179,5.000000,0.000000,2019-07-06,21:17:48,2019-07-06,21:21:58,36.161326,-86.776441,36.161326,-86.776441,"[(36.161325562468285, -86.77644115198285), (36..."
542005,Bird,BRD5713,PoweredNQ6SQ,11.000000,4593.176000,2019-07-06,21:14:41.263333,2019-07-06,21:25:54.850000,36.151100,-86.796500,36.153500,-86.788900,"[(36.151269, -86.796605), (36.150929, -86.7966..."
542006,Bird,BRD5715,PoweredK8E52,12.000000,5905.512000,2019-07-06,21:14:04.600000,2019-07-06,21:25:59.086666,36.162700,-86.775800,36.164700,-86.776800,"[(36.162874, -86.775767), (36.162905, -86.7758..."


In [48]:
trips_compliant.describe()

Unnamed: 0,tripduration,tripdistance,startlatitude,startlongitude,endlatitude,endlongitude
count,542008.0,542008.0,542008.0,542008.0,542008.0,542008.0
mean,33.425174,4792.318,36.155779,-86.784502,36.156014,-86.784574
std,122.901099,76829.92,0.01253,0.01388,0.120634,0.425965
min,1.000033,-20324800.0,35.8532,-86.918008,-36.850405,-122.673729
25%,5.492508,527.304,36.150321,-86.792,36.150577,-86.791978
50%,10.466667,2690.289,36.15691,-86.781833,36.15745,-86.78157
75%,20.016667,6430.45,36.162886,-86.77644,36.162863,-86.776348
max,1439.0,31884480.0,36.300029,-86.3662,51.045409,174.764886


Triproute = GPS coordinates for entire trip duration at min collection frequency of one per 30 sec

A scatterplot might be helpful to see any trends on starttime & endtime for this chart, but seaborn not in this environment.

The trip distance column appears to be in scientific notation.
 The scientific notation means any number expressed in the power of 10.for example- 340 can be written in scientific notation as 3.4 X102.in pythons, we use str.format() on a number with “{:e}” to format the number to scientific notation. str.format() formats the number as a float, followed by “e+” and the appropriate power of 10. For example- 340 will be displayed as 3.4e+2
https://www.geeksforgeeks.org/display-scientific-notation-as-float-in-python/

To display reverse of scientific numbers to float

We have to pass a variable holding the scientific format of a number, as follows:

x = 3.234e+4
 
print("{:f}".format(x))  # f represents float
Output:

32340.000000

https://stackoverflow.com/questions/658763/how-to-suppress-scientific-notation-when-printing-float-values
https://stackoverflow.com/questions/67879685/python-decimal-decimal-producing-result-in-scientific-notation
The numpy module offers np.format_float_positional()

df1[df1<0].count()

pubdatetime, latitude, longitude - format; create point column
sumdtype - powered; standard
chargelevel - there's a 0 and NaN (same or separate?)
sumdgroup - bicycle, scooter, Scooter
costpermin - 0, 5, 6, 10, 15, 23, 30 cents
scooters.companyname - Bird, Bolt, Gotcha, Jump, Lime, Lyft, Spin
trips.companyname - Bird, Bolt Mobility, Gotcha, JUMP, Lime, Lyft, SPIN (match both name lists)
tripduration - MAX: 512619.0, MIN: -19.358267 (pull all neg numbers in relation to something else)
tripdistance - MAX: 3.188448e+07, MIN: -20324803.8 (what does neg mean?)
startdate - format
starttime - meaning of zero
enddate - format