In [None]:
import datetime as dt
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

#importing sqlalchemy to utilize SQL queries in python

In [None]:
# Connection settings for the scooter database. Every value can be overridden
# through an environment variable so real credentials never have to be saved
# in the notebook; the fallbacks reproduce the original local setup
# (postgres:postgres@localhost:5432/scooter).
database_name = os.environ.get('SCOOTER_DB_NAME', 'scooter')
db_user = os.environ.get('SCOOTER_DB_USER', 'postgres')
db_password = os.environ.get('SCOOTER_DB_PASSWORD', 'postgres')
db_host = os.environ.get('SCOOTER_DB_HOST', 'localhost')
db_port = os.environ.get('SCOOTER_DB_PORT', '5432')

# URL-escape the user name and password so characters such as '@' or '/' in an
# overridden credential cannot corrupt the connection URL.
connection_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database_name}"
)

# creating the engine that the query cells below connect through
engine = create_engine(connection_string)

In [None]:
#loading every row and column of the trips table into a pandas dataframe.

allcompanies = '''
SELECT *
FROM trips
'''

with engine.connect() as connection:
    allcompanies_df = pd.read_sql_query(sql=text(allcompanies), con=connection)

In [None]:
#querying the earliest and latest pubdatetime in the scooters table;
#the data should only include 3 months (May, June, July)

date_range = '''
SELECT MIN(pubdatetime), MAX(pubdatetime)
FROM scooters
'''

with engine.connect() as connection:
    date_range_df = pd.read_sql_query(sql=text(date_range), con=connection)

date_range_df

In [None]:
#writing the trips dataframe out to a .csv file. The dataframe index is written
#as well (index=True is the default), so the file gains a leading unnamed column.

allcompanies_df.to_csv('../data/allcompanies_df.csv', index=True)

In [None]:
#reading the saved .csv back in as `trips`, so that new datatypes can be
#assigned more easily as needed.

trips = pd.read_csv(filepath_or_buffer='../data/allcompanies_df.csv')

In [None]:
#cleaning the reloaded dataframe: dropping the old index column (the .csv was
#written with its index, which reads back in as 'Unnamed: 0') and resetting the index.
#errors='ignore' keeps this cell safe to re-run; without it a second run raises
#a KeyError because the column has already been dropped.

trips = trips.drop(columns='Unnamed: 0', errors='ignore').reset_index(drop=True)
trips.head()

In [None]:
#building the trip start timestamp: startdate and starttime are joined with a
#space and parsed into a datetime column. .info() verifies the new dtype.

trips['startdatetime'] = pd.to_datetime(trips['startdate'] + " " + trips['starttime'])
trips.info()

In [None]:
#building the trip end timestamp: enddate and endtime are joined with a
#space and parsed into a datetime column. .info() verifies the new dtype.

trips['enddatetime'] = pd.to_datetime(trips['enddate'] + " " + trips['endtime'])
trips.info()

In [None]:
#previewing the first five rows to verify the cleaning steps and the new datetime columns.

trips.head(n=5)

In [None]:
#recalculating trip duration from the start and end timestamps. A previous investigation
#found that the reported trip duration for Bolt Mobility and JUMP was misrepresented:
#Bolt is assumed to be displayed in seconds, and JUMP seems to max out at 1440 minutes
#regardless of start and end times. This overwrites the reported 'tripduration' column.

trips['tripduration'] = trips['enddatetime'].sub(trips['startdatetime'])

In [None]:
#timedelta thresholds used below to flag trips that are too short or too long

oneminute = dt.timedelta(seconds=60)
oneday = dt.timedelta(hours=24)

### 2. According to Second Substitute Bill BL2018-1202, all permitted operators will first clean data before providing or reporting data to Metro. Data processing and cleaning shall include:  
     Removal of staff servicing and test trips  
     Removal of trips below one minute  
     Trip lengths are capped at 24 hours
Are the scooter companies in compliance with the second and third parts of this rule?

##### Focusing on question 2 first, as the end result will be the data I use to answer the remaining questions. This question serves as a cleaning step: building on the groundwork laid above and the guidelines listed here, I will filter the data down to only the relevant rows.

In [None]:
#adding boolean flag columns that mark, row by row, in which direction a trip is not compliant
#(shorter than one minute / longer than one day).
#also flagging trips with a NEGATIVE distance and trips greater than 20 miles
#(105600 feet, the average scooter range) so they can be removed later.

trips['under1minute'] = trips['tripduration'].lt(oneminute)
trips['over1day'] = trips['tripduration'].gt(oneday)
trips['negativedistance'] = trips['tripdistance'].lt(0)
trips['over20miles'] = trips['tripdistance'].gt(105600)

In [None]:
#per company, the share of rows that are / are not under one minute long.
#'False' represents rows that ARE IN compliance with the
#'removal of trips below one minute' guideline.

trips.groupby('companyname')['under1minute'].value_counts(normalize=True)

In [None]:
#per company, the share of rows that are / are not over one day long.
#'False' represents rows that ARE IN compliance with the
#'trip lengths are capped at 24 hours' guideline.

trips.groupby('companyname')['over1day'].value_counts(normalize=True)

## Q2 Answer
Gotcha, JUMP, and SPIN are in compliance with no trips under 1 minute in length, all other companies are not. 

Bird, Gotcha, and Lime are in compliance with no trips over 24 hours in length, all other companies are not.

No company was in compliance with both rules. The trip durations reported by Bolt Mobility and JUMP did not match the given data dictionary: Bolt Mobility reported trip durations in seconds, while JUMP simply capped the 'tripduration' column at 1440 minutes (24 hours) regardless of the actual difference between start and end times. 
These were corrected above, before this question was officially answered. 'tripduration' is now calculated as the difference between 'startdatetime' and 'enddatetime', which are datetime columns created by concatenating the relevant date and time fields and converting the result to a datetime type.

Moving forward, I will remove the offending rows and continue analysis with only rows that are in compliance. 

Initially there are 565522 rows. There are 12535 rows not in compliance. There are 34747 duplicate rows, largely from Lime. I also discovered 106 rows with negative trip distances or trip distances well in excess of 20 miles (105,600 feet), the average range of a fully charged SUMD. These rows were filtered out and removed from the dataset as well, leaving only trips with a non-negative distance of at most 20 miles.

In [None]:
#First, collecting the duplicated rows: any row that repeats an earlier row's
#sumdid, starttime and startdate. The duplicates are exported to .csv.

tripsduplicated = trips[trips.duplicated(subset=['sumdid', 'starttime', 'startdate'])]
tripsduplicated.to_csv('../data/tripsduplicated.csv')
tripsduplicated.shape

In [None]:
#number of duplicated rows per company

trips[trips.duplicated(subset=['sumdid', 'starttime', 'startdate'])].value_counts(subset='companyname')

In [None]:
#first cleaning step: keeping only the first occurrence of each
#(sumdid, starttime, startdate) combination. The original row labels are kept.

tripscleaned = trips.drop_duplicates(subset=['sumdid', 'starttime', 'startdate'])
tripscleaned.shape

In [None]:
#collecting the offending rows into their own dataframes, maybe useful for future analysis.
#these are built from `trips`, i.e. before the duplicated rows were removed.
#there are 12535 rows not in compliance.

#rows with a distance problem (negative, or over 20 miles)
tripnegativedistance = trips.loc[trips['negativedistance']]
tripover20miles = trips.loc[trips['over20miles']]
distanceerrors = pd.concat([tripnegativedistance, tripover20miles], axis=0, ignore_index=True)

#rows with a duration problem (under one minute, or over one day)
notincomplianceminute = trips.loc[trips['under1minute']]
notincomplianceday = trips.loc[trips['over1day']]
notincompliance = pd.concat([notincomplianceminute, notincomplianceday], axis=0, ignore_index=True)

In [None]:
#finalizing the cleaning steps: keeping only rows with no distance flag and no duration flag set,
#then dropping the flag columns and resetting the index. Verifying the new shape matches expectations.
tripscleaned = tripscleaned[~(tripscleaned['negativedistance'] | tripscleaned['over20miles'])]
tripscleaned = tripscleaned[~(tripscleaned['under1minute'] | tripscleaned['over1day'])]
tripscleaned = (
    tripscleaned
    .drop(columns=['under1minute', 'over1day', 'over20miles', 'negativedistance'])
    .reset_index(drop=True)
)
tripscleaned.shape

In [None]:
#creating new .csv files of the cleaned data to continue analysis in another notebook.
#NOTE: the three exports below are commented out, so running this cell writes nothing;
#uncomment them to (re)generate the files.

#distanceerrors.to_csv('../data/distanceerrors.csv')
#notincompliance.to_csv('../data/notincompliance.csv')
#tripscleaned.to_csv('../data/tripscleaned.csv')