## Importing libraries

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Dataset

In [101]:
# Load the dataset CSV into a DataFrame
DATASET_PATH = 'Final_DS_TFP.csv'
df = pd.read_csv(DATASET_PATH)

## Exploratory Data Analysis

### Checking datatypes and different variables of dataset

In [102]:
# df.info() prints a summary of the dataset: the index range, each column's name,
# non-null count and dtype, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             35000 non-null  int64  
 1   VendorID               35000 non-null  float64
 2   tpep_pickup_datetime   35000 non-null  object 
 3   tpep_dropoff_datetime  35000 non-null  object 
 4   passenger_count        35000 non-null  float64
 5   trip_distance          35000 non-null  float64
 6   RatecodeID             35000 non-null  float64
 7   PULocationID           35000 non-null  int64  
 8   DOLocationID           35000 non-null  int64  
 9   payment_type           35000 non-null  float64
 10  fare_amount            35000 non-null  float64
 11  improvement_surcharge  35000 non-null  float64
 12  total_amount           35000 non-null  float64
 13  congestion_surcharge   35000 non-null  float64
 14  store_and_fwd_flag     30000 non-null  object 
 15  ex

### Checking summary statistics (count, mean, std. deviation, median and quartiles) of the numeric variables

In [103]:
# describe() summarises every numeric column: count, mean, std. deviation, min,
# the 25%/50%/75% quartiles (50% is the median) and max. It does not report the mode.
df.describe()

Unnamed: 0.1,Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,improvement_surcharge,total_amount,congestion_surcharge,extra,mta_tax,tip_amount,tolls_amount
count,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,35000.0,30000.0,30000.0,30000.0,30000.0
mean,2499.5,1.676,1.410686,3.561623,1.0906,156.511486,153.805543,1.319686,13.450867,0.29532,19.160235,2.130286,0.968642,0.488317,2.190976,0.376508
std,1443.396264,0.468007,1.063471,4.393722,0.88651,67.879297,72.049546,0.527536,13.930782,0.052048,16.292448,0.92839,1.163523,0.094721,2.987772,1.663167
min,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-400.0,-0.3,-402.8,-2.5,-1.0,-0.5,0.0,-11.75
25%,1249.75,1.0,1.0,1.1,1.0,107.0,90.0,1.0,6.0,0.3,10.8,2.5,0.0,0.5,0.0,0.0
50%,2499.5,2.0,1.0,2.0,1.0,148.0,152.0,1.0,9.0,0.3,14.3,2.5,0.5,0.5,1.85,0.0
75%,3749.25,2.0,1.0,3.94,1.0,230.0,231.0,2.0,15.0,0.3,20.8,2.5,2.5,0.5,2.86,0.0
max,4999.0,2.0,6.0,63.81,99.0,265.0,265.0,4.0,520.0,0.3,624.35,2.5,4.25,0.5,104.05,45.0


#### The analysis above reveals some outliers that look unusual or improper
- The fare_amount variable has a negative minimum value, which is unusual
- The passenger_count variable has 0 as its minimum number of passengers
- The pickup and dropoff zone location IDs (PULocationID, DOLocationID) shouldn't be greater than 263

### Data Cleansing
- Removing missing values
- Removing Outliers


#### Removing records having fare_amount of 2.5 or less

In [104]:
# Keep only rows whose fare_amount is strictly greater than 2.5; this also
# discards the negative fares reported by df.describe().
# NOTE(review): rows with fare_amount exactly 2.5 are dropped as well - confirm
# that this boundary is intended.
fare_above_threshold = df['fare_amount'].gt(2.5)
df = df[fare_above_threshold]

#### Removing records having passenger_count as 0

In [105]:

# Keep only rows that report at least some passengers (passenger_count above 0)
has_passengers = df['passenger_count'].gt(0)
df = df[has_passengers]

#### Removing unnecessary location IDs

In [106]:
# Keep only rows where both the pickup and the drop-off location ID are at most 263
location_columns = ['PULocationID', 'DOLocationID']
df = df[df[location_columns].le(263).all(axis=1)]


### Checking missing(null) values

In [107]:
# Count the missing (NaN) values in each column of the filtered dataset
df.isna().sum()

Unnamed: 0                  0
VendorID                    0
tpep_pickup_datetime        0
tpep_dropoff_datetime       0
passenger_count             0
trip_distance               0
RatecodeID                  0
PULocationID                0
DOLocationID                0
payment_type                0
fare_amount                 0
improvement_surcharge       0
total_amount                0
congestion_surcharge        0
store_and_fwd_flag       4720
extra                    4720
mta_tax                  4720
tip_amount               4720
tolls_amount             4720
dtype: int64

#### Some variables have more than 4000 null values
- Dropping the variables that have more than 4000 null values

In [108]:

# Drop the five columns that still contain missing values (4720 nulls each in
# the df.isna().sum() output). The result is reassigned instead of using
# inplace=True: df was produced by boolean filtering, and mutating such a frame
# in place can trigger pandas' SettingWithCopyWarning.
sparse_columns = ['store_and_fwd_flag', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount']
df = df.drop(columns=sparse_columns)

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,improvement_surcharge,total_amount,congestion_surcharge
0,0,1.0,01-01-2021 00:30,01-01-2021 00:36,1.0,2.10,1.0,142,43,2.0,8.0,0.3,11.80,2.5
1,1,1.0,01-01-2021 00:51,01-01-2021 00:52,1.0,0.20,1.0,238,151,2.0,3.0,0.3,4.30,0.0
2,2,1.0,01-01-2021 00:43,01-01-2021 01:11,1.0,14.70,1.0,132,165,1.0,42.0,0.3,51.95,0.0
4,4,2.0,01-01-2021 00:31,01-01-2021 00:48,1.0,4.94,1.0,68,33,1.0,16.5,0.3,24.36,2.5
5,5,1.0,01-01-2021 00:16,01-01-2021 00:24,1.0,1.60,1.0,224,68,1.0,8.0,0.3,14.15,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34995,4995,2.0,2021-07-01 06:13:57,2021-07-01 06:19:01,1.0,0.86,1.0,48,170,1.0,5.5,0.3,9.68,2.5
34996,4996,2.0,2021-07-01 06:30:58,2021-07-01 06:33:59,1.0,0.91,1.0,48,163,2.0,4.5,0.3,7.80,2.5
34997,4997,2.0,2021-07-01 06:39:59,2021-07-01 06:51:40,1.0,3.62,1.0,48,75,1.0,12.5,0.3,16.80,2.5
34998,4998,2.0,2021-07-01 06:55:04,2021-07-01 06:58:27,2.0,0.52,1.0,75,75,2.0,4.5,0.3,5.30,0.0
