In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the ride records from uber.csv (path is relative to the working directory).
df  = pd.read_csv("uber.csv")

In [None]:
# Preview the first rows of the frame as loaded.
df.head()

In [None]:
# Column dtypes and non-null counts as loaded.
df.info()

In [None]:
# List the column labels; two of them are dropped in the next cell.
df.columns

In [None]:
# Drop the 'Unnamed: 0' and 'key' columns; the result is rebound to df.
df = df.drop(['Unnamed: 0', 'key'], axis= 1)

In [None]:
# Preview again to confirm the two columns are gone.
df.head()

In [None]:
# (rows, columns) after the drop.
df.shape

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Count missing values per column; the next cell imputes the dropoff coordinates.
df.isnull().sum() 

In [None]:
# Impute missing dropoff coordinates. A frame-level fillna with a
# column -> value mapping returns a new frame that is rebound to df, so the
# fill is guaranteed to land in df. The previous form called
# fillna(..., inplace=True) on a selected column, which pandas deprecates and
# which leaves df untouched when Copy-on-Write is enabled.
# Each column keeps its original statistic: mean for latitude, median for longitude.
df = df.fillna({
    'dropoff_latitude': df['dropoff_latitude'].mean(),
    'dropoff_longitude': df['dropoff_longitude'].median(),
})

In [None]:
# Re-check the per-column missing-value counts after the imputation cell.
df.isnull().sum() 

In [None]:
df.dtypes

In [None]:
# Parse pickup_datetime into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce') 

In [None]:
df.head()

In [None]:
# Confirm the dtype of pickup_datetime after the conversion.
df.dtypes

In [None]:
# Derive one calendar feature per listed attribute of the pickup timestamp's
# .dt accessor; the new columns are appended in the order given here.
df = df.assign(**{
    part: getattr(df['pickup_datetime'].dt, part)
    for part in ('hour', 'day', 'month', 'year', 'dayofweek')
})

In [None]:
# Preview the frame with the added hour/day/month/year/dayofweek columns.
df.head()

In [None]:
# The timestamp column is removed once its parts have been extracted.
# The target is named with `columns=` instead of passing the axis as the
# float 1.0, which only worked because 1.0 compares and hashes equal to 1.
df = df.drop(columns='pickup_datetime')

In [None]:
# Confirm pickup_datetime is no longer among the columns.
df.head()

In [None]:
df.dtypes

## Checking for outliers and capping them

In [None]:
# One box plot per column, arranged on a 7x2 grid, to inspect the spread before capping.
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))

In [None]:
def remove_outlier(df1 , col):
    """Cap one column of ``df1`` at its 1.5 * IQR whiskers.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to that bound; no rows are removed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : hashable
        Label of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object, with ``col`` capped.
    """
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # Write to the frame that was passed in, not to a module-level `df`.
    df1[col] = df1[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df1


def treat_outliers_all(df1 , col_list):
    """Cap every column named in ``col_list`` with :func:`remove_outlier`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to process. It is modified in place.
    col_list : iterable
        Column labels to cap, processed in iteration order.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object after all listed columns were capped.
    """
    for c in col_list:
        # Thread the argument through; the global `df` must not be used here.
        df1 = remove_outlier(df1 , c)
    return df1

In [None]:
# Cap outliers in every column: pass the complete list of column labels.
df = treat_outliers_all(df, list(df.columns))

In [None]:
# Same 7x2 grid of per-column box plots, redrawn after the capping step.
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))

In [None]:
pip install haversine

In [None]:
import haversine as hs

# Great-circle distance in kilometres between pickup and dropoff for every ride.
# haversine_vector takes two aligned arrays of (latitude, longitude) pairs and
# computes all rows in one call. Working on positional arrays also removes the
# per-row df[col][pos] lookups, which were label-based and only correct while
# the index was exactly 0..N-1.
pickup_points = df[['pickup_latitude', 'pickup_longitude']].to_numpy()
dropoff_points = df[['dropoff_latitude', 'dropoff_longitude']].to_numpy()
travel_dist = hs.haversine_vector(pickup_points, dropoff_points, unit=hs.Unit.KILOMETERS)

df['dist_travel_km'] = travel_dist
# Show a preview instead of printing the distance of every single ride.
df.head()

In [None]:
# Keep only rides whose distance lies in [1, 130] km. The two bounds must be
# combined with AND: joined with OR, every non-missing distance satisfies at
# least one side, so no row was ever filtered out.
df = df.loc[(df.dist_travel_km >= 1) & (df.dist_travel_km <= 130)]
print("Remaining observastions in the dataset:", df.shape)

In [None]:
# Rows with at least one coordinate outside the valid geographic range:
# latitude must lie in [-90, 90] and longitude in [-180, 180]. The dropoff
# longitude was previously checked against the latitude bounds (+/-90).
incorrect_coordinates = df.loc[
    (df.pickup_latitude > 90) | (df.pickup_latitude < -90) |
    (df.dropoff_latitude > 90) | (df.dropoff_latitude < -90) |
    (df.pickup_longitude > 180) | (df.pickup_longitude < -180) |
    (df.dropoff_longitude > 180) | (df.dropoff_longitude < -180)
]

In [None]:
# Remove the flagged rows by their index labels. Passing the DataFrame itself
# as the labels makes pandas look up its column names in the row index, and
# with errors='ignore' the misses are skipped silently, so nothing was dropped.
df = df.drop(index=incorrect_coordinates.index)

In [None]:
df.head()

In [None]:
# Per-column missing-value counts after the distance feature and row filters.
df.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask of df.
sns.heatmap(df.isnull())

In [None]:
# Pairwise correlation matrix of df's columns.
corr = df.corr()

In [None]:
corr

In [None]:
# Annotated correlation heatmap, drawn on an explicitly sized 10x6 figure.
fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(data=df.corr(), annot=True, ax=axis)

In [None]:
# Feature matrix: the four coordinates, passenger count, the calendar parts
# and the computed travel distance.
x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','hour','day','month','year','dayofweek','dist_travel_km']]

In [None]:
# Target: the fare_amount column.
y = df['fare_amount']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible
# across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression model with scikit-learn's default settings.
regression = LinearRegression()

In [None]:
# Fit on the training split.
regression.fit(X_train,y_train)

In [None]:
# Fitted intercept term.
regression.intercept_

In [None]:
# Fitted coefficients, one per column of X_train.
regression.coef_ 

In [None]:
# Predict fares for the held-out rows.
prediction = regression.predict(X_test)

In [None]:
print(prediction)

In [None]:
# Actual fares of the same held-out rows, for comparison with the predictions.
y_test

In [None]:
from sklearn.metrics import r2_score 

In [None]:
# R^2 of the linear model on the test split.
r2_score(y_test,prediction)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error of the linear model on the test split.
MSE = mean_squared_error(y_test,prediction)

In [None]:
MSE 

In [None]:
# RMSE: square root of the MSE, expressed in the units of fare_amount.
RMSE = np.sqrt(MSE)

In [None]:
RMSE

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# 100-tree random forest. A fixed random_state makes the bootstrap samples and
# feature sub-sampling, and therefore the reported scores, reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the same training split used for the linear model.
rf.fit(X_train,y_train)

In [None]:
# Predict fares for the held-out rows.
y_pred = rf.predict(X_test)

In [None]:
y_pred

In [None]:
# R^2 of the random forest on the test split.
R2_Random = r2_score(y_test,y_pred)

In [None]:
R2_Random

In [None]:
# Mean squared error of the random forest on the test split.
MSE_Random = mean_squared_error(y_test,y_pred)

In [None]:
MSE_Random

In [None]:
# RMSE of the random forest, comparable with RMSE of the linear model.
RMSE_Random = np.sqrt(MSE_Random)

In [None]:
RMSE_Random