<div style="text-align: center; background-color: black; padding: 20px;">
    <h1 style="color:Yellow; font-family: 'Arial', sans-serif;">Taxi Fare Data</h1>
</div>



In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("TaxiFare.csv")

In [None]:
df.head()

In [5]:
df.drop("unique_id", axis =1, inplace= True)

In [None]:
df.describe()
# from the tabel it can be concluded that there are extreame outliers in latitude and longitude coulumns.
# The longitude of pickup is varying from -72 to -75. Latitude pickup is mmostly around 40 t 41.

In [None]:
df.isnull().sum() # This dataset contains zero null values

In [8]:
df = df[df['amount'] >= 3]                   # Minimum fare charge is $3.00 .
df = df[df['no_of_passenger'] >= 1]          # Taxi charges only when there's at least one passenger.
df = df[(df['longitude_of_pickup'] <= -72) & (df['longitude_of_pickup'] >= -75)]
df = df[(df['latitude_of_pickup'] <= 42) & (df['latitude_of_pickup'] >= 40)]
df = df[(df['longitude_of_dropoff'] <= -72) & (df['longitude_of_dropoff'] >= -75)]
df = df[(df['latitude_of_dropoff'] <= 42) & (df['latitude_of_dropoff'] >= 40)]

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
sns.scatterplot( df.amount, color = 'b')
sns.set_style("dark")
plt.title("Amount Distribution", fontsize = 18, color = 'b', weight = 'bold')
plt.grid()
plt.ylabel("Amount")
plt.xlabel("Count")

In [12]:
# It can be interpreted that the amount is mostly upto $60 and the rest are outliers.

In [13]:
df = df[df.amount < 50]

In [None]:
df.shape

In [None]:
df.amount.describe()

In [None]:
df.dtypes

In [16]:
df['date_time_of_pickup'] = pd.to_datetime(df['date_time_of_pickup'])
df["hour"] = df.date_time_of_pickup.dt.hour
df["month"] = df.date_time_of_pickup.dt.month
df["weekday"] = df.date_time_of_pickup.dt.weekday

In [17]:
df.drop("date_time_of_pickup", axis =1, inplace =True)

In [None]:
df.describe()

In [19]:
# to calclate distance we use  Haversine formula.
# creating a function that will take co-ordinated as data and will return the distance travelled.

In [20]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

In [None]:
df["distance_km"] = haversine_distance(df.latitude_of_pickup ,df.longitude_of_pickup, df.latitude_of_dropoff, df.longitude_of_dropoff)
df.distance_km.describe()
# we have minimum distance 0 that means the person must have not took taxi or traveeled and came back to same place.

In [None]:
df[df.distance_km==0].shape # there are 471 rows that have same pickup and dropoff co ordinates.
# Because of which distance travelled is zero and it makes no sense.

In [None]:
df= df[df.distance_km > 0.1]
df.distance_km.describe()

## Model Building

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [25]:
lr = LinearRegression()

In [None]:
df.describe()

In [None]:
plt.figure(figsize=(12, 10))

plt.subplot(2, 3, 1)
sns.histplot(df['amount'])
plt.title("Amount")

plt.subplot(2, 3, 2)
sns.histplot(df['longitude_of_pickup'])
plt.title("Pickup Longitude")

plt.subplot(2, 3, 3)
sns.histplot(df['latitude_of_pickup'])
plt.title("Pickup Latitude")

plt.subplot(2, 3, 4)
sns.histplot(df['longitude_of_dropoff'])
plt.title("Dropoff Longitude")

plt.subplot(2, 3, 5)
sns.histplot(df['latitude_of_dropoff'])
plt.title("Dropoff Latitude")

plt.subplot(2, 3, 6)
sns.histplot(df['distance_km'])
plt.title("Distance")

plt.tight_layout()

In [None]:
plt.figure(figsize = (8, 8))


plt.subplot(1,2,1)
sns.scatterplot(x = df.amount, y = df.distance_km) # there are definately some outliers in distance that needed to fixed.
plt.title("Distance vs Amount with outliers")
plt.grid()

df = df[df["distance_km"]<= 25]
plt.subplot(1,2,2)
sns.scatterplot(x = df.amount, y =df.distance_km)
plt.title("Distance vs Amount Filtered")

plt.grid()

In [29]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [30]:
x = df.drop("amount", axis =1)
y = df.amount

In [31]:
x_scaled = scaler.fit_transform(x)

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y , test_size= 0.3, random_state=42)

In [None]:
lr.fit(x_train, y_train)
lr.score(x_train, y_train)

In [None]:
lr.score(x_test, y_test)

# Trial and error

## KNN

In [52]:
from sklearn.neighbors import KNeighborsRegressor

In [36]:
knn_model = KNeighborsRegressor(n_neighbors=5)

In [None]:
knn_model.fit(x_train, y_train)

In [None]:
knn_model.score(x_train, y_train)

In [None]:
knn_model.score(x_test, y_test)

## Decision Tree

In [40]:
from sklearn.tree import DecisionTreeRegressor

In [41]:
tree_model = DecisionTreeRegressor(max_depth = 7)

In [None]:
tree_model.fit(x_train, y_train)

In [None]:
tree_model.score(x_train, y_train)

In [None]:
tree_model.score(x_test, y_test)

## Random Forest

In [45]:
from sklearn.svm import SVR

In [46]:
svr = SVR(kernel = 'linear',)

In [None]:
svr.fit(x_train, y_train)

In [None]:
svr.score(x_train, y_train)

In [49]:
svr.score(x_test, y_test)

0.797394181949618

In [50]:
# can try after removing latitude and longitude

# After removing co ordinates column

In [73]:
x = df.drop(['amount','longitude_of_pickup','latitude_of_pickup','longitude_of_dropoff'], axis=1)
y = df.amount

In [54]:
x_scaled = scaler.fit_transform(x)

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y , test_size= 0.3, random_state=42)

In [None]:
lr.fit(x_train, y_train)
lr.score(x_train, y_train)

In [None]:
lr.score(x_test, y_test)