In [1]:
!pip install geopy

Collecting geopy
  Using cached geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Using cached geographiclib-2.1-py3-none-any.whl.metadata (1.6 kB)
Using cached geopy-2.4.1-py3-none-any.whl (125 kB)
Using cached geographiclib-2.1-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy

   -------------------- ------------------- 1/2 [geopy]
   -------------------- ------------------- 1/2 [geopy]
   ---------------------------------------- 2/2 [geopy]

Successfully installed geographiclib-2.1 geopy-2.4.1


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from geopy.distance import great_circle

# Load your dataset
# Replace 'your_dataset.csv' with the actual path to your dataset file
data = pd.read_csv('uber.csv')

# Check for and drop rows with missing or invalid coordinates
data = data.dropna(subset=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
'dropoff_longitude'])
data = data[(data['pickup_latitude'] >= -90) & (data['pickup_latitude'] <= 90)]
data = data[(data['pickup_longitude'] >= -180) & (data['pickup_longitude'] <= 180)]
data = data[(data['dropoff_latitude'] >= -90) & (data['dropoff_latitude'] <= 90)]
data = data[(data['dropoff_longitude'] >= -180) & (data['dropoff_longitude'] <= 180)]

# Calculate the distance for each row and create a new column 'distance'
data['distance'] = data.apply(lambda row: great_circle(
(row['pickup_latitude'], row['pickup_longitude']),
(row['dropoff_latitude'], row['dropoff_longitude'])).miles, axis=1)

# Save the updated DataFrame to a new CSV file

# Replace 'updated_dataset.csv' with the desired file name
data.to_csv('uber.csv', index=False)
# Step 1: Pre-process the dataset
# Assume you have columns like 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude','dropoff_latitude', 'distance', and 'price'.

# Handle missing values if any
data.dropna(inplace=True)

# Encode categorical features if necessary
# You may need to convert categorical variables into numerical format using techniques likeone-hot encoding.

# Split the dataset into features (X) and target (y)
X = data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
'distance']]
y = data['fare_amount']
# Step 2: Identify outliers
# You can use different methods like z-score or IQR to detect outliers.
from scipy import stats
z_scores = np.abs(stats.zscore(data['distance']))
outliers = (z_scores > 3)
data = data[~outliers]
# Step 3: Check the correlation
correlation_matrix = data.corr()
print(correlation_matrix)
# Step 4: Implement linear regression, Ridge, and Lasso regression models
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Initialize the models
linear_reg_model = LinearRegression()
ridge_model = Ridge(alpha=100) # You can adjust the alpha parameter
lasso_model = Lasso(alpha=0.01) # You can adjust the alpha parameter
# Train the models
linear_reg_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)

# Step 5: Evaluate the models and compare their respective scores
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return r2, rmse
linear_reg_r2, linear_reg_rmse = evaluate_model(linear_reg_model, X_test_scaled, y_test)
ridge_r2, ridge_rmse = evaluate_model(ridge_model, X_test_scaled, y_test)
lasso_r2, lasso_rmse = evaluate_model(lasso_model, X_test_scaled, y_test)
print("Linear Regression R2 Score:", linear_reg_r2)
print("Linear Regression RMSE:", linear_reg_rmse)
print("Ridge Regression R2 Score:", ridge_r2)
print("Ridge Regression RMSE:", ridge_rmse)
print("Lasso Regression R2 Score:", lasso_r2)
print("Lasso Regression RMSE:", lasso_rmse)
# For example, you can create scatter plots of actual vs. predicted prices.
plt.scatter(y_test, linear_reg_model.predict(X_test_scaled), label='Linear Regression',
alpha=0.5)

plt.scatter(y_test, ridge_model.predict(X_test_scaled), label='Ridge Regression', alpha=0.5)
plt.scatter(y_test, lasso_model.predict(X_test_scaled), label='Lasso Regression', alpha=0.5)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()

ValueError: could not convert string to float: '2015-05-07 19:52:06.0000003'