In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Step 1: Data Cleaning
# Read the training and test sets from CSV files; the relative paths are
# resolved against the notebook's current working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Discard, in place, every row that has at least one missing value
# (applied to the training frame first, then the test frame).
for frame in (train_data, test_data):
    frame.dropna(inplace=True)

In [4]:
# Step 2: Missing Value Analysis
# Print the per-column count of missing values for the training frame and
# then the test frame, to confirm that none remain.
for frame in (train_data, test_data):
    print(frame.isna().sum())


id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64
id                    0
vendor_id             0
pickup_datetime       0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
dtype: int64


In [6]:
# Step 3: Exploratory Data Analysis
# (No exploratory analysis is performed in this cell.)

# Step 4: Feature Creation
def add_pickup_time_features(frame):
    """Parse ``pickup_datetime`` and add calendar features derived from it.

    The frame is modified in place: ``pickup_datetime`` is converted with
    ``pd.to_datetime`` and three columns are added -- ``pickup_hour``,
    ``pickup_day`` (day of the month) and ``pickup_month``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``pickup_datetime`` column that ``pd.to_datetime``
        can parse.

    Returns
    -------
    None
    """
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
    pickup = frame['pickup_datetime'].dt
    frame['pickup_hour'] = pickup.hour
    frame['pickup_day'] = pickup.day
    frame['pickup_month'] = pickup.month


# Use one helper for both frames so the train and test features cannot drift apart.
add_pickup_time_features(train_data)
# dropoff_datetime is present only in the training set, so it is parsed separately.
train_data['dropoff_datetime'] = pd.to_datetime(train_data['dropoff_datetime'])

add_pickup_time_features(test_data)

In [7]:
# Build the design matrix X from the selected predictor columns and take
# trip_duration as the regression target y.
feature_columns = [
    'pickup_hour',
    'pickup_day',
    'pickup_month',
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
X = train_data[feature_columns]
y = train_data['trip_duration']

In [12]:
# Hold out 30% of the rows for validation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [13]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

In [14]:
# Predict on the validation set
# y_pred holds the fitted model's predicted trip_duration for each row of X_val;
# it is compared against y_val in the evaluation cell.
y_pred = model.predict(X_val)

In [15]:
# Score the validation predictions. MSE is expressed in squared units of
# trip_duration, MAE in the same units as trip_duration.
mse, mae = (
    metric(y_val, y_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)

for label, value in (('Mean Squared Error:', mse), ('Mean Absolute Error:', mae)):
    print(label, value)


Mean Squared Error: 10173500.339999055
Mean Absolute Error: 596.7040954607078


In [16]:
# Use the trained model to predict on the test set.
# The feature columns are taken from the training matrix X rather than from a
# second hard-coded list, so the test features always match the columns (and
# column order) the model was fitted on.
test_features = test_data[list(X.columns)]
test_predictions = model.predict(test_features)

In [17]:
# Attach the predictions to the test frame as a new column, then write the
# whole frame (without the index) to a CSV file.
output_path = 'test_predictions.csv'
test_data['trip_duration_predicted'] = test_predictions
test_data.to_csv(output_path, index=False)