In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [None]:
# Attach Google Drive to the Colab runtime under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

IMPORTING DATA


In [None]:
# Load the training and test tables from CSV.
# NOTE(review): both paths point directly at /content, not at the Drive mount
# under /content/drive, and the recorded output of this cell is a
# FileNotFoundError -- confirm where the CSV files actually live.
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")

# Preview the first five rows of the training table.
train_data.head(n=5)


FileNotFoundError: ignored

In [None]:
# Preview the first five rows of the test table.
test_data.head(n=5)

HANDLING MISSING VALUES

In [None]:
# Count missing values per column BEFORE any rows are removed, and keep the
# result: a notebook only renders the last expression of a cell, so a bare
# `.isnull().sum()` in the middle of the cell would be silently discarded.
missing_counts = train_data.isnull().sum()

# Drop every row that contains at least one missing value. Rebinding the name
# (rather than `inplace=True`) keeps the step an explicit expression.
train_data = train_data.dropna()

# Display the pre-drop missing-value counts as the cell output.
missing_counts


CLEANING DATA AND CONVERTING DATE AND TIME TO CORRECT FORMAT

In [None]:
# Run pd.to_datetime over the timestamp columns: pickup and dropoff for the
# training table, pickup only for the test table.
for _frame, _column in (
    (train_data, 'pickup_datetime'),
    (train_data, 'dropoff_datetime'),
    (test_data, 'pickup_datetime'),
):
    _frame[_column] = pd.to_datetime(_frame[_column])

VISUALIZING THE DATA

In [None]:
# Histogram of trip_duration in the training table, using 50 bins.
fig, ax = plt.subplots()
ax.hist(train_data['trip_duration'], bins=50)
ax.set_xlabel('Trip Duration')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Trip Duration')
plt.show()


In [None]:
# Overlay pickup (red) and dropoff (blue) coordinates on a single set of axes.
location_layers = (
    ('pickup_longitude', 'pickup_latitude', 'r', 'Pickup'),
    ('dropoff_longitude', 'dropoff_latitude', 'b', 'Dropoff'),
)
for lon_col, lat_col, colour, legend_label in location_layers:
    sns.scatterplot(x=lon_col, y=lat_col, data=train_data, color=colour, label=legend_label)

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Pickup and Dropoff Locations')
plt.legend()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the training table.
# This cell sits before the encoding and scaling cells, so it must not rely on
# `train_data_encoded` or `numeric_cols` (both are created further down and
# would raise NameError on a fresh top-to-bottom run). The numeric columns are
# selected directly from `train_data` instead.
correlation_matrix = train_data.select_dtypes(include=np.number).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Bar chart of how many trips there are for each passenger count.
# Plots from `train_data`: the one-hot encoded frame is only created in a later
# cell, and `passenger_count` is not one of the encoded columns, so the values
# are the same while the cell no longer depends on out-of-order execution.
plt.figure(figsize=(8, 6))
sns.countplot(x='passenger_count', data=train_data)
plt.xlabel('Passenger Count')
plt.ylabel('Count')
plt.title('Distribution of Passenger Count')
plt.show()


In [None]:
# Box plot of trip duration grouped by passenger count.
# Plots from `train_data`: the one-hot encoded frame is only created in a later
# cell, and neither column used here is affected by that encoding, so the
# figure is the same while the cell runs correctly in notebook order.
plt.figure(figsize=(8, 6))
sns.boxplot(x='passenger_count', y='trip_duration', data=train_data)
plt.xlabel('Passenger Count')
plt.ylabel('Trip Duration')
plt.title('Trip Duration by Passenger Count')
plt.show()


In [None]:
# Mean trip duration per calendar day of pickup.
# Grouping on the derived date directly (instead of writing a 'pickup_date'
# column into `train_data_encoded`) avoids two problems: that frame does not
# exist yet at this point of the notebook, and the extra column would otherwise
# end up in the frame later used to build the model features.
# Requires `pickup_datetime` to already be a datetime column.
daily_trip_duration = (
    train_data
    .groupby(train_data['pickup_datetime'].dt.date)['trip_duration']
    .mean()
    .rename_axis('pickup_date')
)

plt.figure(figsize=(12, 6))
plt.plot(daily_trip_duration.index, daily_trip_duration.values)
plt.xlabel('Date')
plt.ylabel('Average Trip Duration')
plt.title('Average Trip Duration Over Time')
plt.xticks(rotation=45)
plt.show()


DATA EXPLORATION, CONVERTING CATEGORICAL DATA

In [None]:
# Convert categorical data to numeric using one-hot encoding.
categorical_cols = ['vendor_id', 'store_and_fwd_flag']

# `dtype=np.uint8` is passed explicitly: since pandas 2.0 the default indicator
# dtype is bool, and bool columns are not matched by the later
# `select_dtypes(include=np.number)` filter, which would silently drop every
# encoded feature from the models.
train_data_encoded = pd.get_dummies(train_data, columns=categorical_cols, dtype=np.uint8)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_cols, dtype=np.uint8)

# The two tables are encoded independently, so a category value that never
# occurs in the test table produces no indicator column there. Add any such
# column as all zeros so the test features line up with the training features.
encoded_cols = train_data_encoded.columns.difference(train_data.columns)
for missing_col in encoded_cols.difference(test_data_encoded.columns):
    test_data_encoded[missing_col] = 0

MODEL TRAINING

In [None]:
# Target vector: the trip_duration column.
y = train_data_encoded['trip_duration']

# Feature matrix: every column except the target and the 'id' column.
X = train_data_encoded.drop(columns=['id', 'trip_duration'])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# The train/validation split is already produced by the previous cell with a
# fixed random_state; repeating it here would only recompute identical frames.
# Validate the existing split instead: no rows lost, and features/targets
# still refer to the same rows.
assert len(X_train) + len(X_val) == len(X), "train/validation rows do not add up to the full set"
assert X_train.index.equals(y_train.index), "training features and targets are misaligned"
assert X_val.index.equals(y_val.index), "validation features and targets are misaligned"

In [None]:

# Standardise the numeric features.
# The scaler is fitted on the training split only and then re-used, unchanged,
# for the validation split so no validation statistics leak into training.
scaler = StandardScaler()

# Only numeric columns are fed to the models; non-numeric columns are left out.
numeric_cols = X_train.select_dtypes(include=np.number).columns

# Build the scaled frames directly from the numeric selection instead of
# copying the whole frame, selecting a subset, and assigning back into it.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols,
    index=X_train.index,
)

X_val_scaled = pd.DataFrame(
    scaler.transform(X_val[numeric_cols]),
    columns=numeric_cols,
    index=X_val.index,
)

DIFFERENT MODELS: LINEAR, MULTIPLE, RIDGE, LASSO

In [None]:
# Fit a linear regression on the scaled training features, predict the
# validation split, and record its MSE and R^2.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_val_scaled)

mse_lr = mean_squared_error(y_true=y_val, y_pred=y_pred_lr)
r2_lr = r2_score(y_true=y_val, y_pred=y_pred_lr)

In [None]:
# Fit the "multiple linear regression" model, predict the validation split,
# and record its MSE and R^2.
# NOTE(review): this uses the same estimator class and the same inputs as the
# `lr` model, so the two are expected to score identically -- confirm whether
# a different feature set was intended for one of them.
mlr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_mlr = mlr.predict(X_val_scaled)

mse_mlr = mean_squared_error(y_true=y_val, y_pred=y_pred_mlr)
r2_mlr = r2_score(y_true=y_val, y_pred=y_pred_mlr)




In [None]:
# Ridge regression (alpha=1.0): fit on the scaled training features, predict
# the validation split, and record MSE and R^2.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_val_scaled)

mse_ridge = mean_squared_error(y_true=y_val, y_pred=y_pred_ridge)
r2_ridge = r2_score(y_true=y_val, y_pred=y_pred_ridge)

In [None]:
# Lasso regression (alpha=1.0): fit on the scaled training features, predict
# the validation split, and record MSE and R^2.
lasso = Lasso(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_val_scaled)

mse_lasso = mean_squared_error(y_true=y_val, y_pred=y_pred_lasso)
r2_lasso = r2_score(y_true=y_val, y_pred=y_pred_lasso)

SCORES OF ALL MODELS

In [None]:
# Report validation MSE and R^2 for every fitted model, one group per model,
# with a blank line between consecutive groups.
model_scores = (
    ("Linear Regression:", mse_lr, r2_lr),
    ("Multiple Linear Regression:", mse_mlr, r2_mlr),
    ("Ridge Regression:", mse_ridge, r2_ridge),
    ("Lasso Regression:", mse_lasso, r2_lasso),
)
for position, (heading, mse_value, r2_value) in enumerate(model_scores):
    if position > 0:
        print()
    print(heading)
    print("MSE:", mse_value)
    print("R2 Score:", r2_value)

FITTING DATA FOR PREDICTIONS

In [None]:
# Prepare the test features for prediction.
# The 'id' column is dropped, the numeric columns chosen during training are
# selected, and the already-fitted scaler is applied (transform only, so the
# test data does not influence the scaling parameters).
X_test = test_data_encoded.drop('id', axis=1)

# Build the scaled frame directly from the numeric selection instead of
# copying the whole frame, selecting a subset, and assigning back into it.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numeric_cols]),
    columns=numeric_cols,
    index=X_test.index,
)

In [None]:
# Linear regression: predict on the scaled test features and show the
# predictions as a single-column table.
# NOTE(review): `y_pred_test` is reassigned by each prediction cell, so it
# always holds the output of whichever of them ran last.
y_pred_test_lr = lr.predict(X_test_scaled)
y_pred_test = pd.DataFrame(data=y_pred_test_lr)
y_pred_test

In [None]:

# Ridge regression: predict on the scaled test features and show the
# predictions as a single-column table.
# NOTE(review): `y_pred_test` is reassigned by each prediction cell, so it
# always holds the output of whichever of them ran last.
y_pred_test_ridge = ridge.predict(X_test_scaled)
y_pred_test = pd.DataFrame(data=y_pred_test_ridge)
y_pred_test

In [None]:
# Lasso regression: predict on the scaled test features and show the
# predictions as a single-column table.
# NOTE(review): `y_pred_test` is reassigned by each prediction cell, so it
# always holds the output of whichever of them ran last.
y_pred_test_lasso = lasso.predict(X_test_scaled)
y_pred_test = pd.DataFrame(data=y_pred_test_lasso)
y_pred_test

In [None]:
# "Multiple linear regression" model: predict on the scaled test features and
# show the predictions as a single-column table.
# NOTE(review): `y_pred_test` is reassigned by each prediction cell, so it
# always holds the output of whichever of them ran last.
y_pred_test_mlr = mlr.predict(X_test_scaled)
y_pred_test = pd.DataFrame(data=y_pred_test_mlr)
y_pred_test