In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("/content/train.csv")
print(" Original Data Loaded")
print(" Shape:", df.shape)
print(" First 5 rows:")
df.head()

✅ Original Data Loaded
🔹 Shape: (22876, 11)
🔹 First 5 rows:


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1.0,-73.982155,40.767937,-73.96463,40.765602,N,455.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1.0,-73.980415,40.738564,-73.999481,40.731152,N,663.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1.0,-73.979027,40.763939,-74.005333,40.710087,N,2124.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1.0,-74.01004,40.719971,-74.012268,40.706718,N,429.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1.0,-73.973053,40.793209,-73.972923,40.78252,N,435.0


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22876 entries, 0 to 22875
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  22876 non-null  object 
 1   vendor_id           22876 non-null  int64  
 2   pickup_datetime     22876 non-null  object 
 3   dropoff_datetime    22876 non-null  object 
 4   passenger_count     22875 non-null  float64
 5   pickup_longitude    22875 non-null  float64
 6   pickup_latitude     22875 non-null  float64
 7   dropoff_longitude   22875 non-null  float64
 8   dropoff_latitude    22875 non-null  float64
 9   store_and_fwd_flag  22875 non-null  object 
 10  trip_duration       22875 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.9+ MB
None


In [3]:
def preprocess_taxi_data(filepath, fill_missing=True):
    """
    Reads and cleans the NYC Taxi dataset.

    Args:
        filepath (str): Path to the CSV file.
        fill_missing (bool): Whether to fill missing values instead of dropping them.

    Returns:
        pd.DataFrame: Cleaned dataset.
    """
    df = pd.read_csv(filepath)

    print("\n Missing Values Before Cleaning:")
    print(df.isnull().sum())

    if fill_missing:
        print("\n Filling missing values...")
        for col in df.columns:
            if df[col].dtype == 'object':
                mode_val = df[col].mode()[0]
                df[col] = df[col].fillna(mode_val)
                print(f"Filled '{col}' with mode: {mode_val}")
            elif pd.api.types.is_numeric_dtype(df[col]):
                mean_val = df[col].mean()
                df[col] = df[col].fillna(mean_val)
                print(f"➕ Filled '{col}' with mean: {mean_val:.2f}")
            elif pd.api.types.is_datetime64_any_dtype(df[col]):
                df[col] = df[col].fillna(method='ffill')
                print(f"Filled '{col}' using forward fill")
    else:
        before = df.shape[0]
        df = df.dropna()
        after = df.shape[0]
        print(f"Dropped {before - after} rows with missing values")

    outliers = df[(df['trip_duration'] <= 60) | (df['trip_duration'] >= 10000)]
    print(f"\n Found {outliers.shape[0]} outlier rows based on trip_duration")

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
    print("Datetime columns converted")
    print(df[['pickup_datetime', 'dropoff_datetime']].head())

    before_outlier_removal = df.shape[0]
    df = df[(df['trip_duration'] > 60) & (df['trip_duration'] < 10000)]
    after_outlier_removal = df.shape[0]
    print(f"Removed {before_outlier_removal - after_outlier_removal} outliers from trip_duration")

    if 'id' in df.columns:
        df = df.drop('id', axis=1)

    print("\n Cleaned Data Info:")
    print(df.info())
    print("\n Descriptive Stats:")
    print(df.describe())

    return df


In [4]:
df_cleaned = preprocess_taxi_data("/content/train.csv", fill_missing=False)



🔍 Missing Values Before Cleaning:
id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      1
passenger_count       1
pickup_longitude      1
pickup_latitude       1
dropoff_longitude     1
dropoff_latitude      1
store_and_fwd_flag    1
trip_duration         1
dtype: int64
🧼 Dropped 1 rows with missing values

⚠️ Found 59 outlier rows based on trip_duration
⏱️ Datetime columns converted
      pickup_datetime    dropoff_datetime
0 2016-03-14 17:24:55 2016-03-14 17:32:30
1 2016-06-12 00:43:35 2016-06-12 00:54:38
2 2016-01-19 11:35:24 2016-01-19 12:10:48
3 2016-04-06 19:32:31 2016-04-06 19:39:40
4 2016-03-26 13:30:55 2016-03-26 13:38:10
✅ Removed 59 outliers from trip_duration

📊 Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 7565 entries, 0 to 7623
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   vendor_id           7565 non-null   in