In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [5]:
data = pd.read_csv('/content/2017_Yellow_Taxi_Trip_Data.csv')

In [6]:
data_info = data.info()
data_description = data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22699 entries, 0 to 22698
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             22699 non-null  int64  
 1   VendorID               22699 non-null  int64  
 2   tpep_pickup_datetime   22699 non-null  object 
 3   tpep_dropoff_datetime  22699 non-null  object 
 4   passenger_count        22699 non-null  int64  
 5   trip_distance          22699 non-null  float64
 6   RatecodeID             22699 non-null  int64  
 7   store_and_fwd_flag     22699 non-null  object 
 8   PULocationID           22699 non-null  int64  
 9   DOLocationID           22699 non-null  int64  
 10  payment_type           22699 non-null  int64  
 11  fare_amount            22699 non-null  float64
 12  extra                  22699 non-null  float64
 13  mta_tax                22699 non-null  float64
 14  tip_amount             22699 non-null  float64
 15  to

In [15]:
data_summary = {
    "Null Values": data.isnull().sum().to_dict(),
    "Numeric Variables": data.select_dtypes(include=['number']).columns.tolist(),
    "Categorical Variables": data.select_dtypes(include=['object']).columns.tolist(),
}

In [8]:
# trip_distance
trip_distance_sorted = data.sort_values(by='trip_distance', ascending=False)

# total_amount
total_amount_sorted = data.sort_values(by='total_amount', ascending=False)

In [9]:
# Cleaning
data_cleaned = data[(data['fare_amount'] > 0) & (data['total_amount'] > 0) & (data['trip_distance'] > 0)]
data_cleaned = data_cleaned[(data_cleaned['fare_amount'] < 500) & (data_cleaned['total_amount'] < 500) & (data_cleaned['trip_distance'] < 50)]

# features extraction
features = ['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID', 'payment_type']
target = 'fare_amount'

X = data_cleaned[features]
y = data_cleaned[target]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [13]:
# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
accuracy = model.score(X_test, y_test) * 100

In [14]:
answers = {
    "EDA_info": data_info,
    "EDA_description": data_description.to_dict(),
    "Summary": data_summary,
    "Trip_distance_sorted": trip_distance_sorted[['trip_distance', 'fare_amount']].head().to_dict('records'),
    "Total_amount_sorted": total_amount_sorted[['total_amount', 'fare_amount']].head().to_dict('records'),
    "Model Error": mae,
    "Model R-squared": r2,
    "Model Accuracy Percentage": accuracy
}

answers

{'EDA_info': None,
 'EDA_description': {'Unnamed: 0': {'count': 22699.0,
   'mean': 56758486.17128508,
   'std': 32744929.49214842,
   'min': 12127.0,
   '25%': 28520556.0,
   '50%': 56731504.0,
   '75%': 85374524.0,
   'max': 113486300.0},
  'VendorID': {'count': 22699.0,
   'mean': 1.5562359575311688,
   'std': 0.49683839619950737,
   'min': 1.0,
   '25%': 1.0,
   '50%': 2.0,
   '75%': 2.0,
   'max': 2.0},
  'passenger_count': {'count': 22699.0,
   'mean': 1.6423190448918454,
   'std': 1.2852311189940473,
   'min': 0.0,
   '25%': 1.0,
   '50%': 1.0,
   '75%': 2.0,
   'max': 6.0},
  'trip_distance': {'count': 22699.0,
   'mean': 2.9133129212740645,
   'std': 3.6531711828338906,
   'min': 0.0,
   '25%': 0.99,
   '50%': 1.61,
   '75%': 3.06,
   'max': 33.96},
  'RatecodeID': {'count': 22699.0,
   'mean': 1.0433939821137495,
   'std': 0.7083908849941992,
   'min': 1.0,
   '25%': 1.0,
   '50%': 1.0,
   '75%': 1.0,
   'max': 99.0},
  'PULocationID': {'count': 22699.0,
   'mean': 162.412352