In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the raw dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv("final_internship_data.csv")

In [3]:
# Summarise column names, dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   User ID            500000 non-null  object 
 1   User Name          500000 non-null  object 
 2   Driver Name        500000 non-null  object 
 3   Car Condition      500000 non-null  object 
 4   Weather            500000 non-null  object 
 5   Traffic Condition  500000 non-null  object 
 6   key                500000 non-null  object 
 7   fare_amount        500000 non-null  float64
 8   pickup_datetime    500000 non-null  object 
 9   pickup_longitude   500000 non-null  float64
 10  pickup_latitude    500000 non-null  float64
 11  dropoff_longitude  499995 non-null  float64
 12  dropoff_latitude   499995 non-null  float64
 13  passenger_count    500000 non-null  int64  
 14  hour               500000 non-null  int64  
 15  day                500000 non-null  int64  
 16  mo

In [4]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [5]:
# Discard every row that has at least one missing value, then show the
# remaining (rows, columns) as the cell output.
df = df.dropna()
df.shape

(499995, 26)

In [6]:
# The IQR rule is only meaningful for numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Apply the 1.5 * IQR fence one column at a time. `df` is narrowed on every
# pass, so the quartiles of later columns are computed from the rows that
# survived the earlier columns.
for col in numeric_cols:
    q_low, q_high = df[col].quantile([0.25, 0.75])
    spread = q_high - q_low

    # between() is inclusive on both ends, i.e. lower <= value <= upper
    inside_fence = df[col].between(q_low - 1.5 * spread, q_high + 1.5 * spread)
    df = df[inside_fence]

# Give the surviving rows a fresh 0..n-1 index
df = df.reset_index(drop=True)



In [7]:

# Encode the car-condition labels as integers (Bad=0 ... Excellent=3).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on `df['Car Condition']` operates on an
# intermediate object and stops working in pandas 3.0.
car_condition_codes = {
    'Bad': 0,
    'Good': 1,
    'Very Good': 2,
    'Excellent': 3
}
df['Car Condition'] = df['Car Condition'].replace(car_condition_codes)

# Sanity check: only the integer codes should remain
print(df['Car Condition'].unique())


[0 2 3 1]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Car Condition'].replace({
  df['Car Condition'].replace({


In [8]:
# Encode the traffic-condition labels as integers (Flow=0, Dense=1, Congested=2).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on `df['Traffic Condition']` operates on an
# intermediate object and stops working in pandas 3.0.
traffic_condition_codes = {
    'Flow Traffic': 0,
    'Dense Traffic': 1,
    'Congested Traffic': 2
}
df['Traffic Condition'] = df['Traffic Condition'].replace(traffic_condition_codes)

# Sanity check: only the integer codes should remain
print(df['Traffic Condition'].unique())


[2 0 1]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Traffic Condition'].replace({
  df['Traffic Condition'].replace({


In [9]:
# Turn the numeric weekday (0-6) and month (1-12) codes into readable names so
# the one-hot columns created from them are self-describing.
# Both columns use the assign-back form: replace(..., inplace=True) on a
# selected column operates on an intermediate object and stops working in
# pandas 3.0.
df['weekday'] = df['weekday'].replace({
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
})
df['month'] = df['month'].replace({
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
})



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['weekday'].replace({


In [10]:
# One-hot encode the nominal columns; drop_first=True omits the first level of
# each column so it serves as the baseline category.
categorical_cols = ['Weather', 'month', 'weekday']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [11]:
# Identifier, timestamp and raw-coordinate columns are removed from the frame.
# Note: re-running this cell raises KeyError once the columns are gone.
drop_cols = [
    'User ID', 'User Name', 'Driver Name', 'key',
    'pickup_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
df = df.drop(columns=drop_cols)


In [101]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,Car Condition,Traffic Condition,fare_amount,passenger_count,hour,day,year,jfk_dist,ewr_dist,lga_dist,...,month_May,month_November,month_October,month_September,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,0,Congested Traffic,5.7,2,0,18,2011,43.597686,33.712082,19.865289,...,False,False,False,False,False,False,False,True,False,False
1,2,Flow Traffic,7.7,1,4,21,2012,42.642965,32.556289,21.063132,...,False,False,False,False,False,True,False,False,False,False
2,0,Congested Traffic,5.3,1,7,9,2010,43.329953,39.406828,15.219339,...,False,False,False,False,False,False,False,False,True,False
3,3,Congested Traffic,12.1,1,9,6,2011,42.335622,32.82493,20.648176,...,False,False,False,False,False,False,False,True,False,False
4,3,Flow Traffic,7.5,1,20,20,2012,42.563234,35.482608,18.113693,...,False,True,False,False,False,False,False,False,True,False


In [12]:
# 80/20 split performed separately inside each year, so both partitions draw
# the same fraction of rows from every year.
train_list, test_list = [], []

for _, year_rows in df.groupby('year'):
    sampled = year_rows.sample(frac=0.8, random_state=42)
    train_list.append(sampled)
    # Rows of this year that were not sampled become its test rows
    test_list.append(year_rows.drop(sampled.index))

# Stitch the per-year pieces back together
train_data, test_data = pd.concat(train_list), pd.concat(test_list)

# Report the partition sizes
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (273210, 35)
Testing data shape: (68301, 35)


Index(['hour', 'day', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist',
       'nyc_dist', 'distance', 'bearing', 'Weather_rainy', 'Weather_stormy',
       'Weather_sunny', 'Weather_windy', 'month_August', 'month_December',
       'month_February', 'month_January', 'month_July', 'month_June',
       'month_March', 'month_May', 'month_November', 'month_October',
       'month_September', 'weekday_Monday', 'weekday_Saturday',
       'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday',
       'weekday_Wednesday'],
      dtype='object')

In [67]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     -------------------------------------- 60.8/60.8 kB 802.6 kB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
    --------------------------------------- 0.2/8.9 MB 5.9 MB/s eta 0:00:02
   - -------------------------------------- 0.3/8.9 MB 4.2 MB/s eta 0:00:03
   -- ------------------------------------- 0.6


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\egystar\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [13]:


# Separate the target from the features for both partitions
target_col = "fare_amount"
drop_cols = [target_col]  # Columns to exclude from features

X_train = train_data.drop(columns=drop_cols)
X_test = test_data.drop(columns=drop_cols)
y_train = train_data[target_col]
y_test = test_data[target_col]

# Baseline: plain linear regression on all remaining columns.
# fit() returns the estimator, which is displayed as the cell output.
model = LinearRegression()
model.fit(X_train, y_train)



0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
# Predict fares for the held-out test rows using the model fitted above.
y_pred = model.predict(X_test)


In [15]:
# Score the current predictions against the held-out targets
linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", linear_mse)
print("R² Score:", linear_r2)


Mean Squared Error: 4.255157505066247
R² Score: 0.6433402733959406


In [18]:
# Polynomial regression: expand the features to the chosen degree, then fit a
# linear regression on the expanded matrix.
degree = 2  # higher degrees (3, 4, ...) can be tried here

poly_steps = [
    ('poly', PolynomialFeatures(degree=degree)),
    ('linear', LinearRegression()),
]
model = Pipeline(steps=poly_steps)

# fit() returns the pipeline itself, so prediction can be chained onto it
y_pred = model.fit(X_train, y_train).predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)



Mean Squared Error: 3.5735909521003615
R² Score: 0.7004679684704145


In [19]:
# Random forest with 100 trees; random_state is fixed so reruns build the same forest.
# RandomForestRegressor is imported in the notebook's top import cell.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MSE: 3.0343017807283936
R² Score: 0.7456702267165665
