In [None]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [4]:
# Load seaborn's bundled "taxis" example dataset into a DataFrame.
df = sns.load_dataset("taxis")

In [5]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
851,2019-03-13 11:31:06,2019-03-13 12:07:56,1,3.29,23.0,2.0,0.0,28.3,yellow,credit card,Upper West Side North,Midtown East,Manhattan,Manhattan
481,2019-03-31 09:52:21,2019-03-31 09:57:46,6,1.19,6.0,1.86,0.0,11.16,yellow,credit card,Penn Station/Madison Sq West,Union Sq,Manhattan,Manhattan
1536,2019-03-04 18:43:24,2019-03-04 19:00:08,5,3.44,14.0,2.5,0.0,20.8,yellow,credit card,Lincoln Square East,Lincoln Square East,Manhattan,Manhattan
1481,2019-03-25 14:01:24,2019-03-25 14:18:26,0,2.5,13.0,2.0,0.0,18.3,yellow,credit card,Greenwich Village North,Midtown Center,Manhattan,Manhattan
3557,2019-03-24 02:35:51,2019-03-24 02:42:12,1,2.4,8.5,2.46,0.0,14.76,yellow,credit card,Lenox Hill West,Kips Bay,Manhattan,Manhattan


In [6]:
# Missing-value count per column.
df.isna().sum()

pickup              0
dropoff             0
passengers          0
distance            0
fare                0
tip                 0
tolls               0
total               0
color               0
payment            44
pickup_zone        26
dropoff_zone       45
pickup_borough     26
dropoff_borough    45
dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(0)

In [9]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [12]:
# Look at the first dropoff value to see how the column is stored.
df['dropoff'].iloc[0]

Timestamp('2019-03-23 20:27:24')

In [17]:
# Drop rows with a missing value in any of the listed columns.
# Reassigning the result (rather than inplace=True) keeps the step chainable
# and avoids mutating the frame that earlier cells displayed; the surviving
# rows and their original index labels are the same either way.
df = df.dropna(
    subset=['dropoff_borough', 'pickup_borough', 'dropoff', 'payment']
)

In [18]:
# Re-check missing values per column after the drop.
df.isna().sum()

pickup             0
dropoff            0
passengers         0
distance           0
fare               0
tip                0
tolls              0
total              0
color              0
payment            0
pickup_zone        0
dropoff_zone       0
pickup_borough     0
dropoff_borough    0
dtype: int64

In [28]:
# Feature frame: five numeric columns plus the two categorical ones.
x = df.loc[:, ['passengers', 'distance', 'fare', 'tip', 'tolls', 'payment', 'color']]
# Regression target: the `total` column.
y = df.loc[:, 'total']

In [57]:
# Inspect a single feature row, selected by position.
x.iloc[3]

passengers              1
distance             1.52
fare                  8.0
tip                   1.0
tolls                 0.0
payment       credit card
color              yellow
Name: 9, dtype: object

In [29]:
# (rows, columns) of the feature frame.
x.shape

(6336, 7)

In [30]:
# Length of the target; it should equal the number of rows in x.
y.shape

(6336,)

In [31]:
# First few target values.
y.head()

5     12.96
6     18.80
8     19.30
9     13.30
10    17.80
Name: total, dtype: float64

In [23]:
# First few rows of the working frame.
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
5,2019-03-11 10:37:23,2019-03-11 10:47:31,1,0.49,7.5,2.16,0.0,12.96,yellow,credit card,Times Sq/Theatre District,Midtown East,Manhattan,Manhattan
6,2019-03-26 21:07:31,2019-03-26 21:17:29,1,3.65,13.0,2.0,0.0,18.8,yellow,credit card,Battery Park City,Two Bridges/Seward Park,Manhattan,Manhattan
8,2019-03-23 11:48:50,2019-03-23 12:06:14,1,3.63,15.0,1.0,0.0,19.3,yellow,credit card,East Harlem South,Midtown Center,Manhattan,Manhattan
9,2019-03-08 16:18:37,2019-03-08 16:26:57,1,1.52,8.0,1.0,0.0,13.3,yellow,credit card,Lincoln Square East,Central Park,Manhattan,Manhattan
10,2019-03-16 10:02:25,2019-03-16 10:22:29,1,3.9,17.0,0.0,0.0,17.8,yellow,cash,LaGuardia Airport,Astoria,Queens,Queens


In [36]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [39]:
# Column groups, each routed to its own preprocessing branch.
numeric = [
    'passengers',
    'distance',
    'fare',
    'tip',
    'tolls',
]
category = [
    'color',
    'payment',
]

In [40]:
# Display the numeric column list as a quick check.
numeric

['passengers', 'distance', 'fare', 'tip', 'tolls']

In [41]:
# Numeric branch: fill any gaps with the column median.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' stops unseen categories from raising at
# predict time. (The variable name is spelled as later cells reference it.)
categorical_transfomer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [42]:
# Apply each branch to its own column group and concatenate the results.
preprocessing = ColumnTransformer(
    transformers=[
        ('numeric', numerical_transformer, numeric),
        ('categorical', categorical_transfomer, category),
    ]
)

In [44]:
# End-to-end model: preprocessing followed by ordinary least-squares regression.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('regression', LinearRegression()),
    ]
)

In [45]:
# Fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X_train,Y_train)

In [46]:
# Predictions for the held-out rows; compared against Y_test below.
predict = pipeline.predict(X_test)

In [49]:
# Score the hold-out predictions against the true values.
# (mean_absolute_error / mean_squared_error are imported in the top import cell.)
mae = mean_absolute_error(y_true=Y_test, y_pred=predict)
# No `squared` argument is passed, so this is the plain MSE;
# take its square root if RMSE is wanted.
mse = mean_squared_error(y_true=Y_test, y_pred=predict)

print("MAE:", mae)
print("MSE:", mse)



MAE: 0.5194157305876717
MSE: 0.6413538318734953


In [50]:
import pickle

In [52]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
with open('Pipeline_building.pkl', 'wb') as pkl_out:
    pickle.dump(pipeline, pkl_out)

In [56]:
# Reload the saved pipeline as `t`.
# NOTE: pickle.load can execute arbitrary code, so only open files you
# created yourself or otherwise trust.
with open('Pipeline_building.pkl', 'rb') as pkl_in:
    t = pickle.load(pkl_in)

In [65]:
# Score one hand-written trip with the reloaded pipeline.
# The pipeline selects columns by name, so the single-row frame must carry
# the same column names used for training (pandas is imported at the top).
columns = ["passengers", "distance", "fare", "tip", "tolls", "payment", "color"]
values = [3, 1.52, 8.0, 1.0, 0.0, "cash", "yellow"]
values_df = pd.DataFrame([values], columns=columns)

t.predict(values_df)

array([12.52439976])