In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the 'taxis' example dataset through seaborn's dataset loader.
data = sns.load_dataset(name = 'taxis')

In [None]:
# Preview the first five rows of the raw frame.
data.head(n = 5)

In [None]:
# Print the per-column dtypes and non-null counts of the frame.
data.info()

In [None]:
# Summary statistics (count, mean, spread, quantiles) using describe()'s
# default column selection.
data.describe()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis = 0)

In [None]:
# Re-check the per-column missing-value counts after the rows with gaps were dropped.
data.isna().sum()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Show the rows that repeat an earlier row in full (first occurrences are not flagged).
data.loc[data.duplicated()]

In [None]:
# Print every column name on its own line.
for column_name in data.columns:
    print(column_name)

In [None]:
# Display the column index of the frame.
data.columns

In [None]:
# Partition the columns by dtype.
#
# Comparing against 'object' alone misclassifies `category` columns and pandas'
# dedicated string dtype as numeric, so the split is made on what a column *is*:
# anything that is not numeric or temporal counts as categorical.
# Datetime/timedelta columns deliberately stay in `numerical_features` (as they
# did before) because the pickup/dropoff timestamps are needed further down to
# derive the trip duration.
def _is_numeric_or_temporal(dtype):
    """Return True for numeric (incl. bool), datetime and timedelta dtypes."""
    return (
        pd.api.types.is_numeric_dtype(dtype)
        or pd.api.types.is_datetime64_any_dtype(dtype)
        or pd.api.types.is_timedelta64_dtype(dtype)
    )

categorical_features = [col for col in data.columns if not _is_numeric_or_temporal(data[col].dtype)]
numerical_features = [col for col in data.columns if _is_numeric_or_temporal(data[col].dtype)]
categorical_features,numerical_features

In [None]:
# Work on an independent copy of the non-categorical columns: later cells add
# and drop columns on `df` in place, which on a plain selection of `data` is a
# chained-assignment hazard (SettingWithCopyWarning, currently silenced).
df = data[numerical_features].copy()
df.head()

In [None]:
# Pairwise Pearson correlation between the columns of df.
df.corr(method = 'pearson')

In [None]:
# Annotated heatmap of the pairwise correlations on a 10x10 inch canvas.
heatmap_fig, heatmap_ax = plt.subplots(figsize = (10,10))
sns.heatmap(df.corr(),annot = True,ax = heatmap_ax)

In [None]:
# Trip duration = dropoff timestamp minus pickup timestamp; the two raw
# timestamp columns are removed once the difference has been taken.
df['duration'] = df['dropoff'].sub(df['pickup'])
df.drop(columns = ['pickup','dropoff'],inplace = True)

In [None]:

# Store the duration as its text representation, then preview the frame.
df = df.assign(duration = df['duration'].astype('str'))
df.head()

In [None]:
# Whole minutes of trip duration.
#
# Slicing the text form ("0 days 00:06:15") for hours and minutes silently drops
# the days component and mis-reads negative durations ("-1 days +23:50:00" would
# become 1430). Parsing the value back into a timedelta and flooring the total
# seconds to minutes gives the same result for ordinary sub-day trips and the
# correct one otherwise. pd.to_timedelta accepts both the text form and an
# actual timedelta column.
df['duration_min'] = (pd.to_timedelta(df['duration']).dt.total_seconds() // 60).astype('int')

In [None]:
# The text duration is no longer needed once duration_min exists.
df.drop(columns = 'duration',inplace = True)
df.head()

In [None]:
# Predictors and regression target (the fare).
feature_columns = ['passengers','distance','tolls','duration_min']
x = df.loc[:, feature_columns]
y = df.loc[:, 'fare']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Row counts of the training predictors and training target.
x_train.shape[0],y_train.shape[0]

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the standardised training features; fit() returns
# the estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(x_train_scaled,y_train)
regressor.intercept_

In [None]:
# Fitted coefficients, one per column of the scaled training matrix.
regressor.coef_

In [None]:
# Predictions of the linear model on the scaled test features.
y_test_pred = regressor.predict(x_test_scaled)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
# Test-set error and goodness of fit for the plain linear model, shown as (MSE, R2).
linear_mse = mean_squared_error(y_test,y_test_pred)
linear_r2 = r2_score(y_test,y_test_pred)
linear_mse,linear_r2

In [None]:
from sklearn.linear_model import Ridge,Lasso,ElasticNet
# Regularised linear models, all with their default settings.
ridge, lasso, elastic = Ridge(), Lasso(), ElasticNet()

In [None]:
# Fit the three regularised models on the scaled training data ...
for estimator in (ridge, lasso, elastic):
    estimator.fit(x_train_scaled,y_train)

# ... and collect their predictions for the scaled test data.
y_test_pred_ridge, y_test_pred_lasso, y_test_pred_elastic = (
    estimator.predict(x_test_scaled) for estimator in (ridge, lasso, elastic)
)

In [None]:
# Report MSE and R2 on the test set for each regularised model.
for label, predicted in (
    ('By Ridge', y_test_pred_ridge),
    ('By Lasso', y_test_pred_lasso),
    ('By ElasticNet', y_test_pred_elastic),
):
    print(label,mean_squared_error(y_test,predicted),r2_score(y_test,predicted))

In [None]:
# R2 scores collected by model_selection, in the order the models are evaluated.
lis = []


def model_selection(model):
    """Print the test-set MSE and R2 of a fitted model and record its R2.

    The model is evaluated against the notebook-level ``x_test_scaled`` /
    ``y_test`` and its R2 score is appended to the notebook-level list ``lis``.
    Returns None.
    """
    prediction = model.predict(x_test_scaled)
    mse = mean_squared_error(y_test,prediction)
    r2 = r2_score(y_test,prediction)
    print(f'Mean Squared Error by {model}',mse)
    print(f'R2 Score by {model}',r2)
    print('\n')
    lis.append(r2)


models = [regressor,ridge,lasso,elastic]
for candidate in models:
    model_selection(candidate)

# The position of the highest R2 identifies the best-scoring model.
arr = np.asarray(lis)
idx = np.argmax(arr)
print(f"Best Model is {models[idx]}")

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored for the decision tree: split criterion,
# split strategy and maximum depth (1 through 10).
parameters = {
    'criterion' : ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter' : ['best', 'random'],
    'max_depth' : [1,2,3,4,5,6,7,8,9,10],
}
# The estimator is seeded so the search is repeatable: tree construction is
# randomised (always for splitter='random', and for tie-breaking otherwise),
# so without a fixed random_state best_params_ can change from run to run.
clf = GridSearchCV(DecisionTreeRegressor(random_state = 42),param_grid = parameters,cv = 5)
clf.fit(x_train_scaled,y_train)
clf.best_params_

In [None]:
# Build the final tree from the grid-search winner instead of hand-copied
# values, so this cell cannot drift out of sync with the search result above.
# The seed makes the fitted tree (and therefore the scores) repeatable.
dt = DecisionTreeRegressor(**clf.best_params_,random_state = 42)
dt.fit(x_train_scaled,y_train)
y_test_pred_dt = dt.predict(x_test_scaled)
# Shown as (R2, MSE) on the held-out test set.
r2_score(y_test,y_test_pred_dt),mean_squared_error(y_test,y_test_pred_dt)

In [None]:
from sklearn import tree
# Draw the fitted tree on a 20x20 inch canvas, limited to the levels down to
# depth 2 so the nodes stay legible.
tree_fig, tree_ax = plt.subplots(figsize = (20,20))
tree.plot_tree(dt,filled = True,max_depth = 2,ax = tree_ax)