In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'taxi-fare-guru-total-amount-prediction-challenge:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F61246%2F6604167%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240612%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240612T185120Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D49bb2f31eca301512ecef94de36c9e4ec8c47bc35fcae8f0ef43ff3d9f5a694672bca928ab2d416b71ef70db4f2dc166ae3aa3b4821ed23b183e81c9073e2735b49c43e91c01847aab59c71704afb10bd218273b446450259f22994555e2dbf7c32749287f48d96c30cd4255f970ad0cb1b8999169ef59dcadc6cd270cd03312b9150bd1c17d59fe03c1b8432739faf33e70bb491987a0c0456bd3d037eeab0427caac30aa1e5d79e061f1cc49f337c301d6b6676f5e515facaf374e833d87b8a6952bf166f1f40761e0d951b59b0a180188654c11849c42146d35bdc0cd72ea848b8e33af8c56c4572aed93799db9a18809422ced14670f72c2577b9a902b0d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded download URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it into KAGGLE_INPUT_PATH/<directory>.
# A source that fails to download or unpack is reported and skipped so the
# remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is percent-encoded.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the
            # progress bar simply stays empty instead of raising TypeError.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # overwrite the module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#from sklearn.dummy import DummyClassifier


# Calling the necessary libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge,RidgeCV, SGDRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score, cross_validate,KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xg


import seaborn as sns
import matplotlib.pyplot as plt



# Load the competition training data (train.csv) from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/train.csv')


**EDA**

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df['payment_type'].unique()

In [None]:
missing_cat=df.loc[df['payment_type']== 'unknown']

In [None]:
missing_cat

In [None]:
df.isnull().sum()

In [None]:
df[df.isna().any(axis=1)]

**As can be seen, missing values are present in the columns ['passenger_count', 'RatecodeID', 'store_and_fwd_flag', 'congestion_surcharge', 'Airport_fee']. Of these, 'store_and_fwd_flag' is of object type because it is a categorical variable; the rest are of float64 type. However, RatecodeID is also a categorical variable, so its missing values must be imputed as for a categorical variable. Another object-type column is 'payment_type'; it uses 'unknown' as a placeholder for missing values, so we have to impute those as well. The target variable 'total_amount' has no missing values, so we do not drop any rows and instead impute the missing feature values.**

In [None]:
df['RatecodeID'].unique()

In [None]:
df

**Feature Engineering**

**To make the 'tpep_pickup_datetime' and 'tpep_dropoff_datetime' columns easier to work with, they are converted to datetime objects here. Suspecting that the day of the week, month and hour of pickup could play a role, we extract them as separate features.**

In [None]:
# Parse both timestamp columns; errors='raise' makes a malformed value fail loudly.
for datetime_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[datetime_col] = pd.to_datetime(df[datetime_col], format='%Y-%m-%d %H:%M:%S', errors='raise')

pickup_time = df['tpep_pickup_datetime']
trip_duration = df['tpep_dropoff_datetime'] - pickup_time

# Absolute trip duration in minutes, plus calendar parts of the pickup time
# (day_of_pickup is the day of the week).
df['time_difference_minutes'] = (trip_duration.dt.total_seconds() / 60).abs()
df['hour_of_pickup'] = pickup_time.dt.hour
df['day_of_pickup'] = pickup_time.dt.dayofweek
df['month_of_pickup'] = pickup_time.dt.month

# The raw timestamps are no longer needed once the features are extracted.
df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

Year is same for all date of pickups, so a new column year is not created.

In [None]:
df['day_of_pickup'].unique()

# 'total_amount' is our target variable so we separate target variable from features here

In [None]:
# Target column first, then every remaining column as the feature frame.
targetlabel = df['total_amount']
df_features = df.drop('total_amount', axis=1)


# Data preprocessing

In [None]:
# One imputer per group of columns; columns not listed are passed through unchanged.
imputation_steps = [
    # numeric count: median
    ('imputer1', SimpleImputer(strategy='median', missing_values=np.nan), ['passenger_count']),
    # categorical columns: most frequent value
    ('imputer2', SimpleImputer(strategy='most_frequent', missing_values=np.nan), ['RatecodeID', 'store_and_fwd_flag']),
    # continuous charges: mean
    ('imputer3', SimpleImputer(strategy='mean', missing_values=np.nan), ['congestion_surcharge', 'Airport_fee']),
    # payment_type marks missing entries with the string 'unknown'
    ('imputer4', SimpleImputer(strategy='most_frequent', missing_values='unknown'), ['payment_type']),
]

ct1 = ColumnTransformer(
    imputation_steps,
    remainder='passthrough',
    verbose_feature_names_out=False,
)
# Return a DataFrame (with column names) instead of a NumPy array.
ct1.set_output(transform='pandas')

df_imputed = ct1.fit_transform(df_features)
df_imputed

In [None]:
df_imputed['RatecodeID'].unique()

In [None]:
df_imputed.isna().sum()

In [None]:
df_imputed['payment_type'].unique()

**Correlation matrix for different Features**

In [None]:
# Pairwise correlations between the numeric columns of df (target included).
num_columns = df.select_dtypes(include=['number']).columns
correlation_matrix = df.loc[:, num_columns].corr()

# Annotated heatmap drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
df['PULocationID'].unique()

In [None]:
# Encode the categorical columns and standardise the numeric ones; any column
# not listed is passed through unchanged. The result is returned as a DataFrame.
ct2 = ColumnTransformer([
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted is encoded as all zeros at transform time instead of raising.
    ('ohe', OneHotEncoder(sparse_output= False, handle_unknown='ignore'),['store_and_fwd_flag','payment_type','RatecodeID','VendorID','month_of_pickup']),
    #('mms', MinMaxScaler(),['trip_distance','congestion_surcharge','Airport_fee','tip_amount','tolls_amount', 'extra','time_difference_minutes']
    ('stdscl', StandardScaler(),['PULocationID','DOLocationID','trip_distance','congestion_surcharge','Airport_fee','tip_amount','tolls_amount', 'extra','time_difference_minutes','hour_of_pickup','day_of_pickup'])
],
remainder = 'passthrough',
verbose_feature_names_out= False).set_output(transform='pandas')
df_pre = ct2.fit_transform(df_imputed)
df_pre

As there are 175000 rows in the train data, 10% of them should suffice for testing, while the large remainder is used to train the model.

In [None]:
# Hold out 10% of the preprocessed rows for evaluation; random_state fixes the shuffle.
# NOTE(review): ct1 and ct2 were fitted on all rows before this split, so the
# imputation and scaling statistics also include the held-out rows.
df_train_features, df_test_features, df_train_label, df_test_label= train_test_split(df_pre,targetlabel, test_size=0.1, random_state=42)

In [None]:



# Box plot of the engineered 'time_difference_minutes' feature.
sns.boxplot(x=df['time_difference_minutes'])

# The values are drawn along the x-axis (x=...), so the axis label is set
# with xlabel rather than ylabel.
plt.title('Box Plot of time difference in Minutes')
plt.xlabel('time diffrence in minutes')

# Render the figure.
plt.show()

In [None]:
df_train_features.shape


In [None]:
df_test_label.shape

# Model training
For better speed I have commented out the computationally exhaustive Gridsearch and other low performing models. If required one can easily check the results after removing the ''' signs for multiline comments and # for single line comments

**Linear Regression(Baseline Model)**

In [None]:
lr= LinearRegression()
lr.fit(df_train_features,df_train_label)

In [None]:
lr.score(df_test_features,df_test_label)

From the baseline linear regression model without any hyperparameter tuning the score from the test data is coming to be 0.7787640516776282
The score on the train.csv is coming as 0.7534. Indicates moderate performance, although failed to meet the cutoff.

In [None]:
# Disabled SGDRegressor experiment, kept inside a string so it does not run.
# Remove the triple quotes to execute it; the estimator is bound to `sgd`,
# the name used by the fit call and by the scoring cell that follows.
'''sgd= SGDRegressor(
    random_state=42,
    max_iter=1000,
    warm_start=True,
    alpha=0.01,
    early_stopping=True,
    penalty='l1')
sgd.fit(df_train_features,df_train_label)'''

In [None]:
#sgd.score(df_test_features,df_test_label)

SGDRegressor performs poorly on this dataset, giving a score of -1.41 and failing to match even a dummy regressor that predicts the mean.


**Cross Validation on Linear Regression**

In [None]:
#lr_cv_scores = cross_val_score(lr,X=df_train_features,y=df_train_label, cv=5)

In [None]:
#lr_cv_scores

In [None]:
#lr_cv_scores.mean()

The scores from the individual folds are quite close, and their mean is almost the same as the score we got earlier from the linear regression model. It can be concluded that the model generalizes well on this dataset and is quite robust.

**Ridge and Lasso with hyper parameter tuning**

In [None]:
'''rdg_pipel = Pipeline([
    ("poly", PolynomialFeatures(interaction_only=True)),
    ("Lasso", Ridge())
    ])

param_grid = {'poly__degree': (1,2),
              'Lasso__alpha': (0.1,1,10,100,1000,10000)}

grid_search = GridSearchCV(estimator= rdg_pipel,
                                param_grid = param_grid,
                                scoring = "r2",
                                return_train_score=False,
                                cv = 5,
                                n_jobs = -1,
                                verbose= 1)
grid_search.fit(df_train_features,df_train_label)

# Get the best hyperparameters
best_params = grid_search.best_params_'''


In [None]:
#best_params

In [None]:
'''rdg_pipeline_hpt = Pipeline([
    ("poly", PolynomialFeatures(degree=2,interaction_only=True)),
    ("Lasso", Ridge( alpha=1))
    ])
rdg_pipeline_hpt.fit(df_train_features,df_train_label)
rdg_pipeline_hpt.score(df_test_features,df_test_label)
'''

In [None]:
'''grid_pipeline = Pipeline([
    ("poly", PolynomialFeatures(interaction_only=True)),
    ("Lasso", Lasso(warm_start=True))
    ])

param_grid = {'poly__degree': (1,2),
              'Lasso__alpha': (0.1,1,10,100,1000,10000)}
lasso_gridsearch = GridSearchCV(estimator= grid_pipeline,
                                param_grid = param_grid,
                                scoring = "r2",
                                return_train_score=False,
                                cv = 5,
                                n_jobs = -1,
                                verbose= 1 )
lasso_gridsearch.fit(df_train_features,df_train_label)'''

In [None]:
#lasso_gridsearch.best_params_

In [None]:
'''lasso_pipeline_hpt = Pipeline([
    ("poly", PolynomialFeatures(degree=2,interaction_only=True)),
    ("Lasso", Lasso(warm_start=True, alpha=10))
    ])
lasso_pipeline_hpt.fit(df_train_features,df_train_label)
lasso_pipeline_hpt.score(df_test_features,df_test_label)'''


So here Gridsearch CV is performed both with Lasso and Ridge estimators with hyperparameter tuning. The best result on test score comes out to be with Ridge(score on train 0.8083313803744253) with the hyperparameters of alpha = 1 and polynomial degree 2. For Lasso the best fit polynomial degree is 2 and alpha =10, score on training set 0.5543734285945301.

In [None]:
'''lr_pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2,interaction_only=True)),
    ("Linreg", LinearRegression())
    ])
lr_pipeline.fit(df_train_features,df_train_label)
lr_pipeline.score(df_test_features,df_test_label)'''

The best polynomial degree for both the Lasso and Ridge estimator came out to be 2. This made me investigate the baseline linear regression model with polynomial fit of degree 2. With this fit the score on test data from my train set came out to be 0.9251666749814946 while the score from kaggle competition submission with this fit is -0.96611 signifying that model is overfit on the train data.

# K-Nearest Neighbors Regressor

In [None]:
# Disabled KNN grid search, kept inside a string so it does not run.
# Remove the triple quotes to execute it.
'''from sklearn.metrics import make_scorer
knn_regressor = KNeighborsRegressor()
param_grid = {'n_neighbors': [2,3,4, 5, 7, 9, 11]}

knn_gridsearch = GridSearchCV(knn_regressor, param_grid, cv=5, scoring=make_scorer(r2_score))

knn_gridsearch.fit(df_train_features,df_train_label)

# Get the best hyperparameters and print their values
best_params = knn_gridsearch.best_params_
print(best_params)

# Get the best model
best_knn_model = knn_gridsearch.best_estimator_

# Make predictions on the test set using the best model
y_pred = best_knn_model.predict(df_test_features)

# Evaluate the best model
mse = mean_squared_error(df_test_label, y_pred)
r2 = r2_score(df_test_label, y_pred)
'''

The best parameter comes out to be n_neighbors = 3. The score with this parameter on the Kaggle submission test set is 0.77772: better than the baseline linear regression model, but still not enough to make the cutoff. Also, at first my KNN model was giving negative scores; I found out that MinMaxScaler does not work well with the KNN regressor.

# Support Vector Regressor

In [None]:
# Disabled SVR grid search, kept inside a string so it does not run.
# Remove the triple quotes to execute it.
'''param_grid ={
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1],
    'epsilon': [0.1, 0.01]
}
svreg_grid= GridSearchCV(SVR(),param_grid,cv=5)
svreg_grid.fit(df_train_features, df_train_label)'''

In [None]:
'''svreg = SVR()
svreg.fit(df_train_features, df_train_label)
svreg.score(df_test_features,df_test_label)'''

Support vector regression, from my study, seems to be quite computationally expensive. The grid search in this notebook, run without any accelerator, did not yield results for a long time, so I decided to train with the default hyperparameters; if the score on my test set and on Kaggle submission crossed the threshold, I would tune the hyperparameters. The test score comes out to be 0.762094537474704, close to the score of my baseline linear regression model. The Kaggle submission score is 0.7308, so I will not be investigating it any further.

# Boosting algorithms
**Gradient boosting regressor**

In [None]:
'''param_grid = {
    'n_estimators' : [50,100,200],
    'learning_rate' : [0.01,0.1,0.2],
    'max_depth' : [3,5,7]
}
gradboost_grid = GridSearchCV(GradientBoostingRegressor(),param_grid,cv=3,n_jobs=-1,scoring='r2')
gradboost_grid.fit(df_train_features, df_train_label)
bestparams=gradboost_grid.best_params_'''

In [None]:
#bestparams

For the gradient boosting regressor the best hyperparameters from the above code came out to be {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

In [None]:
# Gradient boosting with the hyperparameters reported by the grid search.
# random_state pins the estimator's random number generator so a re-run gives
# the same model (same seed as the train/test split).
gradboost_reg=GradientBoostingRegressor(learning_rate=0.1,max_depth= 7, n_estimators= 200, random_state=42)
gradboost_reg.fit(df_train_features,df_train_label)

In [None]:
gradboost_reg.score(df_test_features,df_test_label)

For these hyperparameters the score from my test data came out to be 0.9675702061937632. So I submitted the model on kaggle competition where it scored 0.95025. As this is giving me a good score I am going to try the XGBoostRegressor from XGBoost library

**XGBoost regressor**

Using GridSearchCV to tune the hyperparameters for XGBoost was taking a lot of computational resources. So I tested a vanilla XGBRegressor, and an XGBRegressor with the hyperparameters I had obtained from the Gradient Boosting grid search CV.

**XGBRegressor without any hyperparameter tuning**

In [None]:
# XGBoost with default hyperparameters.
xgboostreg = XGBRegressor()
xgboostreg.fit(df_train_features,df_train_label)
# Score on the held-out split, like every other model in this notebook;
# scoring on the training rows would only measure how well they were memorised.
xgboostreg.score(df_test_features,df_test_label)

The score on my sample test set using the above code is 0.9783256227958561. The kaggle submission score is 0.94811

**XGB regressor using the hyper parameters obtained from gradient boosting regressor grid search.**

In [None]:
# XGBoost with the hyperparameters found by the gradient boosting grid search.
xgboostreg_gb = XGBRegressor(n_estimators=200, max_depth=7, eta=0.1)
xgboostreg_gb.fit(df_train_features,df_train_label)
# Score on the held-out split, like every other model in this notebook;
# scoring on the training rows would only measure how well they were memorised.
xgboostreg_gb.score(df_test_features,df_test_label)

Now the score on my test set is 0.9808093846552104. The score on kaggle submission is 0.95173. A slight improvement from my Gradient boosting regressor score on kaggle submission, but a significant improvement than the vanilla xgbregressor without any hyperparameter tuning.

**XGB regressor with gridsearch**

In [None]:
'''param_grid = {
    'n_estimators' : [50,100,200],
    'eta' : [0.01,0.1,0.2],
    'max_depth' : [3,5,7],
    'min_child_weight': [1,2,3]
}
xgboost_grid = GridSearchCV(XGBRegressor(),param_grid,cv=3,n_jobs=-1,scoring='r2')
xgboost_grid.fit(df_train_features, df_train_label)
bestparams=xgboost_grid.best_params_'''

In [None]:
#bestparams

Using the grid search above, I found the best parameters to be (n_estimators=100, max_depth=7, min_child_weight=2, eta=0.1).

In [None]:
# XGBoost with the hyperparameters found by its own grid search.
xgboostreg_hpt = XGBRegressor(n_estimators=100, max_depth=7,min_child_weight = 2,eta=0.1)
xgboostreg_hpt.fit(df_train_features,df_train_label)
# Score on the held-out split, like every other model in this notebook;
# scoring on the training rows would only measure how well they were memorised.
xgboostreg_hpt.score(df_test_features,df_test_label)

Using the hyperparameters obtained from grid search the score on my test set was found to be 0.9737979928429287, while score on kaggle submission was 0.95033. Unexpectedly the score did not improve from the score found from the parameters from gradient boosting.

**DecisionTree Regressor**

In [None]:
'''param_grid = {

    'max_features': [1.0, None],
    'min_samples_split': [2, 5, 10,20,25],
    'min_samples_leaf': [1, 2,3,4,5],
    'max_depth': [None, 5,10, 15,20,25],

}
decisiontree_grid= GridSearchCV(DecisionTreeRegressor(),param_grid,cv=3,n_jobs=-1,scoring='r2')
decisiontree_grid.fit(df_train_features, df_train_label)
bestparams=decisiontree_grid.best_params_'''

In [None]:
#bestparams

From the above code the hyperparameters came out to be {'max_depth': 15,
 'max_features': None,
 'min_samples_leaf': 2,
 'min_samples_split': 25}

In [None]:
# Decision tree with the hyperparameters reported by the grid search.
# random_state pins the estimator's random number generator so a re-run gives
# the same tree (same seed as the train/test split).
decisiontree=DecisionTreeRegressor(max_depth= 15, max_features= None, min_samples_leaf= 2,min_samples_split=25, random_state=42)
decisiontree.fit(df_train_features,df_train_label)

In [None]:
decisiontree.score(df_test_features,df_test_label)

The decision tree model gives a score of 0.9456630876919249 on the test set held out from the train dataset. The Kaggle submission score came out to be 0.92711, indicating that the model may be slightly overfit on the train data.

**RandomForest Regressor**

In [None]:
'''param_grid = {
    'n_estimators' : [50,100,200],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20],

}
randomforest_grid= GridSearchCV(RandomForestRegressor(),param_grid,cv=3,n_jobs=-1,scoring='r2')
randomforest_grid.fit(df_train_features, df_train_label)
bestparams=randomforest_grid.best_params_'''

In [None]:
#bestparams

Using the above code to tune the hyperparameters, the best hyperparameters for RandomForestRegressor are found to be {'max_depth': 20, 'max_features': 1.0, 'n_estimators': 200}

In [None]:
# Random forest with the hyperparameters reported by the grid search.
# random_state fixes the bootstrap sampling so a re-run reproduces the same
# forest (same seed as the train/test split).
randomforest=RandomForestRegressor(max_depth= 20, max_features= 1.0, n_estimators= 200, random_state=42)
randomforest.fit(df_train_features,df_train_label)

In [None]:
randomforest.score(df_test_features,df_test_label)

For my test sample from the train dataset, the score comes out to be 0.9644980844207918, so I will be submitting it to the competition to check it against the test.csv dataset. The Kaggle submission score for this model is 0.94556.

****

# Loading the test.csv dataset and performing the transformations

In [None]:
X_test=pd.read_csv("/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv")

In [None]:
# Number of test rows per RatecodeID value; size() is the built-in group
# count and avoids routing every group through a Python-level apply().
X_test.groupby('RatecodeID').size()


In [None]:
# Recode RatecodeID 6 as 1 in the test set before it is passed to the fitted transformers.
# NOTE(review): presumably code 6 was not seen when the one-hot encoder in ct2
# was fitted, which would make ct2.transform raise — TODO confirm against train.csv.
X_test['RatecodeID']=X_test['RatecodeID'].replace(6,1)

In [None]:
# Apply the same datetime feature engineering that was applied to the train data.
for datetime_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    X_test[datetime_col] = pd.to_datetime(X_test[datetime_col], format='%Y-%m-%d %H:%M:%S', errors='raise')

test_pickup_time = X_test['tpep_pickup_datetime']
test_trip_duration = X_test['tpep_dropoff_datetime'] - test_pickup_time

X_test['time_difference_minutes'] = (test_trip_duration.dt.total_seconds() / 60).abs()
X_test['hour_of_pickup'] = test_pickup_time.dt.hour
X_test['day_of_pickup'] = test_pickup_time.dt.dayofweek
X_test['month_of_pickup'] = test_pickup_time.dt.month

X_test = X_test.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

# Re-use the transformers fitted on the train data (transform only, no refit):
# imputation first, then encoding/scaling.
#X_test=X_test.drop(['RatecodeID_6.0'], axis =1)
X_test = ct2.transform(ct1.transform(X_test))

**Predicting on the kaggle test dataset**

In [None]:
y_pred = xgboostreg_gb.predict(X_test)

In [None]:

#X_test=pd.read_csv("/kaggle/input/taxi-fare-guru-total-amount-prediction-challenge/test.csv")
# Make predictions using the Dummy Regressor
#y_pred = dummy_regressor.predict(X_test)

In [None]:
# Build the submission table in one step: 1-based row IDs next to the
# predicted totals, then write it without the DataFrame index.
submission = pd.DataFrame({
    "ID": range(1, len(y_pred) + 1),
    "total_amount": y_pred,
})
submission.to_csv('submission.csv', index=False)

# Findings and results

From my analysis the top performing models were Gradient Boosting Regressor,
XGboost, Decision tree Regressor and Random Forest Regressor.
 Of these, Gradient Boosting gave me a good score with the hyperparameters {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}. The grid search for XGBoost hyperparameters was taking a lot of time, and as the core idea behind XGBoost and Gradient Boosting is the same, I tried the same hyperparameters with the XGBoost model and got my best score. Later I ran the grid search on XGBoost and found that the Kaggle submission score with the parameters obtained from that grid search was slightly lower than with the parameters taken from Gradient Boosting. So, as of now, the best performing model is the one named 'xgboostreg_gb'. In the KNN grid search the optimal number of neighbours was 3, which, when fitted to the train data, did not give a high enough score compared with the baseline model. SVR proved to be computationally expensive in my case. Without any hyperparameter tuning, its score on my test data was close to that of the baseline model, while its Kaggle submission score was lower than that of the baseline model, so I did not investigate it further with hyperparameter tuning. Decision Tree and Random Forest were the other top performing models that gave me good scores on Kaggle submission; of those two, Random Forest was the better one.