#  Introduction

<p>
    In the G2M insight for Cab Investment firm ( Must for all Specialization) project, we will show which company (Yellow Cab or Pink Cab) is the better opportunity to invest in it.
</p>
<p>
    Here is some points before checking the results out:
    <ul>
        <li>
            We use 5 datasets, "US Bank holidays.csv" is added dataset which describes US holidays :
            <ul>
                <li>Cab_Data.csv</li>
                <li>City.csv</li>
                <li>Customer_ID.csv</li>
                <li>Transaction_ID.csv</li>
                <li>US Bank holidays.csv</li>
            </ul>
        </li>
        <br>
        <li>Profit is defined by me: Profit = Price Charged - Cost of Trip</li>
        <br>
        <li>
            "df_total" is a dataframe is created by merging mentioned 5 dataframes :
            <ul>
                <li>
                    The days which is not a holiday, has "-" value in "Holiday" column
                </li>
                <li>
                    After Merging if any record has at least 1 NaN value, the whole record is dropped. it means we do not have enough information of that trip
                </li>
            </ul>
        </li>
        <br>
        <li>Pickle is used for efficient working with jupyter, the parts related to "pkl" are commented</li>
        <br>
        <li>Hypothesis and Conclusions are provided in the other notebook</li>
        <br>
        <li>The profiling method helps me find informative data that guide me to take an analysis on the specific area and plot them, so profiling results are not discussed in detail but the files are attached, you can check it</li>
        <br>
    </ul>
</p>

# Codes

## Imports

In [1]:
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objects as go
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as st
import geopy
from geopy import distance
import math
import folium
from folium.plugins import HeatMap
from scipy.optimize import curve_fit
import scipy
import random
import itertools
import plotly.express as px
from pandas_profiling import ProfileReport
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from scipy.stats import uniform, randint
import xgboost as xgb
import sklearn.model_selection
import datetime
import pickle

## Functions

In [2]:
def change_categorical_to_numerical (df):
    for column in df.select_dtypes(['object']).columns:
        df[column]= df[column].astype('category')
        df[column] = df[column].cat.codes
    return df

In [3]:
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

## Reading Data

In [3]:
df_cab = pd.read_csv('Data/Cab_Data.csv')
df_city = pd.read_csv('Data/City.csv')
df_customer = pd.read_csv('Data/Customer_ID.csv')
df_trans = pd.read_csv('Data/Transaction_ID.csv')
df_holidays = pd.read_csv('Data/US Bank holidays.csv')

In [5]:
# df_total = pd.read_pickle("df_total.pkl")

## Describe data

### df_cab

In [5]:
df_cab['Date of Travel'] = df_cab['Date of Travel'].apply(lambda x: datetime.date(1899, 12, 30)+datetime.timedelta(days=x))
df_cab['Date of Travel'] = pd.to_datetime(df_cab['Date of Travel'], infer_datetime_format=True)
# df_cab = df_cab.rename(columns={"Date of Travel": "Date"})

In [6]:
preview the data
df_cab.head()

In [7]:
df_cab.tail()

In [8]:
df_cab.info()

In [9]:
 df_cab.describe()

In [10]:
distribution of categorical features
df_cab.describe(include=['O'])

### df_city

In [11]:
preview the data
df_city.head()

In [12]:
df_city.tail()

In [13]:
df_city.info()

In [14]:
 df_city.describe()

In [15]:
#distribution of categorical features
df_city.describe(include=['O'])

### df_customer

In [16]:
preview the data
df_customer.head()

In [17]:
df_customer.tail()

In [18]:
df_customer.info()

In [19]:
 df_customer.describe()

In [20]:
distribution of categorical features
df_customer.describe(include=['O'])

### df_trans

In [21]:
preview the data
df_trans.head()

In [22]:
df_trans.tail()

In [23]:
df_trans.info()

In [24]:
 df_trans.describe()

In [25]:
distribution of categorical features
df_trans.describe(include=['O'])

### df_holidays

In [26]:
df_holidays['Date'] = pd.to_datetime(df_holidays['Date'], infer_datetime_format=True)
df_holidays

## Merge

In [27]:
df_total = df_cab.merge(df_trans, how='outer').merge(df_customer, how='outer').merge(df_city, how='outer')
df_temp = df_cab.merge(df_holidays, how='outer', left_on='Date of Travel', right_on='Date')
df_temp = df_temp.fillna("-")
df_total = df_total.join(df_temp[['Holiday']])
df_total = df_total.sort_values(by='Date of Travel', ascending=True).reset_index(drop=True)
df_total.head()

In [28]:
df_total.tail()

In [29]:
df_total.info()

In [30]:
 df_total.describe()

In [31]:
distribution of categorical features
df_total.describe(include=['O'])

In [32]:
df_total.tail()

In [33]:
df_total['Profit'] = df_total['Price Charged'] - df_total['Cost of Trip']
df_total['Year'] = df_total['Date of Travel'].dt.year
df_total['Month'] = df_total['Date of Travel'].dt.month
df_total['Day of Week'] = df_total['Date of Travel'].dt.day_name()

## NULL checking

In [34]:
# Null checking
df_total = df_total.dropna()
# df_total.to_pickle("df_total.pkl")

## Profiling

In [35]:
profile_YellowCab= ProfileReport(df_total[df_total['Company'] == 'Yellow Cab'], title='Yellow Cab - Profiling Report', explorative=True)
profile_YellowCab.to_file("Yellow Cab - Profiling Report.html")

In [36]:
profile_PinkCab= ProfileReport(df_total[df_total['Company'] == 'Pink Cab'], title='Pink Cab - Profiling Report', explorative=True)
profile_PinkCab.to_file("Pink Cab - Profiling Report.html")

## EDA and Plots

### EDA

In [37]:
df_total.groupby(['Company']).mean().T

In [38]:
df_total.groupby(['Company'])[['Date of Travel', 'KM Travelled', 'Profit']].describe().T

In [39]:
df_total.groupby(['Company']).describe(include=['O']).T

### Histograms

In [160]:
fig = px.histogram(df_total, x="Customer ID", color="Company", title="Customers Histogram - Check Retention of Customers by Company", color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.write_html("Plot/Customers Histogram - Check Retention of Customers by Company.html", include_plotlyjs="cdn")

In [161]:
fig = px.histogram(df_total, x="Company", color="City", facet_row="Year", title="Cities Histogram - Check Count of Customers in Each Cities by Company")
fig.write_html("Plot/Cities Histogram - Check Count of Customers in Each Cities by Company.html", include_plotlyjs="cdn")

In [162]:
fig = px.histogram(df_total[df_total['Holiday'] != "-"], x="Holiday", color="City", facet_col="Company", facet_row="Year", title="Holiday's Transactions of Different Cities by Company in 3 Years")
fig.write_html("Plot/Holiday's Transactions of Different Cities by Company in 3 Years.html", include_plotlyjs="cdn")

### Line Plot

In [164]:
df_temp = df_total.groupby(['Company', 'Day of Week']).sum()[['Profit']].reset_index()
#create a mapping of the sort order
sortbox = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5, 'Saturday':6,'Sunday':7}

#create new column with the sort order
df_temp['sort_column'] = df_temp['Day of Week'].map(sortbox)

#sort with sort_column

df_temp = df_temp.sort_values('sort_column').drop('sort_column',axis=1).reset_index(drop=True)

fig = px.line(df_temp, x="Day of Week", y="Profit", color= "Company", title="Days of Week's Sum of Profit by Company", color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#cca300'})
fig.update_traces(mode='markers+lines')
fig.write_html("Plot/Days of Week's Sum of Profit by Company.html", include_plotlyjs="cdn")

In [165]:
df_temp = df_total.groupby(['Year', 'Company']).sum()[['Profit']].reset_index()
fig = px.line(df_temp, x="Year", y="Profit", color= "Company", title="Company's Profit by Year", color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#cca300'})
fig.write_html("Plot/Company's Profit by Year.html", include_plotlyjs="cdn")

### Bar Plot

In [166]:
df_temp = df_total.groupby(['Company', 'Year']).sum()[['Profit']].reset_index()
df_temp1 = df_total.groupby(['Company', 'Year']).sum()[['KM Travelled']].reset_index()
df_temp = df_temp.merge(df_temp1)
fig = px.bar(df_temp, x="KM Travelled", y="Profit", color= "Company", title="Company's Profit vs KM Travelled by Year", facet_col="Year" ,color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.write_html("Plot/Company's Profit vs KM Travelledr by Year.html", include_plotlyjs="cdn")

In [167]:
df_temp = df_total.copy()
df_temp['Profit/KM'] = df_temp['Profit']/df_temp['KM Travelled']
df_temp = df_temp.groupby(['Company', 'Year']).mean()[['Profit/KM']].reset_index()
fig = px.bar(df_temp, x="Company", y="Profit/KM", color= "Company", title="Profit/Distance by Year", facet_col="Year" ,color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.update_yaxes(title_text='Profit/Distance($/KM)')
fig.write_html("Plot/Profit-Distance by Year.html", include_plotlyjs="cdn")

In [168]:
df_temp = df_total.groupby(['Company', 'Day of Week']).count()[['Transaction ID']].reset_index()
#create a mapping of the sort order
sortbox = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5, 'Saturday':6,'Sunday':7}

#create new column with the sort order
df_temp['sort_column'] = df_temp['Day of Week'].map(sortbox)

#sort with sort_column

df_temp = df_temp.sort_values('sort_column').drop('sort_column',axis=1).reset_index(drop=True)

fig = px.bar(df_temp, x="Day of Week", y="Transaction ID", color= "Company", barmode='group', title="Count of trips by Days of week",
             color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.update_yaxes(title_text='Count of Trips')
fig.write_html("Plot/Count of trips by Days of week.html", include_plotlyjs="cdn")

In [169]:
df_temp = df_total.groupby(['Company', 'Day of Week']).count()[['Transaction ID']].reset_index()
df_temp1 = df_total.groupby(['Day of Week']).count()[['Profit']].reset_index()
df_temp = df_temp.merge(df_temp1)
df_temp['Percent'] = (df_temp['Transaction ID']/df_temp['Profit'])*100
#create a mapping of the sort order
sortbox = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5, 'Saturday':6,'Sunday':7}

#create new column with the sort order
df_temp['sort_column'] = df_temp['Day of Week'].map(sortbox)

#sort with sort_column

df_temp = df_temp.sort_values('sort_column').drop('sort_column',axis=1).reset_index(drop=True)

fig = px.bar(df_temp, x="Day of Week", y="Percent", color= "Company", barmode='group', title="Market Share by Days of week",
             color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.update_yaxes(title_text='Market share-Count of trips(%)')
fig.write_html("Plot/Market Share by Days of week.html", include_plotlyjs="cdn")

In [170]:
df_temp = df_total.groupby(['Company', 'Customer ID']).count()[['Users']].reset_index()
fig = px.histogram(df_temp, x="Users", color= "Company", barmode='group', title="User Retention",
             color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.update_yaxes(title_text='Count of Users')
fig.update_xaxes(title_text='Number of trips')
fig.write_html("Plot/User Retention.html", include_plotlyjs="cdn")

### Pie Chart

In [171]:
df_temp = df_total.groupby(['City','Company']).sum()[['Profit']].reset_index()
df_temp1 = df_total.groupby(['City','Company']).mean()[['Income (USD/Month)']].reset_index()
df_temp = df_temp.merge(df_temp1)
fig = px.sunburst(df_temp, path=['City', 'Company'], values='Profit', names='City',title='Sum of Profit of Companies in Different Cities - Check mean of Income of Customers',
                  color='City', hover_data=['Income (USD/Month)'], color_continuous_scale='RdBu')
fig.write_html("Plot/Sum of Profit of Companies in Different Cities - Check mean of Income of Customers.html", include_plotlyjs="cdn")

In [172]:
df_temp = df_total.groupby(['Company', 'Gender']).sum()[['Profit']].reset_index()
df_temp1 = df_total.groupby(['Company', 'Gender']).count()[['Customer ID']].reset_index()
df_temp = df_temp.merge(df_temp1)
fig = px.sunburst(df_temp, path=['Company', 'Gender'], values='Customer ID', names='Gender',title='Count of Customers by Company&Gender  - Check Sum of Profit of Each Gender by Company',
                  color='Company', hover_data=['Profit'], color_continuous_scale='RdBu', color_discrete_map={'(?)':'black', 'Pink Cab':'#FC1CBF', 'Yellow Cab':'#FBE426'})
fig.write_html("Plot/Count of Customers by Company&Gender  - Check Sum of Profit of Each Gender by Company.html", include_plotlyjs="cdn")

## Model deployment

In [69]:
df_total_model = pd.get_dummies(df_total)
df_total_model.head()

Unnamed: 0,Transaction ID,Date of Travel,KM Travelled,Price Charged,Cost of Trip,Customer ID,Age,Income (USD/Month),Population,Users,...,Holiday_Presidents Day (Washingtons Birthday),Holiday_Thanksgiving Day,Holiday_Veterans Day,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,10000429.0,2016-01-02,15.15,342.62,205.434,57474.0,34.0,16558.0,248968.0,80021.0,...,0,0,0,0,0,1,0,0,0,0
1,10000525.0,2016-01-02,2.18,51.47,26.4216,4551.0,19.0,6316.0,1955130.0,164468.0,...,0,0,0,0,0,1,0,0,0,0
4,10000927.0,2016-01-02,34.56,1121.11,485.2224,1808.0,59.0,18999.0,8405837.0,302149.0,...,0,0,0,0,0,1,0,0,0,0
5,10000721.0,2016-01-02,19.2,529.23,246.528,8117.0,21.0,5946.0,1595037.0,144132.0,...,0,0,0,0,0,1,0,0,0,0
7,10000519.0,2016-01-02,13.92,327.23,185.4144,4429.0,20.0,23387.0,1955130.0,164468.0,...,0,0,0,0,0,1,0,0,0,0


In [73]:
# Linear Correlation
corr = df_total_model.corr()
corr

Unnamed: 0,Transaction ID,KM Travelled,Price Charged,Cost of Trip,Customer ID,Age,Income (USD/Month),Population,Users,Profit,...,Holiday_Presidents Day (Washingtons Birthday),Holiday_Thanksgiving Day,Holiday_Veterans Day,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
Transaction ID,1.0,-0.002233,-0.056536,-0.00435,-0.012192,-0.001944,-0.000768,0.024949,0.01518,-0.090022,...,-0.008105,0.007372,0.000259,0.14805,0.001003,0.003813,-0.163395,-0.00073,0.003199,0.001466
KM Travelled,-0.002233,1.0,0.827852,0.982059,-0.000457,0.000148,-6.9e-05,-0.002704,-0.000566,0.475751,...,0.000803,0.004462,0.00503,0.000461,-0.002123,-0.001005,0.002067,-0.00276,0.005093,-0.001993
Price Charged,-0.056536,0.827852,1.0,0.854413,-0.173113,-0.004306,0.002984,0.339265,0.292723,0.879103,...,0.002894,0.005755,0.005297,0.016092,-0.021981,0.013099,0.036692,-0.029552,-0.017807,-0.023359
Cost of Trip,-0.00435,0.982059,0.854413,1.0,-0.008518,0.000257,0.000242,0.018565,0.021942,0.503462,...,0.000125,0.005293,0.005828,0.000132,-0.001782,-0.000737,0.00213,-0.00347,0.005101,-0.001532
Customer ID,-0.012192,-0.000457,-0.173113,-0.008518,1.0,-0.00441,-0.005401,-0.598449,-0.67902,-0.280051,...,-0.00045,-0.029573,-0.025307,-0.000955,-0.001917,-0.000145,0.000423,0.001899,-0.000575,0.001385
Age,-0.001944,0.000148,-0.004306,0.000257,-0.00441,1.0,0.003598,-0.013805,-0.010006,-0.007396,...,-0.004833,0.013384,0.012578,-0.000688,-0.000311,0.001533,-0.003013,0.001048,2.3e-05,0.002174
Income (USD/Month),-0.000768,-6.9e-05,0.002984,0.000242,-0.005401,0.003598,1.0,0.008709,0.010389,0.00474,...,0.010193,0.012832,0.011901,0.003695,0.000819,-0.003209,-0.001139,0.00056,0.000796,-0.001411
Population,0.024949,-0.002704,0.339265,0.018565,-0.598449,-0.013805,0.008709,1.0,0.932558,0.547125,...,-0.003105,0.024197,0.013027,-0.000423,0.00209,-0.001072,0.001091,-0.002779,-0.000394,0.002124
Users,0.01518,-0.000566,0.292723,0.021942,-0.67902,-0.010006,0.010389,0.932558,1.0,0.466632,...,-0.006656,0.030349,0.02095,-0.000389,0.002156,7.1e-05,0.000296,-0.002365,-0.000395,0.000975
Profit,-0.090022,0.475751,0.879103,0.503462,-0.280051,-0.007396,0.00474,0.547125,0.466632,1.0,...,0.004698,0.004714,0.003461,0.026638,-0.034917,0.022459,0.05906,-0.045958,-0.034289,-0.037437


In [75]:
corr[['Profit']]

Unnamed: 0,Profit
Transaction ID,-0.090022
KM Travelled,0.475751
Price Charged,0.879103
Cost of Trip,0.503462
Customer ID,-0.280051
Age,-0.007396
Income (USD/Month),0.00474
Population,0.547125
Users,0.466632
Profit,1.0


In [94]:
np.abs(corr[['Profit']]).nlargest(20, 'Profit').reset_index()

Unnamed: 0,index,Profit
0,Profit,1.0
1,Price Charged,0.879103
2,City_NEW YORK NY,0.554924
3,Population,0.547125
4,Cost of Trip,0.503462
5,KM Travelled,0.475751
6,Users,0.466632
7,Company_Pink Cab,0.282265
8,Company_Yellow Cab,0.282265
9,Customer ID,0.280051


In [98]:
X_train = df_total_model[df_total_model['Year'] != 2018][np.abs(corr[['Profit']]).nlargest(20, 'Profit').reset_index()['index'][2:]]
y_train = df_total_model[df_total_model['Year'] != 2018]['Profit']
X_test = df_total_model[df_total_model['Year'] == 2018][np.abs(corr[['Profit']]).nlargest(20, 'Profit').reset_index()['index'][2:]]
y_test = df_total_model[df_total_model['Year'] == 2018]['Profit']

### Linear Regression

In [99]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.576227478461399

In [101]:
y_pred = reg.predict(X_test)
df_linear = pd.DataFrame({"y": y_test, "y_predict": y_pred})
df_linear

Unnamed: 0,y,y_predict
235829,7.8380,-81.707107
235830,149.5752,454.158118
235833,139.0200,119.869932
235835,31.3500,407.651480
235836,141.7480,104.980700
...,...,...
359386,191.6800,296.840594
359388,5.6100,-90.418499
359389,47.2500,7.147884
359390,4.1720,-122.619440


In [102]:
mse=mean_squared_error(y_test, y_pred)
print(np.sqrt(mse))

101.75954760737831


### Ridge

In [103]:
reg = Ridge().fit(X_train, y_train)
reg.score(X_test, y_test)


Ill-conditioned matrix (rcond=4.45367e-19): result may not be accurate.



0.5763882412556443

In [104]:
y_pred = reg.predict(X_test)
df_linear = pd.DataFrame({"y": y_test, "y_predict": y_pred})
df_linear

Unnamed: 0,y,y_predict
235829,7.8380,-81.667389
235830,149.5752,453.790558
235833,139.0200,119.861426
235835,31.3500,407.267214
235836,141.7480,104.942546
...,...,...
359386,191.6800,296.804161
359388,5.6100,-90.297378
359389,47.2500,7.159827
359390,4.1720,-122.656870


In [105]:
mse=mean_squared_error(y_test, y_pred)
print(np.sqrt(mse))

101.7402439713668


### XGBoost

In [106]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mse=mean_squared_error(y_test, y_pred)
print(np.sqrt(mse))

92.4144328314398


In [107]:
df_xgboost = pd.DataFrame({"y": y_test, "y_predict": y_pred})
df_xgboost

Unnamed: 0,y,y_predict
235829,7.8380,23.888582
235830,149.5752,318.049042
235833,139.0200,252.371826
235835,31.3500,210.548920
235836,141.7480,238.925690
...,...,...
359386,191.6800,303.773682
359388,5.6100,52.974232
359389,47.2500,143.789902
359390,4.1720,30.723171


#### Hyperparameter Searching for XGBoost

In [110]:
xgb_model = xgb.XGBRegressor()

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=20, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train, y_train)

report_best_scores(search.cv_results_, 1)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  8.7min finished


Model with rank: 1
Mean validation score: 0.744 (std: 0.003)
Parameters: {'colsample_bytree': 0.8835558684167137, 'gamma': 0.06974693032602092, 'learning_rate': 0.11764339456056544, 'max_depth': 5, 'n_estimators': 114, 'subsample': 0.7824279936868144}



In [111]:
xgb_model = xgb.XGBRegressor(colsample_bytree= 0.8835558684167137, gamma= 0.06974693032602092, learning_rate= 0.11764339456056544, max_depth= 5, n_estimators= 114, subsample= 0.7824279936868144).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mse=mean_squared_error(y_test, y_pred)
print(np.sqrt(mse))

70.22724510947654


In [112]:
df_xgboost = pd.DataFrame({"y": y_test, "y_predict": y_pred})
df_xgboost

Unnamed: 0,y,y_predict
235829,7.8380,18.715492
235830,149.5752,294.532898
235833,139.0200,134.078400
235835,31.3500,249.422165
235836,141.7480,133.891861
...,...,...
359386,191.6800,295.593750
359388,5.6100,25.920586
359389,47.2500,115.316391
359390,4.1720,29.219027
