In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('../data/train_store_combined.csv')
test = pd.read_csv('../data/test_store_combined.csv')

In [3]:
def isWeekend(x):
    if x<6:
        return 0
    else: 
        return 1
train["weekend"]= train["DayOfWeek"].apply(isWeekend )

In [4]:
def startMidEndMonth(x):
    if x<10:
        return 0
    elif x<20:
        return 1
    else:
        return 2

In [5]:
train["MonthState"]=train["Day"].apply(startMidEndMonth)

In [6]:
import bisect
train["Date"]=pd.to_datetime(train["Date"])
dates=np.array(train[train["StateHoliday"]!="0"]["Date"].unique())
dates=np.sort(dates)
a=train["Date"].iloc[119]

index = bisect.bisect(dates, a)
print(index,dates[index-1],dates[index],a)

32 2015-04-03T00:00:00.000000000 2015-04-06T00:00:00.000000000 2015-04-03 00:00:00


In [7]:
def datToAndAfterHoliday(df,Column,holidays):
    to=[]
    after=[]
    for a in df[Column]:
        index=bisect.bisect(holidays,a)
        if len(holidays)==index:
            to.append(pd.Timedelta(0, unit='d') )
            after.append(a - holidays[index-1])
        else:
            after.append(holidays[index] - a)
            to.append(a -holidays[index-1])
    return to,after

In [8]:
train["To"],train["After"]=datToAndAfterHoliday(train,"Date",dates)

In [9]:
train['Promo2SinceWeek'] = train['Promo2SinceWeek'].fillna(train['Promo2SinceWeek'].max())
train['Promo2SinceYear'] = train['Promo2SinceYear'].fillna(train['Promo2SinceYear'].max())
train['PromoInterval'] = train['PromoInterval'].fillna(train['PromoInterval'].mode().iloc[0])

train['CompetitionDistance'] = train['CompetitionDistance'].fillna(train['CompetitionDistance'].max())
train['CompetitionOpenSinceMonth'] = train['CompetitionOpenSinceMonth'].fillna(train['CompetitionOpenSinceMonth'].mode().iloc[0])
train['CompetitionOpenSinceYear'] = train['CompetitionOpenSinceYear'].fillna(train['CompetitionOpenSinceYear'].mode().iloc[0])

In [10]:
# train.to_csv("../data/test_store_combined.csv",index=False)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype          
---  ------                     --------------    -----          
 0   Store                      1017209 non-null  int64          
 1   DayOfWeek                  1017209 non-null  int64          
 2   Date                       1017209 non-null  datetime64[ns] 
 3   Sales                      1017209 non-null  int64          
 4   Customers                  1017209 non-null  int64          
 5   Open                       1017209 non-null  int64          
 6   Promo                      1017209 non-null  int64          
 7   StateHoliday               1017209 non-null  object         
 8   SchoolHoliday              1017209 non-null  int64          
 9   Year                       1017209 non-null  int64          
 10  Month                      1017209 non-null  int64          
 11  Day                     

In [12]:
train_cleaned_ready=train.copy()

In [13]:
train_cleaned_ready["To"]=pd.to_numeric(train_cleaned_ready['To'].dt.days, downcast='integer')

In [14]:
train_cleaned_ready["After"]=pd.to_numeric(train_cleaned_ready['After'].dt.days, downcast='integer')

In [15]:
train_cleaned_ready.drop(["Store","Date","Customers"],axis=1,inplace=True)

In [16]:
def encode_scale_features(df,columns):
    lb=LabelEncoder()
    norm = StandardScaler()
    for i in columns:
        df[i]=lb.fit_transform(df[i])   
    norm_fit = norm.fit_transform(df)
    out=pd.DataFrame(norm_fit,columns=df.columns)
    return out

In [17]:
features=encode_scale_features(train_cleaned_ready,["StoreType","StateHoliday","Assortment","PromoInterval"])

In [18]:
y=features["Sales"]

In [19]:
features=features.drop("Sales",axis=1,inplace=False)

In [20]:


import sys, os

sys.path.append(os.path.abspath(os.path.join('../scripts')))
from Create_modelss_modified import CreateModel



In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [22]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
   features, y, test_size=0.4, random_state=1)

In [24]:

mlflow.sklearn.autolog()

with mlflow.start_run(run_name="Baseline_LinearRegression"):
    model=LinearRegression()
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    
    error=mean_squared_error(y_test,pred)
    print(error)

0.44153666592591206


In [25]:
linearmodel=CreateModel(X_train=X_train,X_test=X_test, y_train=y_train,y_test=y_test,data_version="v1",name="LinearRegressionModel",model=LinearRegression)

In [26]:
linearmodel.train()


mean_squared_error of model is  0.44153666592591206


LinearRegression()

In [27]:
params = {'fit_intercept': [True,False],'normalize':[True,False]}

linearmodel.hyperParameterTune(5,search_space=params)

2021/07/28 18:56:14 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


mean_squared_error of model is  0.4415366619007456


LinearRegression(fit_intercept=False, normalize=True)

In [32]:
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

regressionModel=CreateModel(X_train=X_train,X_test=X_test, y_train=y_train,y_test=y_test,data_version="v1",name="RandomForestRegressorModel",model=RandomForestRegressor)  




In [33]:
regressionModel.train()