In [None]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raise pandas' display limits to 500 rows / 500 columns.
pd.options.display.max_rows=500
pd.options.display.max_columns=500
# First colour of the current seaborn palette.
default_color=sns.color_palette()[0]

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import GroupKFold
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold

In [None]:
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from xgboost import XGBRegressor

In [None]:
def smape(y_true,y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    the result is 100 times the mean of those terms.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. They are converted to float arrays and
        compared position by position.

    Returns
    -------
    float
        The SMAPE score. Elements where both the actual and the predicted value
        are zero contribute 0 (a perfect prediction) instead of the NaN that a
        plain 0/0 division would produce.
    """
    y_true=np.asarray(y_true,dtype=float)
    y_pred=np.asarray(y_pred,dtype=float)
    numerator=np.abs(y_true-y_pred)
    denominator=(np.abs(y_true)+np.abs(y_pred))/2
    # Only divide where the denominator is non-zero; the remaining positions
    # keep the 0 they were initialised with.
    ratio=np.divide(numerator,denominator,out=np.zeros_like(numerator),where=denominator!=0)
    return 100*np.mean(ratio)

In [None]:
# Load the input tables. train and test get their month column parsed as
# datetime; the census table and the sample submission are read as-is.
train,test=(
    pd.read_csv(csv_name,parse_dates=['first_day_of_month'])
    for csv_name in ('train.csv','test.csv')
)
cs_data=pd.read_csv('census_starter.csv')
sub=pd.read_csv('sample_submission.csv')

In [None]:
# One shape per line, in the order: train, test, sample submission, census.
print(train.shape,test.shape,sub.shape,cs_data.shape,sep='\n')

In [None]:
#print('The total shape of the train data is:{} '.format((train.shape)))
#print('The total shape of the test data is: {}'.format((test.shape)))
#print('The total shape of the census data is: {}'.format(cs_data.shape))


In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the census table.
cs_data.head()

In [None]:
# Column dtypes and non-null counts of the training table.
train.info()

In [None]:
# Column dtypes and non-null counts of the test table.
test.info()

In [None]:
# Column dtypes and non-null counts of the census table.
cs_data.info()

In [None]:
# Histogram of the raw target (microbusiness_density) in 20 bins.
plt.title('The target variable distribution')
train['microbusiness_density'].hist(bins=20)

In [None]:
# Side-by-side histograms of the target: raw scale on the left, log1p scale on
# the right.
plt.figure(figsize=[20,10])
plt.subplot(1,2,1)
train['microbusiness_density'].hist(bins=20)
plt.subplot(1,2,2)
# log1p is the same transform applied to the training target before modelling.
(np.log1p(train['microbusiness_density']).hist(bins=20))

In [None]:
# Summary statistics of the raw target.
train['microbusiness_density'].describe()

In [None]:
# Keep the row identifier together with the target in a separate frame.
tar_feat=train[['row_id','microbusiness_density']]
# Summary of the dates covered by the training rows.
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.1-1.5 and was
# removed in pandas 2.0 - confirm the pandas version before upgrading.
train['first_day_of_month'].describe(datetime_is_numeric=True)

In [None]:
# Summary of the dates covered by the test rows. It is printed explicitly
# because a notebook cell only auto-displays its last expression, which here
# is the plot; as a bare expression the summary was computed and then dropped.
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.1-1.5 and was
# removed in pandas 2.0 - confirm the pandas version before upgrading.
print(test['first_day_of_month'].describe(datetime_is_numeric=True))
# Target against month for every training row, drawn as a single line plot.
train.plot('first_day_of_month','microbusiness_density')

In [None]:
# Number of distinct cfips values in each table, one message per line.
print(
    'The total number of cfips train data is: {}'.format(train['cfips'].nunique()),
    'The total number of cfips test data is: {}'.format(test['cfips'].nunique()),
    'The total number of cfips census data is: {}'.format(cs_data['cfips'].nunique()),
    sep='\n',
)

In [None]:
# Count test rows whose cfips also occurs in train (True) versus not (False).
test['cfips'].isin(train['cfips']).value_counts()

In [None]:
# Count census rows whose cfips also occurs in train (True) versus not (False).
cs_data['cfips'].isin(train['cfips']).value_counts()

In [None]:
# Look at the first ten training rows.
train.head(10)

In [None]:
def make_feature(df):
    """Build the model feature frame from a raw train/test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer ``cfips`` column and a datetime
        ``first_day_of_month`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``df`` with the columns ``county_code``,
        ``state_code``, ``year``, ``month`` and ``day``. ``day`` holds the day
        of the week of ``first_day_of_month`` (Monday=0), not the day of the
        month.
    """
    # A county FIPS code is SSCCC: a 2-digit state code followed by a 3-digit
    # county code (leading zeros are lost once it is stored as an integer).
    # Splitting at 1000 therefore separates the two parts; splitting at 100
    # mixes state and county digits.
    cfips=df['cfips']
    month_start=df['first_day_of_month']
    feature=pd.DataFrame()
    feature["county_code"]=cfips%1000
    feature["state_code"]=cfips//1000
    feature['year']=month_start.dt.year
    feature['month']=month_start.dt.month
    feature['day']=month_start.dt.dayofweek
    return feature

In [None]:
# Derive the feature frames for train and test with the same function.
train_fe,test_fe=map(make_feature,(train,test))


In [None]:
# Shapes of the engineered feature frames: test first, then train.
print(test_fe.shape,train_fe.shape,sep='\n')

In [None]:
# The target is modelled on the log1p scale; the CV loops map predictions back
# with expm1 before scoring.
Y=np.log1p(train['microbusiness_density'])
# X is the training feature frame itself (same object as train_fe, not a copy).
X=train_fe

In [None]:
# 5-fold shuffled cross-validation of CatBoost on the log1p target.
# errcb collects each fold's SMAPE on the original target scale;
# Y_pred_totcb collects each fold model's predictions for test_fe, which are
# still on the log1p scale when stored.
errcb=[]
Y_pred_totcb=[]
fold=KFold(n_splits=5,shuffle=True,random_state=1)
for train_index,test_index in fold.split(X):
    X_train,X_test=X.iloc[train_index],X.iloc[test_index]
    Y_train,Y_test=Y.iloc[train_index],Y.iloc[test_index]
    m=CatBoostRegressor(eval_metric='SMAPE')
    m.fit(X_train,Y_train,eval_set=[(X_train,Y_train),(X_test,Y_test)],verbose=100,early_stopping_rounds=100)
    preds=m.predict(X_test)
    # Undo log1p on both sides so the error is measured on the original scale;
    # the score is computed once and reused for printing and for the record.
    fold_err=smape(np.expm1(Y_test),np.expm1(preds))
    print("err:",fold_err)
    errcb.append(fold_err)
    p=m.predict(test_fe)
    Y_pred_totcb.append(p)

In [None]:
# Mean of the per-fold CatBoost SMAPE scores.
np.mean((errcb))

In [None]:
# Hyper-parameters passed to LGBMRegressor(**params).
params={
    "metric":"mse",
    "learning_rate":0.2,
    # LightGBM alias of feature_fraction; 1.0 uses every feature for each tree.
    "sub_feature":1.0,
    # NOTE(review): per the LightGBM docs bagging only takes effect when
    # bagging_fraction is also set below 1.0 - confirm whether that was intended.
    "bagging_freq":1,
    # L1 regularisation. The key is `lambda_l1` (letter "l"); the previous
    # spelling `lambda_11` (digits) is not a LightGBM parameter, so the
    # regularisation was never applied.
    "lambda_l1":0.6,
    'verbosity':1,
    'num_iterations':3000,
}

In [None]:
# 5-fold shuffled cross-validation of LightGBM on the log1p target.
# errlgb collects each fold's SMAPE on the original target scale;
# Y_pred_totlgb collects each fold model's predictions for test_fe, which are
# still on the log1p scale when stored.
errlgb=[]
Y_pred_totlgb=[]
fold=KFold(n_splits=5,shuffle=True,random_state=101)
for train_index,test_index in fold.split(X):
    X_train,X_test=X.iloc[train_index],X.iloc[test_index]
    Y_train,Y_test=Y.iloc[train_index],Y.iloc[test_index]
    m=LGBMRegressor(**params)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` arguments of fit() were deprecated
    # in LightGBM 3.3 and removed in 4.0.
    m.fit(
        X_train,Y_train,
        eval_set=[(X_train,Y_train),(X_test,Y_test)],
        callbacks=[early_stopping(stopping_rounds=100),log_evaluation(period=100)],
    )
    preds=m.predict(X_test)
    # Undo log1p on both sides so the error is measured on the original scale;
    # the score is computed once and reused for printing and for the record.
    fold_err=smape(np.expm1(Y_test),np.expm1(preds))
    print("err:",fold_err)
    errlgb.append(fold_err)
    p=m.predict(test_fe)
    Y_pred_totlgb.append(p)

In [None]:
# Mean of the per-fold LightGBM SMAPE scores.
np.mean((errlgb))

In [None]:
# Feature importances of whichever model `m` is currently bound to (the last
# fitted fold model when the cells are run in order), paired with the column
# names of X. Rows are sorted by importance (then name) in descending order and
# the final 30 rows of that ordering are kept.
fea_imp=(
    pd.DataFrame({'imp':m.feature_importances_,'col':X.columns})
    .sort_values(['imp','col'],ascending=False)
    .iloc[-30:]
)