### Imports and Preprocessing

***Importing Libraries***

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame

%matplotlib inline

In [None]:
# Disable pandas' display truncation so every row and column is rendered.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

***Reading in the data***

In [None]:

# Load the training data, the test data and the sample-submission template
# (read in that order from the current working directory).
train, test, sub = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'samplesubmission.csv')
]

In [None]:
# Placeholder frame from the original notebook; unused in the visible cells
# but kept in case later code relies on the name.
dataframe=DataFrame()

# Parse the 'Date' strings (YYYY-MM-DD) and derive calendar features.
# The vectorised `.dt` accessor replaces the earlier per-row Python loops,
# which looked values up by index label (`frame['Date'][i]`) and therefore
# only worked with a default 0..n-1 index.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    frame['Year'] = frame['Date'].dt.year
    frame['month'] = frame['Date'].dt.month
    frame['day'] = frame['Date'].dt.day

In [None]:
# Expose the four `target_*` statistic columns of train under short names.
# The new columns are appended at the end of the frame, in this order.
stat_names = ['min', 'max', 'variance', 'count']

for stat in stat_names:
    train[stat] = train['target_' + stat]

# The test frame receives the *training* values, matched by row index label.
# NOTE(review): these statistics come from train rows, not from the test rows
# themselves — this looks like leakage / misalignment; confirm it is intended.
for stat in stat_names:
    test[stat] = train[stat]

# The original `target_*` columns are no longer needed once copied.
train.drop(columns=['target_' + stat for stat in stat_names], inplace=True)

train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Remove the place identifier, the raw date and the combined key column
# from both frames (in place).
id_like_cols = ['Place_ID', 'Date', 'Place_ID X Date']
for frame in (train, test):
    frame.drop(columns=id_like_cols, inplace=True)

In [None]:
# Show the (rows, columns) dimensions of both frames.
train.shape, test.shape

***Dropping missing columns***

After checking the number of null values per column using ***train.isnull().sum()***, I realized that some columns have a very large number of missing values, e.g. more than 16,000 out of about 25,000 records.
So I decided to drop such columns (the code below drops every column with more than 15,000 missing values).

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Missing-value count per training column; reset_index() moves the column
# names into a regular column called 'index'.
tr = train.isnull().sum().to_frame('Na_sum').reset_index()

# Columns with more than 15000 missing training values are treated as too sparse.
too_sparse = tr['Na_sum'] > 15000
tr_col = tr.loc[too_sparse, 'index'].to_list()

# Drop the same columns from both frames.
# NOTE(review): this assumes every selected column also exists in test.
train = train.drop(columns=tr_col)
test = test.drop(columns=tr_col)



In [None]:
# Number of columns selected by the missing-value threshold.
len(tr_col)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the four statistic columns into polynomial / interaction terms
# (PolynomialFeatures is used with its default settings).
to_cross = ['min', 'max', 'variance', 'count']
poly = PolynomialFeatures()

# Each frame gets the expansion of its OWN rows. Previously the train
# expansion was overwritten by the test one, and the test rows' features were
# concatenated onto both frames. The transformer is fitted on train only and
# reused for test; reusing each frame's index keeps the rows aligned in concat.
train_crossed = pd.DataFrame(poly.fit_transform(train[to_cross].values),
                             index=train.index)
test_crossed = pd.DataFrame(poly.transform(test[to_cross].values),
                            index=test.index)

# Append the crossed features; as before, the new columns carry pandas'
# default integer labels (0, 1, 2, ...).
train = pd.concat([train, train_crossed], axis=1)
test = pd.concat([test, test_crossed], axis=1)


***Treating Missing Values***

In [None]:
# Fill gaps row-wise: a missing cell takes the next non-missing value to its
# right in the same row (axis=1); anything still missing becomes 0.
# `DataFrame.bfill` replaces the deprecated `fillna(method='bfill')` spelling.
# NOTE(review): back-filling across columns borrows values from a different
# feature — confirm this is intended rather than a per-column fill.
train = train.bfill(axis=1).fillna(0)
test = test.bfill(axis=1).fillna(0)

In [None]:
# Per-column count of missing values still present in train.
train.isnull().sum()

***One Hot Encoding***

In [None]:
#test['CTR_CATEGO_X'].value_counts()

I one-hot encoded the categorical column and dropped the encoded column for category N, since that category doesn't appear in the test data.

In [None]:
#train = pd.get_dummies(train, columns=['CTR_CATEGO_X'])

#test = pd.get_dummies(test, columns=['CTR_CATEGO_X'])

#train = train.drop(columns=['CTR_CATEGO_X_N'])



***Frequency Encoding***

In [None]:
#train['id'] = train['id'].map(train['id'].value_counts())

#test['id'] = test['id'].map(test['id'].value_counts())

***preparing the dataset for training***

In [None]:
# Separate the regression target from the feature matrix.
y = train['target']
X = train.drop('target', axis=1)

In [None]:
# Print the dimensions of the feature matrix and of the test frame, one per line.
for frame in (X, test):
    print(frame.shape)

***More imports***

In [None]:
from sklearn.model_selection import KFold, train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
import xgboost,lightgbm,catboost

### Base Model

In [None]:
# Hold out 5% of the rows as a validation split (fixed seed for repeatability).
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.05, random_state = 42)

In [None]:
# Second split with a 40% hold-out and the same seed; in the visible notebook
# only the training part (X_trainss, y_trainss) is used, to fit the random forest.
X_trainss,X_testss,y_trainss,y_testss = train_test_split(X,y, test_size = 0.4, random_state = 42)

In [None]:
# Baseline regressors, each created with its library-default hyper-parameters.
xgb = XGBRegressor()
lgb = LGBMRegressor()
rf = RandomForestRegressor()
cat = CatBoostRegressor()

***Modelling***

In [None]:
# Fit XGBoost, reporting RMSE on both the training split and the hold-out
# split, with early stopping after 200 rounds.
# NOTE(review): recent xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the estimator constructor rather than fit() —
# confirm against the installed version.
#xgb.fit(X_train,y_train)
xgb.fit(X_train,y_train,eval_metric = 'rmse', eval_set = [(X_train,y_train),
                                                          (X_test,y_test)],
            early_stopping_rounds = 200)

In [None]:
# Fit LightGBM, reporting RMSE on both the training split and the hold-out
# split, with early stopping after 500 rounds.
# NOTE(review): recent lightgbm releases configure early stopping through
# callbacks instead of a fit() keyword — confirm against the installed version.
#lgb.fit(X_train,y_train)
lgb.fit(X_train,y_train,eval_metric = 'rmse',
            eval_set = [(X_train,y_train),(X_test,y_test)],
            early_stopping_rounds = 500)

In [None]:
# Fit CatBoost with the training split and the hold-out split as evaluation
# sets, early stopping after 500 rounds, keeping the best iteration.
#cat.fit(X_train,y_train)
cat.fit(X_train,y_train, eval_set=[(X_train,y_train),
                                   (X_test,y_test)], 
         early_stopping_rounds= 500, use_best_model=True)

In [None]:
# Fit the random forest on the training part of the 60/40 split.
rf.fit(X_trainss,y_trainss)

***Prediction***

In [None]:
# Predict the hold-out split with each fitted baseline model.
xgbpred, lgbpred, rfpred, catpred = [
    model.predict(X_test) for model in (xgb, lgb, rf, cat)
]

In [None]:
# Predict the complete feature matrix X (rows the models were fitted on are
# included) with each baseline model.
xgbpred_train, lgbpred_train, rfpred_train, catpred_train = [
    model.predict(X) for model in (xgb, lgb, rf, cat)
]


***Evaluation***

In [None]:
from sklearn.metrics import mean_squared_error as mse
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Both arguments are passed unchanged to ``mse`` (scikit-learn's
    ``mean_squared_error``); the square root of its result is returned.
    """
    mean_sq_err = mse(y_true, y_pred)
    return np.sqrt(mean_sq_err)

In [None]:
# RMSE of each baseline model on the hold-out split.
holdout_preds = [
    ('XGB >>>', xgbpred),
    ('LGB >>>', lgbpred),
    ('CAT >>>', catpred),
    ('RF >>>', rfpred),
]
for label, preds in holdout_preds:
    print(label, rmse(y_test, preds))

In [None]:
# RMSE of each baseline model over the complete feature matrix X.
full_preds = [
    ('XGB >>>', xgbpred_train),
    ('LGB >>>', lgbpred_train),
    ('CAT >>>', catpred_train),
    ('RF >>>', rfpred_train),
]
for label, preds in full_preds:
    print(label, rmse(y, preds))

## Feature Importance

In [None]:
# Every training column except the target, in column order.
features = [col for col in train.columns if col != 'target']

In [None]:
# LightGBM feature importances as a horizontal bar chart, least important first.
fi = pd.Series(lgb.feature_importances_, index=features)
_ = plt.figure(figsize=(10, 50))
_ = fi.sort_values().plot.barh()

## Dropping Columns that don't affect the model

In [None]:
# Two-column table: feature name ('cols') and its LightGBM importance ('imp').
fi_df = pd.DataFrame({'cols': features, 'imp': lgb.feature_importances_})

In [None]:
# Order the table from most to least important feature.
fi_df = fi_df.sort_values('imp', ascending=False)

In [None]:
# Show the last 30 rows of the importance table.
fi_df.tail(30)

In [None]:
# Features whose importance is below 13 are marked for removal.
low_importance = fi_df['imp'] < 13
semi_useless = fi_df.loc[low_importance, 'cols'].to_list()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Number of features selected for removal.
len(semi_useless)

In [None]:
# Remove the low-importance features from both frames.
train, test = [frame.drop(columns=semi_useless) for frame in (train, test)]

In [None]:
# Rebuild the feature matrix and target from the pruned training frame.
y = train['target']
X = train.drop('target', axis=1)

In [None]:
# Print the dimensions of X and test, then preview the training frame.
print(X.shape, test.shape)

train.head()

In [None]:
#Use standard scaler
#from sklearn.preprocessing import StandardScaler


#sc = StandardScaler()
#train = sc.fit_transform(train)
#test = sc.fit_transform(test)

## Cross Validation

In [None]:



from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Linear-regression reference point on a 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

lm = LinearRegression()
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

# `score` on a regressor is R^2 (not a classification accuracy), reported
# here for the training part first and the held-out part second.
accuracy1 = lm.score(X_train, y_train)
accuracy = lm.score(X_test, y_test)
print(accuracy1, accuracy)

# Error metrics on the held-out part; RMSE is the square root of the MSE.
mean_sq_err = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mean_sq_err)
print('RMSE:', np.sqrt(mean_sq_err))


In [None]:
# Tuned LightGBM configuration used for cross-validation; this replaces the
# baseline `lgb` estimator.
lgb_params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=200,
    max_depth=-1,
    min_data_in_leaf=3,
    learning_rate=0.05,
    feature_fraction=0.60,
    lambda_l1=1,
    lambda_l2=1,
    num_iterations=4000,
)
lgb = LGBMRegressor(**lgb_params)

# Tuned XGBoost configuration; this replaces the baseline `xgb` estimator.
xgb_params = dict(
    n_estimators=5000,
    max_depth=30,
    learning_rate=0.3,
    gamma=1.5,
    reg_lambda=80,
    random_state=30,
)
xgb = XGBRegressor(**xgb_params)

***Lightgbm CrossVal***

In [None]:
# 23 contiguous, unshuffled folds for the LightGBM cross-validation.
# `random_state` is no longer passed: it has no effect when shuffle=False,
# and current scikit-learn raises a ValueError for that combination.
max_iter = 23
kf = KFold(n_splits=max_iter, shuffle=False)

In [None]:
# Per-fold validation RMSE, plus running sums of the predictions made by each
# fold's model on the test frame and on the full training matrix.
lgb_scores = []
lgb_test_pred = np.zeros(len(test))
lgb_train_pred = np.zeros(len(train))

for fold, (tr_in, te_in) in enumerate(kf.split(X)):

    print(f"==================================Fold{fold}=============================================")
    X_train, y_train = X.iloc[tr_in], y.iloc[tr_in]
    X_test, y_test = X.iloc[te_in], y.iloc[te_in]

    # The validation fold is passed as the second evaluation set for early
    # stopping and is also the data the fold score is computed on.
    lgb.fit(
        X_train, y_train,
        eval_metric='rmse',
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=500,
    )

    fold_rmse = rmse(y_test, lgb.predict(X_test))
    lgb_scores.append(fold_rmse)

    # Sums, not means: the division by the fold count happens in a later cell.
    lgb_train_pred = lgb_train_pred + lgb.predict(X)
    lgb_test_pred = lgb_test_pred + lgb.predict(test)

In [None]:
#accuracy1=lgb.score(X_train, y_train)

#accuracy2=lgb.score(X_test, y_test)

#print(accuracy1,accuracy1=2)

In [None]:
# Mean validation RMSE across the LightGBM folds.
np.mean(lgb_scores)

In [None]:
# Average the summed test predictions over the folds that were actually run.
# len(lgb_scores) has one entry per completed fold, so the divisor stays
# correct if the number of splits changes (it used to be hard-coded as 23).
final = lgb_test_pred / len(lgb_scores)

In [None]:
# Display the sample-submission frame.
# NOTE(review): 'display.max_rows' is set to None earlier, so this renders every row.
sub

In [None]:
# View the averaged LightGBM test predictions as a single-column DataFrame.
pd.DataFrame(final)

In [None]:
# Store the averaged LightGBM predictions in the submission frame, write it
# to CSV without the index column, and preview the result.
sub['target'] = final

sub.to_csv('LgbSubmissionn.csv', index=False)
sub.head()

In [None]:
# Integer version of the predictions (fractional part truncated, as before).
# int8 can only hold -128..127, so any larger prediction would overflow;
# int64 keeps the same truncation without that limit.
final = np.array(final, dtype=np.int64)

In [None]:
# Write a second LightGBM submission file using the integer-cast predictions.
sub['target'] = final

sub.to_csv('LgbSubmissio.csv', index=False)
sub.head()

## PC vs LB

***My best model was Lightgbm after I performed cross validation of 23 splits***

Things you can vary to get better results

* num_leaves = [100 - 200],
* min_data_in_leaf = [3, 5, 10, 20, 30, 40],
* learning_rate = [0.05,0.03,0.075], 
* feature_fraction = [0.60, 0.65, 0.50, 0.40, 0.35, 0.30],
* num_iterations = [4000, 3500, 3000, 2500, 2200]

***XGBoost CrossVal***

In [None]:
# 20 contiguous, unshuffled folds for the XGBoost cross-validation.
# `random_state` is no longer passed: it has no effect when shuffle=False,
# and current scikit-learn raises a ValueError for that combination.
max_iter = 20
kf = KFold(n_splits=max_iter, shuffle=False)

In [None]:
# Per-fold validation RMSE, plus running sums of the predictions made by each
# fold's model on the test frame and on the full training matrix.
xgb_scores = []
xgb_test_pred = np.zeros(len(test))
xgb_train_pred = np.zeros(len(train))

for fold, (tr_in, te_in) in enumerate(kf.split(X)):

    print(f"==================================Fold{fold}=============================================")
    X_train, y_train = X.iloc[tr_in], y.iloc[tr_in]
    X_test, y_test = X.iloc[te_in], y.iloc[te_in]

    # The validation fold is passed as the second evaluation set for early
    # stopping and is also the data the fold score is computed on.
    xgb.fit(
        X_train, y_train,
        eval_metric='rmse',
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=200,
    )

    fold_rmse = rmse(y_test, xgb.predict(X_test))
    xgb_scores.append(fold_rmse)

    # Sums, not means: the division by the fold count happens in a later cell.
    xgb_train_pred = xgb_train_pred + xgb.predict(X)
    xgb_test_pred = xgb_test_pred + xgb.predict(test)

In [None]:
# Mean validation RMSE across the XGBoost folds.
np.mean(xgb_scores)

In [None]:
# Average the summed test predictions over the folds that were actually run.
# len(xgb_scores) has one entry per completed fold, so the divisor stays
# correct if the number of splits changes (it used to be hard-coded as 20).
final1 = xgb_test_pred / len(xgb_scores)

In [None]:
# Store the averaged XGBoost predictions in the submission frame, write it
# to CSV without the index column, and preview the result.
sub['target'] = final1

sub.to_csv('XgbSubmissio.csv', index=False)
sub.head()