# import required libraries

In [None]:
import pandas as  pd 
import numpy as np 
import matplotlib.pyplot as plt 
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from xgboost import XGBRegressor
from IPython.display import FileLink
from datetime import datetime

# Load train and test data and explore them

In [None]:
# Read the three input CSVs from ./data: the capture records (train), the
# per-CaptureSite category table, and the sample submission file.
train=pd.read_csv("./data/train.csv")
category=pd.read_csv('./data/CaptureSite_category.csv')
sample_sub=pd.read_csv("./data/Sample_sub.csv")

In [None]:
# Preview the first rows of the sample submission.
sample_sub.head()

In [None]:
# Preview the first rows of the capture records.
train.head()

In [None]:
# Preview the first rows of the capture-site category table.
category.head()

# Data preprocessing and feature engineering

In [None]:
# Break every submission ID apart on underscores: the last piece is the
# numeric year/week key, and the pieces before it (re-joined with "_") form
# the capture-site name.
id_parts = sample_sub.ID.apply(lambda raw_id: raw_id.split("_"))
sample_sub["year_woy"] = id_parts.apply(lambda parts: parts[-1]).astype(int)
sample_sub["CaptureSite"] = id_parts.apply(lambda parts: "_".join(parts[:-1]))

In [None]:
# Create time features from Date_TimeCaught.
train["Date_TimeCaught"] = pd.to_datetime(train["Date_TimeCaught"])
train["year"] = train.Date_TimeCaught.dt.year
# Series.dt.weekofyear is deprecated and no longer available in recent pandas;
# dt.isocalendar().week yields the same ISO week number. It comes back as a
# nullable UInt32 column, so cast to a plain int to keep the integer
# arithmetic below unchanged.
# NOTE(review): the int cast assumes Date_TimeCaught has no missing values.
train["week_of_year"] = train.Date_TimeCaught.dt.isocalendar().week.astype(int)
# Single sortable integer key of the form YYYYWW (calendar year * 100 + week).
train["year_woy"] = train.year * 100 + train.week_of_year

In [None]:
# Stack the (year_woy, CaptureSite) pairs found in the training records and in
# the sample submission, then keep the smallest year_woy seen for each site.
key_columns = ["year_woy", "CaptureSite"]
keys = pd.concat([train[key_columns], sample_sub[key_columns]])
CaptureSite_min_year_woy = (
    keys.groupby("CaptureSite")["year_woy"]
    .min()
    .rename("year_woy")
    .reset_index()
)
CaptureSite_min_year_woy.head()

In [None]:
# Build a calendar with one row per year_woy key, covering every day from the
# earliest capture date up to 2019-10-31.
range_year_woy = pd.DataFrame()
range_year_woy["Date_TimeCaught"] = pd.date_range(
    start=train.Date_TimeCaught.min(), end=datetime(2019, 10, 31)
)
range_year_woy["year"] = range_year_woy.Date_TimeCaught.dt.year
# Series.dt.weekofyear is deprecated and no longer available in recent pandas;
# dt.isocalendar().week yields the same ISO week number. Cast the nullable
# UInt32 result to a plain int so the key arithmetic stays integer.
range_year_woy["week_of_year"] = (
    range_year_woy.Date_TimeCaught.dt.isocalendar().week.astype(int)
)
range_year_woy["year_woy"] = range_year_woy.year * 100 + range_year_woy.week_of_year
# Keep only the first date encountered for each year_woy key.
range_year_woy = range_year_woy.drop_duplicates(["year_woy"])

In [None]:
# For every capture site, take the calendar rows from that site's first
# year_woy onwards and tag them with the site name, then stack all sites.
# .assign() returns a new frame, so the CaptureSite column is never written
# onto a filtered slice of range_year_woy (which triggers pandas'
# SettingWithCopy warning and is not guaranteed to behave as intended).
site_frames = [
    range_year_woy[range_year_woy.year_woy >= first_year_woy].assign(CaptureSite=site)
    for site, first_year_woy in zip(
        CaptureSite_min_year_woy.CaptureSite.values,
        CaptureSite_min_year_woy.year_woy.values,
    )
]
final_data = pd.concat(site_frames)

In [None]:
# Target: number of capture records per (year_woy, CaptureSite).
Target = (
    train.groupby(["year_woy", "CaptureSite"])
    .size()
    .rename("Capture_Number")
    .reset_index()
)
# Left merge keeps every site/week row of final_data; weeks without any
# capture record get NaN here and are set to 0 below.
final_data = final_data.merge(Target, on=["year_woy", "CaptureSite"], how="left")
# Assign the filled column back explicitly: calling fillna(inplace=True) on a
# column reached through attribute access is not guaranteed to update the
# parent frame.
final_data["Capture_Number"] = final_data["Capture_Number"].fillna(0)

In [None]:
# Derive day-of-month, weekday and month from each row's Date_TimeCaught.
caught_on = final_data.Date_TimeCaught.dt
final_data = final_data.assign(
    catching_day=caught_on.day,
    day_of_week=caught_on.weekday,
    catching_month=caught_on.month,
)

In [None]:
# Season label for each row, derived from catching_month.
# catching_month holds integer month numbers, so the lookup keys are integers
# (comparing them against strings such as '01' never matches). The mapping is
# applied to the whole column at once, so every row gets its own season.
season_by_month = {
    **dict.fromkeys((12, 1, 2), 'hiver'),
    **dict.fromkeys((3, 4, 5), 'printemps'),
    **dict.fromkeys((6, 7, 8), 'ete'),
    **dict.fromkeys((9, 10, 11), 'automne'),
}
final_data['saison'] = final_data['catching_month'].map(season_by_month)

In [None]:
# Read the raw training CSV again into df; the `train` frame has had extra
# columns added by the earlier cells.
df=pd.read_csv("./data/train.csv")

In [None]:
# Preview the first rows of the freshly loaded training data.
df.head()

In [None]:
# Per-site counts of non-null Researcher and Fisher entries in the raw data.
# Each table is built as a flat two-column frame (CaptureSite plus one count
# column): agg({'col': ['count']}) would produce two-level column labels, and
# merging those into the single-level final_data is rejected by recent pandas.
a = df.groupby('CaptureSite')['Researcher'].count().rename('Researcher_count').reset_index()
b = df.groupby('CaptureSite')['Fisher'].count().rename('Fisher_count').reset_index()

# Default (inner) merges: rows of final_data whose site is absent from df are dropped.
final_data = final_data.merge(a, on='CaptureSite')
final_data = final_data.merge(b, on='CaptureSite')

In [None]:
# Per-site counts of non-null CaptureSiteCategory and Type entries in the
# category table. The count columns get their own flat names so that they
# neither collide with the CaptureSiteCategory / Type columns merged in from
# `category` nor carry two-level column labels, which recent pandas refuses to
# merge into a single-level frame.
c = (
    category.groupby('CaptureSite')['CaptureSiteCategory']
    .count()
    .rename('CaptureSiteCategory_count')
    .reset_index()
)
d = category.groupby('CaptureSite')['Type'].count().rename('Type_count').reset_index()

# Attach the category columns themselves, then the two count columns.
final_data = final_data.merge(category, on='CaptureSite')
final_data = final_data.merge(c, on='CaptureSite')
final_data = final_data.merge(d, on='CaptureSite')

In [None]:
# The raw timestamp column is no longer needed once the date parts exist.
final_data = final_data.drop(columns=['Date_TimeCaught'])

In [None]:
# Number of rows with a non-null Capture_Number per season, as a flat
# (saison, saison_count) table. The count column is deliberately NOT called
# Capture_Number: merging a column of that name into final_data would make
# pandas suffix the real target column. A flat frame also avoids the two-level
# column labels produced by agg({'col': ['count']}).
e = final_data.groupby('saison')['Capture_Number'].count().rename('saison_count').reset_index()

In [None]:
# Attach the per-season table `e` to every row through its saison value.
final_data = pd.merge(final_data, e, on='saison')

In [None]:
# Preview the feature table after the merges above.
final_data.head()

In [None]:
# Numeric site id: the piece after the first underscore in the CaptureSite
# string. Computed for the whole column in one vectorised pass, which avoids a
# slow per-row .loc assignment and does not depend on the frame having a
# 0..n-1 index.
# NOTE(review): the column is now an integer dtype; the row-by-row version
# produced floats with the same values.
final_data['capturesite_number'] = (
    final_data['CaptureSite'].str.split('_').str[1].astype(int)
)

In [None]:
# Frequency encoding: replace each site by the share of rows it accounts for.
site_share = final_data['CaptureSite'].value_counts(normalize=True)
final_data['CaptureSite_FE'] = final_data['CaptureSite'].map(site_share)

# Label encoding and splitting data to train and test data

In [None]:
# Build one-hot encoded copies of final_data for week_of_year, CaptureSite and
# saison.
# NOTE(review): none of the three results is assigned, so final_data itself is
# left unchanged by this cell (in a notebook only the last result is
# displayed). Confirm whether `final_data = pd.get_dummies(...)` was intended;
# if not, these calls can be removed.
pd.get_dummies(final_data,columns=['week_of_year'])
pd.get_dummies(final_data,columns=['CaptureSite'])
pd.get_dummies(final_data,columns=['saison'])

In [None]:
# Replace every object-typed column by integer codes from a LabelEncoder that
# is fitted on that column's own values.
for column in final_data.columns:
    if final_data[column].dtype != 'object':
        continue
    encoder = preprocessing.LabelEncoder()
    final_data[column] = encoder.fit_transform(list(final_data[column].values))

In [None]:
# Training rows: everything dated before 2019.
train = final_data.loc[final_data["year"] < 2019]

In [None]:
# Test rows: the 2019 site/week combinations to be predicted.
test = final_data.loc[final_data["year"] == 2019]

In [None]:
# The test features must not contain the target column.
test = test.drop(columns=['Capture_Number'])

In [None]:
# Separate the feature matrix from the target column.
X_train = train.drop(columns=['Capture_Number'])
y_train = train['Capture_Number']

In [None]:
# Hold out 33% of the training rows (fixed seed) for validation, then wrap
# both parts as LightGBM datasets.
split_parts = train_test_split(X_train, y_train, test_size=0.33, random_state=18)
x_train, x_val, y_train, y_val = split_parts
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_eval = lgb.Dataset(x_val, label=y_val)

In [None]:
# Preview the encoded feature table.
final_data.head()

# Preparing and training the LightGBM and XGBoost models

In [None]:
# LightGBM regression settings, evaluated with RMSE.
# NOTE(review): 'reg_lambda' and 'lambda' both appear below; LightGBM's
# parameter documentation lists both as aliases of lambda_l2, so only one of
# the two values can take effect — confirm which is intended.
# NOTE(review): 'n_estimators' looks like an alias for the number of boosting
# rounds — confirm it is honoured when passed through `params`.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.024,
    'num_leaves': 80,
    'verbose': 0,
    'reg_alpha': 0.4,
    'reg_lambda': 0.4,
    'max_depth': -1,
    'lambda': 0.03,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.4,
    'min_data_in_leaf': 120,
    'n_estimators': 1000,
}
# Train with the held-out set as validation data; stop after 80 rounds without
# improvement and log every 80 rounds.
# NOTE(review): the early_stopping_rounds / verbose_eval keywords depend on
# the installed LightGBM version — confirm they are still accepted.
model1 = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_eval,
    early_stopping_rounds=80,
    verbose_eval=80,
)

In [None]:
# Second model: an XGBoost regressor built from the settings below.
# NOTE(review): both learning_rate and eta are supplied; XGBoost documents eta
# as an alias of learning_rate, so it is unclear which value wins — confirm.
xgb_settings = {
    "max_depth": 2,
    "learning_rate": 0.07,
    "n_estimators": 1000,
    "min_child_weight": 300,
    "colsample_bytree": 1,
    "subsample": 0.8,
    "eta": 0.3,
    "seed": 42,
    "gamma": 10,
}
model2 = XGBRegressor(**xgb_settings)

# Track RMSE on both the training and the validation split, and stop once the
# score has not improved for 10 rounds.
evaluation_pairs = [(x_train, y_train), (x_val, y_val)]
model2.fit(
    x_train,
    y_train,
    eval_metric="rmse",
    eval_set=evaluation_pairs,
    verbose=True,
    early_stopping_rounds=10,
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eta=0.3, gamma=10,
       importance_type='gain', learning_rate=0.07, max_delta_step=0,
       max_depth=2, min_child_weight=300, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=None,
       subsample=0.8, verbosity=1)

# Making predictions using the trained models

In [None]:
# Predict the test rows with both trained models and average the results.
# The LightGBM booster is bound to `model1`; no variable called `model`
# exists, so the first prediction must go through model1.
test_preds1 = model1.predict(test)
test_preds2 = model2.predict(test)
final_predictions = (test_preds1 + test_preds2) / 2

In [None]:
# Store the blended predictions as the submission's Capture_Number column.
# NOTE(review): values are assigned by position, so this presumably relies on
# the rows of `test` being in the same order as the rows of sample_sub — verify.
sample_sub = sample_sub.assign(Capture_Number=final_predictions)

# Creating a submission

In [None]:
def create_submission(submission_file, submission_name):
    """Save a submission table as CSV and return a notebook link to it.

    Parameters
    ----------
    submission_file : object with a ``to_csv`` method (a DataFrame here)
        Table to write; the index is not written.
    submission_name : str
        File name without extension; ".csv" is appended.

    Returns
    -------
    FileLink
        IPython link object pointing at the written CSV file.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)

In [None]:
# Write sample_sub to hfe16.csv and show a link to the file.
create_submission(sample_sub, 'hfe16')