### Importing Libraries and Loading Data

In [1]:
import pandas as  pd 
import numpy as np 
import matplotlib.pyplot as plt 
from  datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# load data 
train=pd.read_csv("data/train.csv")
sample_sub=pd.read_csv("data/Sample_sub.csv")
df=pd.read_csv("data/train.csv")

### Data Preprocessing

A very helpful Notebook on Data Preprocessing and data creation was found on the Discussion page of the competition
by Mohamed_Salam_Jedidi
https://drive.google.com/file/d/1N71TublXPk8c8ZAWAVHTGACSa7uLJcB4/view?usp=sharing

In [2]:
sample_sub.head()

Unnamed: 0,ID,Capture_Number
0,CaptureSite_0_201901,7
1,CaptureSite_0_201902,1
2,CaptureSite_0_201903,5
3,CaptureSite_0_201904,2
4,CaptureSite_0_201905,3


In [3]:
# split Id to Year woy and CaptureSite
sample_sub["year_woy"]=(sample_sub.ID.apply(lambda x: x.split("_")[-1])).astype(int)
sample_sub["CaptureSite"]=sample_sub.ID.apply(lambda x: ("_").join(x.split("_")[0:-1]))

In [4]:
# Create Time features from Date_TimeCaught 
train["Date_TimeCaught"]=pd.to_datetime(train["Date_TimeCaught"])
print(train.Date_TimeCaught.min(),train.Date_TimeCaught.max())
train["year"]=train.Date_TimeCaught.dt.year
train["week_of_year"]=train.Date_TimeCaught.dt.weekofyear
train["year_woy"]=train.year*100+train.week_of_year

1998-04-17 00:00:00 2018-12-31 00:00:00


In [5]:
keys=pd.concat([train[["year_woy","CaptureSite"]],sample_sub[["year_woy","CaptureSite"]]])
# keys.sort_values(["CaptureSite","year_woy"],inplace=True)
CaptureSite_min_year_woy=keys.groupby("CaptureSite").year_woy.min().rename("year_woy").reset_index()
CaptureSite_min_year_woy.head()

Unnamed: 0,CaptureSite,year_woy
0,CaptureSite_0,200051
1,CaptureSite_1,200123
2,CaptureSite_10,200050
3,CaptureSite_11,199828
4,CaptureSite_12,200107


In [6]:
range_year_woy=pd.DataFrame()
range_year_woy["Date_TimeCaught"]=pd.date_range(start=train.Date_TimeCaught.min(),end=datetime(2019,10,31))
range_year_woy["year"]=range_year_woy.Date_TimeCaught.dt.year
range_year_woy["week_of_year"]=range_year_woy.Date_TimeCaught.dt.weekofyear
range_year_woy["year_woy"]=range_year_woy.year*100+range_year_woy.week_of_year
range_year_woy.drop_duplicates(["year_woy"],inplace=True)

In [7]:
final_data=[]
for site , year_woy in zip(CaptureSite_min_year_woy.CaptureSite.values,CaptureSite_min_year_woy.year_woy.values) :

    one_site_df=range_year_woy[range_year_woy.year_woy>=year_woy]
    one_site_df["CaptureSite"]=site
    final_data.append(one_site_df)
final_data=pd.concat(final_data)
len(final_data)

28048

In [8]:
Target=train.groupby(["year_woy","CaptureSite"]).CaptureSite.count().rename("Capture_Number").reset_index()
final_data=final_data.merge(Target,on=["year_woy","CaptureSite"],how="left")
final_data.Capture_Number.fillna(0,inplace=True)

In [9]:
final_data.Capture_Number.fillna(0).value_counts(True)

0.0     0.716308
1.0     0.147461
2.0     0.059469
3.0     0.030626
4.0     0.015759
5.0     0.009199
6.0     0.005990
7.0     0.005455
8.0     0.002603
9.0     0.002104
10.0    0.001604
11.0    0.000998
12.0    0.000606
14.0    0.000535
13.0    0.000499
15.0    0.000214
16.0    0.000178
17.0    0.000107
28.0    0.000036
22.0    0.000036
19.0    0.000036
35.0    0.000036
20.0    0.000036
21.0    0.000036
18.0    0.000036
23.0    0.000036
Name: Capture_Number, dtype: float64

In [10]:
train=final_data[final_data.year<2019]
test=final_data[final_data.year==2019]

In [11]:
filter_  = train['year']>2010

In [12]:
#SELECTING DATA AFTER 2010
train=train[filter_]
CaptureSite_train = train.CaptureSite
CaptureSite_test = test.CaptureSite

In [13]:
train = pd.get_dummies(columns = ['CaptureSite'], data = train)
test = pd.get_dummies(columns = ['CaptureSite'], data = test)

### Feature Creation

In [14]:
#finding the average capture for each Capture Site
train['CaptureSite'] = CaptureSite_train
test['CaptureSite'] = CaptureSite_test
mean_capture=train.groupby('CaptureSite').mean()[['Capture_Number']].reset_index()
mean_capture.columns=['CaptureSite','mean']
train=pd.merge(train,mean_capture,on=['CaptureSite'],how='left')
test=pd.merge(test,mean_capture,on=['CaptureSite'],how='left')

#Count of researchers and Species in Capture Site
feat=df[['Researcher','Species','CaptureSite' ]].groupby('CaptureSite').nunique()
feat.drop(columns='CaptureSite',axis=1,inplace=True)
feat=feat.reset_index()

train['period_of_year']=train['week_of_year']/9
test['period_of_year']=test['week_of_year']/9

In [15]:
test=pd.merge(test,feat,on=['CaptureSite'],how='left')
train=pd.merge(train,feat,on=['CaptureSite'],how='left')

In [16]:
test.drop(columns=['Date_TimeCaught','Capture_Number','CaptureSite'],inplace=True)
train.drop(columns=['Date_TimeCaught','CaptureSite'],inplace=True)

### Modeling

In [17]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor()
X = train.drop(columns='Capture_Number',axis=1)
y = train['Capture_Number']

In [18]:
cat_model.fit(X,y)
cat_pred=cat_model.predict(test)

0:	learn: 1.8668207	total: 138ms	remaining: 2m 18s
1:	learn: 1.8469439	total: 210ms	remaining: 1m 45s
2:	learn: 1.8290132	total: 263ms	remaining: 1m 27s
3:	learn: 1.8111186	total: 341ms	remaining: 1m 24s
4:	learn: 1.7940351	total: 442ms	remaining: 1m 28s
5:	learn: 1.7785221	total: 492ms	remaining: 1m 21s
6:	learn: 1.7634332	total: 559ms	remaining: 1m 19s
7:	learn: 1.7484825	total: 643ms	remaining: 1m 19s
8:	learn: 1.7343057	total: 711ms	remaining: 1m 18s
9:	learn: 1.7204707	total: 791ms	remaining: 1m 18s
10:	learn: 1.7082952	total: 883ms	remaining: 1m 19s
11:	learn: 1.6962787	total: 992ms	remaining: 1m 21s
12:	learn: 1.6848537	total: 1.08s	remaining: 1m 21s
13:	learn: 1.6735079	total: 1.15s	remaining: 1m 21s
14:	learn: 1.6629987	total: 1.22s	remaining: 1m 19s
15:	learn: 1.6534778	total: 1.29s	remaining: 1m 19s
16:	learn: 1.6441358	total: 1.34s	remaining: 1m 17s
17:	learn: 1.6347600	total: 1.4s	remaining: 1m 16s
18:	learn: 1.6265940	total: 1.46s	remaining: 1m 15s
19:	learn: 1.6188100	to

161:	learn: 1.4084981	total: 7.96s	remaining: 41.2s
162:	learn: 1.4082476	total: 8s	remaining: 41.1s
163:	learn: 1.4080903	total: 8.02s	remaining: 40.9s
164:	learn: 1.4078167	total: 8.06s	remaining: 40.8s
165:	learn: 1.4074995	total: 8.1s	remaining: 40.7s
166:	learn: 1.4069507	total: 8.13s	remaining: 40.6s
167:	learn: 1.4066506	total: 8.17s	remaining: 40.4s
168:	learn: 1.4064011	total: 8.2s	remaining: 40.3s
169:	learn: 1.4058319	total: 8.23s	remaining: 40.2s
170:	learn: 1.4056001	total: 8.26s	remaining: 40.1s
171:	learn: 1.4050082	total: 8.3s	remaining: 39.9s
172:	learn: 1.4047266	total: 8.33s	remaining: 39.8s
173:	learn: 1.4044832	total: 8.37s	remaining: 39.7s
174:	learn: 1.4039789	total: 8.4s	remaining: 39.6s
175:	learn: 1.4037660	total: 8.43s	remaining: 39.5s
176:	learn: 1.4036500	total: 8.47s	remaining: 39.4s
177:	learn: 1.4033044	total: 8.51s	remaining: 39.3s
178:	learn: 1.4030358	total: 8.54s	remaining: 39.2s
179:	learn: 1.4026986	total: 8.57s	remaining: 39.1s
180:	learn: 1.40213

320:	learn: 1.3662236	total: 13.3s	remaining: 28.1s
321:	learn: 1.3661187	total: 13.3s	remaining: 28.1s
322:	learn: 1.3658143	total: 13.4s	remaining: 28s
323:	learn: 1.3653935	total: 13.4s	remaining: 27.9s
324:	learn: 1.3652378	total: 13.4s	remaining: 27.9s
325:	learn: 1.3648158	total: 13.4s	remaining: 27.8s
326:	learn: 1.3645720	total: 13.5s	remaining: 27.7s
327:	learn: 1.3644023	total: 13.5s	remaining: 27.8s
328:	learn: 1.3642898	total: 13.6s	remaining: 27.8s
329:	learn: 1.3639952	total: 13.7s	remaining: 27.7s
330:	learn: 1.3639112	total: 13.7s	remaining: 27.7s
331:	learn: 1.3638143	total: 13.8s	remaining: 27.7s
332:	learn: 1.3636057	total: 13.8s	remaining: 27.7s
333:	learn: 1.3633037	total: 13.9s	remaining: 27.7s
334:	learn: 1.3629520	total: 13.9s	remaining: 27.6s
335:	learn: 1.3627557	total: 13.9s	remaining: 27.6s
336:	learn: 1.3625806	total: 14s	remaining: 27.5s
337:	learn: 1.3623733	total: 14s	remaining: 27.4s
338:	learn: 1.3621508	total: 14s	remaining: 27.4s
339:	learn: 1.361824

484:	learn: 1.3390691	total: 18.1s	remaining: 19.2s
485:	learn: 1.3389683	total: 18.1s	remaining: 19.2s
486:	learn: 1.3388873	total: 18.2s	remaining: 19.1s
487:	learn: 1.3388232	total: 18.2s	remaining: 19.1s
488:	learn: 1.3387217	total: 18.2s	remaining: 19s
489:	learn: 1.3385644	total: 18.2s	remaining: 19s
490:	learn: 1.3383492	total: 18.3s	remaining: 18.9s
491:	learn: 1.3382617	total: 18.3s	remaining: 18.9s
492:	learn: 1.3382436	total: 18.3s	remaining: 18.8s
493:	learn: 1.3380916	total: 18.3s	remaining: 18.8s
494:	learn: 1.3379995	total: 18.4s	remaining: 18.7s
495:	learn: 1.3379499	total: 18.4s	remaining: 18.7s
496:	learn: 1.3377847	total: 18.4s	remaining: 18.6s
497:	learn: 1.3376715	total: 18.4s	remaining: 18.6s
498:	learn: 1.3374778	total: 18.5s	remaining: 18.5s
499:	learn: 1.3371791	total: 18.5s	remaining: 18.5s
500:	learn: 1.3369898	total: 18.5s	remaining: 18.4s
501:	learn: 1.3368462	total: 18.5s	remaining: 18.4s
502:	learn: 1.3367937	total: 18.6s	remaining: 18.3s
503:	learn: 1.33

649:	learn: 1.3211150	total: 22.4s	remaining: 12.1s
650:	learn: 1.3210433	total: 22.4s	remaining: 12s
651:	learn: 1.3209687	total: 22.5s	remaining: 12s
652:	learn: 1.3209033	total: 22.5s	remaining: 11.9s
653:	learn: 1.3208124	total: 22.5s	remaining: 11.9s
654:	learn: 1.3207289	total: 22.5s	remaining: 11.9s
655:	learn: 1.3205701	total: 22.6s	remaining: 11.8s
656:	learn: 1.3205278	total: 22.6s	remaining: 11.8s
657:	learn: 1.3204481	total: 22.6s	remaining: 11.8s
658:	learn: 1.3204152	total: 22.6s	remaining: 11.7s
659:	learn: 1.3203505	total: 22.7s	remaining: 11.7s
660:	learn: 1.3202469	total: 22.7s	remaining: 11.6s
661:	learn: 1.3202184	total: 22.7s	remaining: 11.6s
662:	learn: 1.3201811	total: 22.7s	remaining: 11.6s
663:	learn: 1.3200355	total: 22.8s	remaining: 11.5s
664:	learn: 1.3199089	total: 22.8s	remaining: 11.5s
665:	learn: 1.3198442	total: 22.8s	remaining: 11.4s
666:	learn: 1.3198126	total: 22.8s	remaining: 11.4s
667:	learn: 1.3197272	total: 22.9s	remaining: 11.4s
668:	learn: 1.31

813:	learn: 1.3069755	total: 26.7s	remaining: 6.11s
814:	learn: 1.3068993	total: 26.8s	remaining: 6.07s
815:	learn: 1.3068632	total: 26.8s	remaining: 6.04s
816:	learn: 1.3068306	total: 26.8s	remaining: 6s
817:	learn: 1.3067529	total: 26.8s	remaining: 5.97s
818:	learn: 1.3067365	total: 26.9s	remaining: 5.93s
819:	learn: 1.3066579	total: 26.9s	remaining: 5.9s
820:	learn: 1.3065351	total: 26.9s	remaining: 5.87s
821:	learn: 1.3064608	total: 26.9s	remaining: 5.83s
822:	learn: 1.3064261	total: 27s	remaining: 5.8s
823:	learn: 1.3063817	total: 27s	remaining: 5.77s
824:	learn: 1.3063699	total: 27s	remaining: 5.73s
825:	learn: 1.3062994	total: 27s	remaining: 5.7s
826:	learn: 1.3062778	total: 27.1s	remaining: 5.66s
827:	learn: 1.3062498	total: 27.1s	remaining: 5.63s
828:	learn: 1.3061882	total: 27.1s	remaining: 5.59s
829:	learn: 1.3061259	total: 27.2s	remaining: 5.56s
830:	learn: 1.3060147	total: 27.2s	remaining: 5.53s
831:	learn: 1.3059188	total: 27.2s	remaining: 5.49s
832:	learn: 1.3058577	tota

973:	learn: 1.2951086	total: 31.1s	remaining: 831ms
974:	learn: 1.2950099	total: 31.2s	remaining: 799ms
975:	learn: 1.2949643	total: 31.2s	remaining: 767ms
976:	learn: 1.2949204	total: 31.2s	remaining: 735ms
977:	learn: 1.2948322	total: 31.2s	remaining: 703ms
978:	learn: 1.2948062	total: 31.3s	remaining: 671ms
979:	learn: 1.2947949	total: 31.3s	remaining: 639ms
980:	learn: 1.2947550	total: 31.3s	remaining: 607ms
981:	learn: 1.2946614	total: 31.3s	remaining: 575ms
982:	learn: 1.2946003	total: 31.4s	remaining: 543ms
983:	learn: 1.2945173	total: 31.4s	remaining: 511ms
984:	learn: 1.2944062	total: 31.4s	remaining: 479ms
985:	learn: 1.2943895	total: 31.5s	remaining: 447ms
986:	learn: 1.2943377	total: 31.5s	remaining: 415ms
987:	learn: 1.2942760	total: 31.5s	remaining: 383ms
988:	learn: 1.2942106	total: 31.5s	remaining: 351ms
989:	learn: 1.2941433	total: 31.6s	remaining: 319ms
990:	learn: 1.2941085	total: 31.6s	remaining: 287ms
991:	learn: 1.2940449	total: 31.6s	remaining: 255ms
992:	learn: 

In [19]:
from lightgbm import LGBMRegressor
model = LGBMRegressor(max_depth=4,
                      learning_rate=0.17,
                      n_estimators=1000,
                      min_child_weight =80,
                      colsample_bytree=0.5,
                      random_state=50,colsample_bylevel=0.6)
model.fit(X,y)
lgb_pred = model.predict(test)

In [20]:
sample_sub = pd.read_csv('data/Sample_sub.csv')
pred = 0.3*lgb_pred + 0.7*cat_pred
sample_sub['Capture_Number']= pred

In [21]:
sample_sub.to_csv('sub5.csv',index=False)