In [None]:
!pip install ProgressBar
!pip install lifelines

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob 
from progressbar import ProgressBar
import lifelines
import os

In [None]:
# Training labels table; `prepare('train')` left-merges it onto the features by `segment_id`.
train_csv = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train.csv')

In [None]:
def prepare(name):
    """Build a one-row-per-file table of column means for a data split.

    Every file matching ``../input/predict-volcanic-eruptions-ingv-oe/<name>/*``
    is read with ``pd.read_csv`` and reduced to the mean of each of its columns.
    The feature columns of the result are labelled with consecutive integers
    starting at 0 (following the column order of the files), and the integer
    parsed from each file name is stored in ``segment_id``.

    Parameters
    ----------
    name : str
        Sub-directory of the competition input folder to scan. When it equals
        ``'train'`` the module-level ``train_csv`` frame is left-merged onto
        the result on ``segment_id``.

    Returns
    -------
    pandas.DataFrame
        Feature columns ``0 .. k-1``, ``segment_id`` and, for ``'train'``,
        the columns contributed by ``train_csv``.

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern (previously this surfaced as an
        unrelated ``AttributeError`` further down).
    """
    pattern = "../input/predict-volcanic-eruptions-ingv-oe/{}/*".format(name)
    frag = glob.glob(pattern)
    if not frag:
        raise FileNotFoundError("no segment files match {!r}".format(pattern))

    # Collect one vector of column means per file and stack them once at the
    # end; appending to a growing array inside the loop copies it every time.
    pbar = ProgressBar()
    rows = [pd.read_csv(path).mean().to_numpy() for path in pbar(frag)]

    # The width comes from the data itself instead of a hard-coded 10.
    df = pd.DataFrame(np.vstack(rows))

    # The file name without its extension is the segment id.
    df['segment_id'] = [
        int(os.path.splitext(os.path.basename(path))[0]) for path in frag
    ]

    if name == 'train':
        df = pd.merge(df, train_csv, on=['segment_id'], how='left')
    return df

In [None]:
# Build the per-file mean feature tables for both splits; each call reads every file of its split.
train_means = prepare('train')
test_means = prepare('test')

In [None]:
# Recompute the scaled target from the untouched labels in `train_csv` instead of
# floor-dividing the column by itself, so re-running this cell cannot shrink the
# values a second time.
TIME_SCALE = 6000  # NOTE(review): the unit this divisor converts to is not stated - confirm
raw_target = train_means['segment_id'].map(
    train_csv.set_index('segment_id')['time_to_eruption']
)
train_means['time_to_eruption'] = raw_target // TIME_SCALE
train_means.head(10)

In [None]:
# Preview the first ten rows of the test feature table.
test_means.head(10)

In [None]:
def remove_na(dataset, columns=None):
    """Return a copy of ``dataset`` with missing feature values mean-imputed.

    Each selected column has its NaNs replaced by that column's own mean
    (computed ignoring NaNs). After imputation the number of remaining nulls
    is printed for every column of the frame, one integer per line.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to clean. It is not modified; a filled copy is returned.
    columns : iterable, optional
        Labels of the columns to impute. By default every column whose label
        is an integer is used, which covers the ``0 .. k-1`` feature columns
        without assuming there are exactly ten of them.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    if columns is None:
        columns = [c for c in dataset.columns if isinstance(c, (int, np.integer))]
    else:
        columns = list(columns)

    filled = dataset.copy()
    if columns:
        # fillna with a Series of means fills each column with its own mean.
        feature_means = filled[columns].mean()
        filled[columns] = filled[columns].fillna(feature_means)

    for col in filled.columns:
        print(int(filled[col].isnull().sum()))

    return filled

In [None]:
# Mean-impute the feature columns of both tables; `remove_na` also prints the null count left in every column.
train_means=remove_na(train_means)
test_means=remove_na(test_means)

In [None]:
# Regression target: the time-to-eruption label. `segment_id` is only the file
# identifier and must not be used as the target.
# NOTE(review): an earlier cell floor-divides this column by 6000 and
# `estimators.sub` writes predictions as-is, so predictions are in those scaled
# units - multiply back if the submission expects raw values.
y = train_means['time_to_eruption']
# Features: every column except the identifier and the target.
x = train_means.drop(columns=['segment_id','time_to_eruption'])

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error as mae
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb


class estimators:
    """Small collection of regressors sharing one hold-out protocol.

    Each fitting method (``lin_reg``, ``extreme``, ``lgb``) splits ``x``/``y``
    into the first 80 % of rows for training and the last 20 % for validation
    (no shuffling), fits its model, prints the validation mean absolute error,
    predicts ``test`` and stores the result in ``self.pred``, replacing any
    earlier predictions. ``sub`` writes the stored predictions to a submission
    file.
    """

    def __init__(self):
        # Predictions live on the instance. A class-level list would be shared
        # by every instance, and the old `del self.pred[:]` reset failed with
        # ValueError once `pred` had become a NumPy array, i.e. on the second
        # fit with the same object.
        self.pred = np.array([])

    @staticmethod
    def _holdout(x, y):
        """Return ``Xt, Xv, Yt, Yv`` for an ordered 80/20 train/validation split."""
        return train_test_split(x, y, test_size=0.2, shuffle=False)

    def _store(self, prediction):
        """Replace the stored predictions with ``prediction`` as a flat float array."""
        self.pred = np.asarray(prediction, dtype=float).ravel()

    def lin_reg(self, x, y, test):
        """Fit an ordinary least-squares model and store its predictions for ``test``."""
        Xt, Xv, Yt, Yv = self._holdout(x, y)
        reg = linear_model.LinearRegression()
        reg.fit(Xt, Yt)
        print('mean abosulte error: ', mae(Yv, reg.predict(Xv)))
        self._store(reg.predict(test))

    def extreme(self, x, y, test):
        """Fit an XGBoost regressor with early stopping and store its predictions for ``test``."""
        # `subsample` must be lowercase: the previous `SUBSAMPLE` keyword is not
        # an XGBoost parameter, so row subsampling was never applied.
        model = xgb.XGBRegressor(
            n_estimators=100000,
            max_depth=8,
            learning_rate=0.05,
            alpha=0.1,
            subsample=0.6,
        )  # tree_method='gpu_hist'
        Xt, Xv, Yt, Yv = self._holdout(x, y)
        # n_estimators is only an upper bound; training stops once the
        # validation MAE has not improved for 10 rounds.
        # NOTE(review): newer xgboost releases expect early_stopping_rounds and
        # eval_metric on the constructor - confirm against the installed version.
        model.fit(
            Xt, Yt,
            early_stopping_rounds=10,
            eval_metric='mae',
            eval_set=[(Xv, Yv)],
            verbose=False,
        )
        # Report the hold-out error like the other estimators do.
        print('mean absolute error:', mae(Yv, model.predict(Xv)))
        self._store(model.predict(test))

    def lgb(self, x, y, test):
        """Fit a LightGBM regressor and store its predictions for ``test``.

        Inside this body the name ``lgb`` resolves to the imported lightgbm
        module, not to this method.
        """
        Xt, Xv, Yt, Yv = self._holdout(x, y)

        params = {
            'objective': 'regression',  # continuous target
            'max_bin': 600,             # maximum number of bins feature values are bucketed into
            'learning_rate': 0.02,      # step size of each boosting iteration
            'num_leaves': 80,           # maximum number of leaves per tree
            'metric': 'mae',
        }

        # lightgbm trains on its own Dataset objects, so wrap both splits.
        lgb_train = lgb.Dataset(Xt, Yt)
        lgb_eval = lgb.Dataset(Xv, Yv, reference=lgb_train)

        model = lgb.train(
            params, lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=0,
            num_boost_round=1500,        # upper bound on boosting iterations
            early_stopping_rounds=1000,  # stop when the metric has not improved for 1000 rounds
        )

        y_pred = model.predict(Xv)
        y_true = np.array(Yv)
        print('mean absolute error:', mae(y_true, y_pred))

        self._store(model.predict(test))

    def sub(self, test):
        """Write the stored predictions as ``sample_submission.csv`` and return the table.

        Predictions are paired with ``test['segment_id']`` by position and
        arranged in the row order of the competition's sample submission file.

        Raises
        ------
        ValueError
            If the number of stored predictions differs from the number of
            rows in ``test`` (for example when no model has been fitted yet).
        """
        pred = np.asarray(self.pred, dtype=float).ravel()
        if len(pred) != len(test):
            raise ValueError(
                'expected {} predictions but found {}'.format(len(test), len(pred))
            )

        # Pair by position so a non-default index on `test` cannot misalign rows.
        predicted = pd.DataFrame({
            'segment_id': test['segment_id'].to_numpy(),
            'time_to_eruption': pred,
        })
        sample_submission = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv')

        # Keep only the ids of the sample file and attach the predictions to them.
        sample_submission = pd.merge(sample_submission[['segment_id']], predicted, on=['segment_id'])
        sample_submission.to_csv('sample_submission.csv', header=True, index=False)
        print('saved')
        return sample_submission

In [None]:
# Linear-regression run: fit on (x, y), predict from the first ten columns of the
# test table, then write the predictions to sample_submission.csv.
est = estimators()
est.lin_reg(x,y,test_means.iloc[:,0:10])
est.sub(test_means)

In [None]:
# XGBoost run on a fresh estimator; `sub` writes sample_submission.csv again, replacing the previous file.
est = estimators()
est.extreme(x,y,test_means.iloc[:,0:10])
est.sub(test_means)

In [None]:
# LightGBM run on a fresh estimator; `sub` writes sample_submission.csv again, replacing the previous file.
est = estimators()
est.lgb(x,y,test_means.iloc[:,0:10])
est.sub(test_means)