In [1]:
import os, boto3
import sagemaker
from sagemaker import get_execution_role

# Open a SageMaker session and look up the execution role of this notebook.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# Copy the competition archive from the S3 bucket into the working directory,
# reusing the object key as the local file name.
bucket_name = 'kaggle-vulcano'
archive_name = 'predict-volcanic-eruptions-ingv-oe.zip'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.download_file(archive_name, archive_name)

In [2]:
import zipfile
# Unpack the downloaded archive into the working directory. The context
# manager closes the archive's file handle as soon as extraction finishes
# (the bare ZipFile(...) call left it open until garbage collection).
with zipfile.ZipFile('predict-volcanic-eruptions-ingv-oe.zip') as archive:
    archive.extractall('.')

In [3]:
!pip install -U pip
!pip install seaborn
!pip install scikit-learn

Collecting pip
  Using cached pip-20.3.3-py2.py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3
    Uninstalling pip-20.3:
      Successfully uninstalled pip-20.3
Successfully installed pip-20.3.3


In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
def make_data(data, directory='train'):
    """Add per-sensor summary statistics for every segment to ``data`` in place.

    For each value in ``data.segment_id`` the file
    ``<directory>/<segment_id>.csv`` is read, and for every column ``x`` of
    that file nine features are computed: ``x_mean``, ``x_std``, ``x_min``,
    the 20/40/50/60/80 % quantiles (``x_20`` ... ``x_80``) and ``x_max``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``segment_id`` column. It is modified in place, so
        callers that ignore the return value still see the new columns.
    directory : str, default 'train'
        Folder that holds one CSV file per segment.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the feature columns added and every
        remaining NaN (in any column) replaced by 0.
    """
    quantile_suffixes = (
        (0.20, '_20'),
        (0.40, '_40'),
        (0.50, '_50'),
        (0.60, '_60'),
        (0.80, '_80'),
    )

    # Collect one record per segment and build the feature table once;
    # writing cell by cell through .loc is far slower and relied on the
    # frame having a default 0..n-1 index.
    records = []
    for segment in data.segment_id:
        file = pd.read_csv(os.path.join(directory, '{}.csv'.format(segment)))
        row = {}
        for x in file.columns:
            series = file[x]
            row[x + '_mean'] = series.mean()
            row[x + '_std'] = series.std()
            row[x + '_min'] = series.min()
            for q, suffix in quantile_suffixes:
                row[x + suffix] = series.quantile(q)
            row[x + '_max'] = series.max()
        records.append(row)

    if records:
        features = pd.DataFrame(records).astype(float)
        # Assign positionally (plain arrays) so the rows line up with the
        # iteration order above whatever index labels ``data`` carries.
        for column in features.columns:
            data[column] = features[column].to_numpy()

    data.fillna(0, inplace=True)
    return data

In [6]:
# Load the segment index together with its regression target.
train = pd.read_csv('train.csv')
print(train)

# Work on a copy so the frame read from disk stays untouched, then add the
# per-sensor summary features (make_data modifies its argument in place).
data = train.copy()
make_data(data)
print(data)

# Split the target and the identifier off from the feature matrix.
data_train = data.copy()
y_train = data_train['time_to_eruption']
segment_id = data_train['segment_id']
data_train = data_train.drop(columns=['time_to_eruption', 'segment_id'])

# Report every column whose minimum equals its maximum (a constant column).
cols = [name for name in data_train.columns
        if data_train[name].min() == data_train[name].max()]
print(cols)

# Remove the median ("_50") feature of each of the ten sensors.
median_cols = ['sensor_{}_50'.format(k) for k in range(1, 11)]
data_train = data_train.drop(columns=median_cols)
print(data_train.shape)

# Standardise the remaining features; the fitted scaler is kept so the
# same transformation can be applied to other data later.
std_scaler = StandardScaler()
std_data = std_scaler.fit_transform(data_train)
data_train = pd.DataFrame(std_data, columns=data_train.columns)
print(data_train)

      segment_id  time_to_eruption
0     1136037770          12262005
1     1969647810          32739612
2     1895879680          14965999
3     2068207140          26469720
4      192955606          31072429
...          ...               ...
4426   873340274          15695097
4427  1297437712          35659379
4428   694853998          31206935
4429  1886987043           9598270
4430  1100632800          20128938

[4431 rows x 2 columns]
      segment_id  time_to_eruption  sensor_1_mean  sensor_1_std  sensor_1_min  \
0     1136037770          12262005      -1.610323    303.096099       -1421.0   
1     1969647810          32739612       1.426126    438.360560       -2005.0   
2     1895879680          14965999       2.504592    241.575415       -1115.0   
3     2068207140          26469720       2.165797    221.967825       -1025.0   
4      192955606          31072429       0.073815    261.695935       -1190.0   
...          ...               ...            ...           ...      

In [7]:
# Linear-regression baseline. The printed score is computed on the same
# rows the model was fitted on.
reg_model = LinearRegression().fit(data_train, y_train)
print(reg_model.score(data_train, y_train))

0.21786763258042774


In [8]:
# A fixed seed makes the randomised fitting of the forest, and therefore the
# score and the predictions derived from it, reproducible across kernel
# restarts.
forest_model = RandomForestRegressor(random_state=42)
forest_model.fit(data_train, y_train)
# NOTE: this score is measured on the training rows themselves, so it says
# nothing about how the model performs on unseen segments.
print(forest_model.score(data_train, y_train))

0.971722107757283


In [9]:
# A fixed seed makes the boosted ensemble, and therefore the score and the
# predictions derived from it, reproducible across kernel restarts.
gbr_model = GradientBoostingRegressor(random_state=42)
gbr_model.fit(data_train, y_train)
# NOTE: this score is measured on the training rows themselves, so it says
# nothing about how the model performs on unseen segments.
print(gbr_model.score(data_train, y_train))

0.7149113902004618


In [10]:
# The sample submission supplies the segment ids to predict for.
submission_path = 'sample_submission.csv'
test = pd.read_csv(submission_path)
print(test)

      segment_id  time_to_eruption
0     1000213997                 0
1      100023368                 0
2     1000488999                 0
3     1001028887                 0
4     1001857862                 0
...          ...               ...
4515   996704281                 0
4516   997630809                 0
4517   998072137                 0
4518   998136924                 0
4519    99975682                 0

[4520 rows x 2 columns]


In [11]:
def make_data_test(test, directory='test'):
    """Add per-sensor summary statistics for every segment to ``test`` in place.

    Counterpart of ``make_data`` that reads from the ``test`` folder. For
    each value in ``test.segment_id`` the file
    ``<directory>/<segment_id>.csv`` is read, and for every column ``x`` of
    that file nine features are computed: ``x_mean``, ``x_std``, ``x_min``,
    the 20/40/50/60/80 % quantiles (``x_20`` ... ``x_80``) and ``x_max``.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with a ``segment_id`` column. It is modified in place, so
        callers that ignore the return value still see the new columns.
    directory : str, default 'test'
        Folder that holds one CSV file per segment.

    Returns
    -------
    pandas.DataFrame
        The same ``test`` object, with the feature columns added and every
        remaining NaN (in any column) replaced by 0.
    """
    quantile_suffixes = (
        (0.20, '_20'),
        (0.40, '_40'),
        (0.50, '_50'),
        (0.60, '_60'),
        (0.80, '_80'),
    )

    # Collect one record per segment and build the feature table once;
    # writing cell by cell through .loc is far slower and relied on the
    # frame having a default 0..n-1 index.
    records = []
    for segment in test.segment_id:
        file = pd.read_csv(os.path.join(directory, '{}.csv'.format(segment)))
        row = {}
        for x in file.columns:
            series = file[x]
            row[x + '_mean'] = series.mean()
            row[x + '_std'] = series.std()
            row[x + '_min'] = series.min()
            for q, suffix in quantile_suffixes:
                row[x + suffix] = series.quantile(q)
            row[x + '_max'] = series.max()
        records.append(row)

    if records:
        features = pd.DataFrame(records).astype(float)
        # Assign positionally (plain arrays) so the rows line up with the
        # iteration order above whatever index labels ``test`` carries.
        for column in features.columns:
            test[column] = features[column].to_numpy()

    test.fillna(0, inplace=True)
    return test

In [12]:
# make_data_test adds the feature columns to `test` in place and returns that
# same frame, so rebinding the name leaves it pointing at the same object.
test = make_data_test(test)
print(test)

      segment_id  time_to_eruption  sensor_1_mean  sensor_1_std  sensor_1_min  \
0     1000213997                 0       4.462476    306.174474       -1319.0   
1      100023368                 0       0.838103    335.591100       -1268.0   
2     1000488999                 0       0.794903    352.172248       -3806.0   
3     1001028887                 0       1.953717    259.030122       -1156.0   
4     1001857862                 0      -3.984584    254.306426       -1111.0   
...          ...               ...            ...           ...           ...   
4515   996704281                 0    -135.674672  29486.243321      -32767.0   
4516   997630809                 0      -2.666439    294.261065       -1291.0   
4517   998072137                 0       1.192280    371.072752       -3032.0   
4518   998136924                 0       3.218563    346.656999       -1423.0   
4519    99975682                 0      -2.715638    277.119322       -1158.0   

      sensor_1_20  sensor_1

In [13]:
# Keep the ids for the submission before the frame is reduced to features.
segment_id_test = test['segment_id']

# Select exactly the columns the scaler was fitted on, in the same order,
# instead of repeating the hard-coded drop list from the training cell; the
# two lists can no longer drift apart.
feature_columns = list(data_train.columns)
test_features = test[feature_columns]

# Apply the scaling learned on the training data (no re-fitting).
std_test = std_scaler.transform(test_features)
test = pd.DataFrame(std_test, columns=feature_columns)
print(test)

      sensor_1_mean  sensor_1_std  sensor_1_min  sensor_1_20  sensor_1_40  \
0          0.278462     -0.227990      0.378452      0.21096     0.212498   
1          0.076090     -0.208588      0.389010      0.17785     0.181237   
2          0.073678     -0.197652     -0.136376      0.21173     0.215103   
3          0.138382     -0.259085      0.412194      0.23252     0.228129   
4         -0.193190     -0.262201      0.421510      0.23329     0.238549   
...             ...           ...           ...          ...          ...   
4515      -7.546267     19.018172     -6.131529    -24.83186   -62.594059   
4516      -0.119590     -0.235848      0.384248      0.20634     0.204683   
4517       0.095866     -0.185186      0.023848      0.17939     0.186447   
4518       0.209006     -0.201290      0.356923      0.18478     0.189052   
4519      -0.122337     -0.247154      0.411780      0.21943     0.215103   

      sensor_1_60  sensor_1_80  sensor_1_max  sensor_2_mean  sensor_2_std  

In [14]:
# Linear-regression predictions paired with their segment ids.
reg_y_pred = reg_model.predict(test)
sub = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': reg_y_pred,
})
print(sub)

      segment_id  time_to_eruption
0     1000213997      2.313018e+07
1      100023368      2.726924e+07
2     1000488999      2.623266e+07
3     1001028887      2.845469e+07
4     1001857862      2.245029e+07
...          ...               ...
4515   996704281     -1.131163e+09
4516   997630809      2.466812e+07
4517   998072137      2.634089e+07
4518   998136924      2.714895e+07
4519    99975682      2.375390e+07

[4520 rows x 2 columns]


In [15]:
# Random-forest predictions paired with their segment ids.
forest_y_pred = forest_model.predict(test)
sub = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': forest_y_pred,
})
print(sub)

      segment_id  time_to_eruption
0     1000213997       22893169.79
1      100023368       36351686.59
2     1000488999       29578676.01
3     1001028887       22685492.60
4     1001857862       18624499.82
...          ...               ...
4515   996704281       28395614.40
4516   997630809       16500477.39
4517   998072137       15544886.67
4518   998136924       33646569.52
4519    99975682       29338832.89

[4520 rows x 2 columns]


In [16]:
# Gradient-boosting predictions paired with their segment ids.
gbr_y_pred = gbr_model.predict(test)
sub = pd.DataFrame({
    'segment_id': segment_id_test,
    'time_to_eruption': gbr_y_pred,
})
print(sub)

      segment_id  time_to_eruption
0     1000213997      2.189381e+07
1      100023368      2.421971e+07
2     1000488999      3.132068e+07
3     1001028887      2.515577e+07
4     1001857862      2.107951e+07
...          ...               ...
4515   996704281      1.994196e+07
4516   997630809      1.942544e+07
4517   998072137      1.713065e+07
4518   998136924      3.048425e+07
4519    99975682      2.301891e+07

[4520 rows x 2 columns]


In [17]:
!pip install tensorflow
!pip install tf-nightly

Collecting tensorflow
  Downloading tensorflow-2.4.0-cp36-cp36m-manylinux2010_x86_64.whl (394.7 MB)
[K     |███████████████████████▌        | 290.5 MB 126.3 MB/s eta 0:00:01    |████████████                    | 149.0 MB 68.8 MB/s eta 0:00:04

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 394.7 MB 114.2 MB/s eta 0:00:01[K     |████████████████████████████████| 394.7 MB 7.1 kB/s 
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting absl-py~=0.10
  Downloading absl_py-0.11.0-py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 108.3 MB/s eta 0:00:01
[?25hCollecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp36-cp36m-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 76.8 MB/s eta 0:00:01
[?25hCollecting keras-preprocessing~=1.1.2
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 2.6 MB/s  eta 0:00:01
[?25hCollecting opt-einsum~=3.3.0
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 400.4 MB 114.6 MB/s eta 0:00:01[K     |████████████████████████████████| 400.4 MB 3.8 kB/s 
Collecting gast==0.4.0
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting grpcio~=1.34.0
  Downloading grpcio-1.34.1-cp36-cp36m-manylinux2014_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 61.7 MB/s eta 0:00:01
[?25hCollecting h5py~=3.1.0
  Downloading h5py-3.1.0-cp36-cp36m-manylinux1_x86_64.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 87.7 MB/s eta 0:00:01
Collecting tb-nightly~=2.5.0.a
  Downloading tb_nightly-2.5.0a20210119-py3-none-any.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 70.9 MB/s eta 0:00:01
Collecting tf-estimator-nightly~=2.5.0.dev
  Downloading tf_estimator_nightly-2.5.0.dev2021012001-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 87.8 MB/s eta 0:00:01
Installing collected packages: grpcio, tf-estimator-nightly, tb-nightly, h5py, g

In [36]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, ZeroPadding2D, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import RMSprop, Adam

In [42]:
# Declared shape of one input sample; Flatten turns it into 8*10*1 = 80 values.
# NOTE(review): the training cell passes the 2-D feature table directly rather
# than samples of this shape - presumably it has 80 columns; confirm.
VULCAN_INPUT_SIZE = (8,10,1)
model = Sequential([
    Flatten(input_shape=VULCAN_INPUT_SIZE),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1),
    # Linear output for regression. Softmax over a single unit always yields
    # 1.0, so the previous output layer could never fit the target.
    Activation('linear')
    ])
# 'accuracy' is kept so the recorded history keys stay the same, but it is not
# informative for a regression target; 'mae' is the metric to read.
model.compile(optimizer=Adam(), loss='mse', metrics=['accuracy', 'mae'])

In [43]:
# Train for 100 epochs with 10% of the rows held out for validation; the
# returned history object records the per-epoch loss and metrics.
fit_options = dict(epochs=100, validation_split=0.1, verbose=1)
history = model.fit(data_train, y_train, **fit_options)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78