In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import logging
import datetime
import time

In [2]:
class DataHandler:
    """
    Load the earthquakes dataset from a CSV file.

    Attributes:
        path: location of the CSV file read by ``get_data``.
        data: the loaded ``pandas.DataFrame``; ``None`` until ``get_data``
            (or ``get_process_data``) has been called.
    """

    # Kept at class level (not only in __init__) so that subclasses which do
    # not call DataHandler.__init__ still resolve ``self.path``.
    path = '../earthquakes.csv'

    def __init__(self, path: str = None):
        """
        Args:
            path: optional CSV location; when omitted the class default
                ``'../earthquakes.csv'`` is used.
        """
        if path is not None:
            self.path = path
        self.data = None

    def get_data(self):
        """Read the CSV at ``self.path`` into ``self.data`` and print its shape."""
        print(" - - - fetch data: - - - ")
        self.data = pd.read_csv(self.path)
        print(" - - - data loaded - - - \nFiles : earthquakes {}".format(self.data.shape))

    def get_process_data(self):
        """Load the data via ``get_data`` and print a completion message."""
        self.get_data()
        print(" - - - data processed - - - ")

In [3]:
# Read the earthquakes CSV; the resulting DataFrame is stored on `data.data`.
data = DataHandler()
data.get_process_data()

 - - - fetch data: - - - 
 - - - data loaded - - - 
Files : earthquakes (23412, 21)
 - - - data processed - - - 


In [4]:
class FeatureRecipe(DataHandler):
    """
    Feature processing class.

    Starting from the raw earthquakes frame, ``prepare_data`` drops unusable
    columns, replaces ``Date``/``Time`` with a numeric ``Timestamp`` and
    label-encodes the categorical columns. Every step rebinds ``self.data``
    to a new frame, so the frame passed to the constructor is not modified.

    Attributes:
        data: the frame being processed.
        encoders: ``{column name: fitted LabelEncoder}``, filled by
            ``encode_categorical_variable`` so the integer codes can be mapped
            back to labels with ``inverse_transform``.
    """

    # Columns dropped unconditionally by drop_useless.
    USELESS_COLUMNS = ('ID', 'Location Source', 'Magnitude Source', 'Magnitude Type')
    # Columns converted to integer codes by encode_categorical_variable.
    CATEGORICAL_COLUMNS = ('Type', 'Source', 'Status')

    def __init__(self, data: pd.DataFrame):
        """
        Args:
            data: raw frame to process (e.g. ``DataHandler().data`` after loading).
        """
        super().__init__()
        self.data = data
        self.category = None
        self.discrete_variable = None
        self.continous_variable = None
        self.encoders = {}

    def _all_nan_columns(self):
        """Return the names of the columns of ``self.data`` that contain only NaN."""
        # DataFrame.iteritems() was removed in pandas 2.0; Series.items() is
        # available on every pandas version.
        dropped_nan_col = [
            name for name, is_empty in self.data.isna().all().items() if is_empty
        ]
        print("{} feature have 100% NaN ".format(len(dropped_nan_col)))
        return dropped_nan_col

    def _sparse_columns(self, threshold: float):
        """Return the names of the columns whose NaN ratio is strictly above ``threshold``."""
        nan_ratio = self.data.isna().mean()
        bf = [name for name, ratio in nan_ratio.items() if ratio > threshold]
        print("{} feature have more than {} NaN ".format(len(bf), threshold))
        print('\n\n - - - features - - -  \n {}'.format(bf))
        return bf

    #Useless feature
    def drop_useless(self, threshold: float = 0.25):
        """
        Drop useless columns.

        Three passes, each computed on the result of the previous one:
        the fixed ``USELESS_COLUMNS``, columns that are entirely NaN, and
        columns whose NaN ratio exceeds ``threshold``.

        Args:
            threshold: maximum tolerated NaN ratio per column (default 0.25).

        Raises:
            KeyError: if one of ``USELESS_COLUMNS`` is absent from the frame.
        """
        self.data = self.data.drop(list(self.USELESS_COLUMNS), axis=1)
        self.data = self.data.drop(self._all_nan_columns(), axis=1)
        self.data = self.data.drop(self._sparse_columns(threshold), axis=1)
        print(self.data)
        print("- - - drop useless columns - - - ")

    def convert_timestamp(self):
        """
        Convert ``Date`` + ``Time`` to a float ``Timestamp`` column.

        Rows whose date/time cannot be parsed with ``'%m/%d/%Y %H:%M:%S'``
        (wrong format, missing value, or out of range for the platform) are
        removed. ``Date`` and ``Time`` are dropped afterwards.
        """
        timestamps = []
        for d, t in zip(self.data['Date'], self.data['Time']):
            try:
                ts = datetime.datetime.strptime(d + ' ' + t, '%m/%d/%Y %H:%M:%S')
                # time.mktime interprets the struct as local time, so the
                # value depends on the machine's timezone.
                timestamps.append(time.mktime(ts.timetuple()))
            except (ValueError, TypeError, OverflowError):
                # NaN (not a string marker) keeps the column numeric.
                timestamps.append(np.nan)

        # A positional float array sidesteps index alignment; assign/drop/dropna
        # each return a new frame, so later column assignments do not hit a
        # filtered view of the previous one.
        self.data = (
            self.data
            .assign(Timestamp=np.array(timestamps, dtype=float))
            .drop(['Date', 'Time'], axis=1)
            .dropna(subset=['Timestamp'])
        )
        print(self.data.head())
        print("- - - convert timestamp ---")

    def encode_categorical_variable(self):
        """
        Convert categorical variables to numeric variables.

        Each column in ``CATEGORICAL_COLUMNS`` gets its own ``LabelEncoder``;
        the fitted encoders are kept in ``self.encoders``.
        """
        encoded = {}
        for column in self.CATEGORICAL_COLUMNS:
            # One encoder per column: refitting a shared encoder would lose
            # the label mapping of the previous column.
            encoder = preprocessing.LabelEncoder()
            encoded[column] = encoder.fit_transform(self.data[column])
            self.encoders[column] = encoder
        self.data = self.data.assign(**encoded)
        print(self.data)
        print('- - - encoding variables - - -')

    def prepare_data(self):
        """
        Run the whole pipeline: drop_useless, convert_timestamp, then
        encode_categorical_variable.
        """
        self.drop_useless()
        self.convert_timestamp()
        self.encode_categorical_variable()
        print("- - - data processed - - -")

In [5]:
# Run the feature pipeline (column drops, timestamp conversion, label encoding)
# on the loaded frame; the processed result is available as `recipe.data`.
recipe = FeatureRecipe(data.data)
recipe.prepare_data()

0 feature have 100% NaN 
8 feature have more than 0.25 NaN 


 - - - features - - -  
 ['Depth Error', 'Depth Seismic Stations', 'Magnitude Error', 'Magnitude Seismic Stations', 'Azimuthal Gap', 'Horizontal Distance', 'Horizontal Error', 'Root Mean Square']
             Date      Time  Latitude  Longitude        Type   Depth  \
0      01/02/1965  13:44:18   19.2460   145.6160  Earthquake  131.60   
1      01/04/1965  11:29:49    1.8630   127.3520  Earthquake   80.00   
2      01/05/1965  18:05:58  -20.5790  -173.9720  Earthquake   20.00   
3      01/08/1965  18:49:43  -59.0760   -23.5570  Earthquake   15.00   
4      01/09/1965  13:32:50   11.9380   126.4270  Earthquake   15.00   
...           ...       ...       ...        ...         ...     ...   
23407  12/28/2016  08:22:12   38.3917  -118.8941  Earthquake   12.30   
23408  12/28/2016  09:13:47   38.3777  -118.8957  Earthquake    8.80   
23409  12/28/2016  12:38:51   36.9179   140.4262  Earthquake   10.00   
23410  12/29/2016  22:

In [6]:
class FeatureExtractor:
    """
    Pick the model inputs and targets out of a prepared frame and split them
    into train and test sets.
    """
    def __init__(self, data: pd.DataFrame):
        """
            Input : pandas.DataFrame holding the feature and target columns
            The four split attributes stay None until split() is called.
        """
        self.X_train = self.X_test = self.y_train = self.y_test = None
        self.data = data

    def extract(self):
        """
            Return (x, y): the six input columns and the two target columns
        """
        feature_columns = ["Latitude", "Longitude", "Timestamp", "Source", "Status", "Type"]
        target_columns = ["Magnitude", "Depth"]
        return self.data[feature_columns], self.data[target_columns]

    def split(self, size: float):
        """
            Train/test split with a fixed seed (random_state=42)
            Input : size, forwarded to train_test_split as test_size
            Output : X_train, X_test, y_train, y_test (also stored on the instance)
        """
        features, targets = self.extract()
        parts = train_test_split(features, targets, random_state=42, test_size=size)
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        return self.X_train, self.X_test, self.y_train, self.y_test

In [7]:
# Split the processed frame with test_size=0.1 (10% held out); the returned
# (X_train, X_test, y_train, y_test) tuple is shown as the cell output.
Fextractor = FeatureExtractor(recipe.data)
Fextractor.split(0.1)

(       Latitude  Longitude    Timestamp  Source  Status  Type
 20003   -37.551    -73.465  1.26848e+09      11       1     0
 18133    46.718    153.300  1.16363e+09      11       1     0
 4158      3.745    128.131  2.17446e+08      11       1     0
 18586   -14.177    -76.093  1.18733e+09      11       1     0
 8248    -28.344   -176.488  5.32039e+08      11       1     0
 ...         ...        ...          ...     ...     ...   ...
 11966   -10.233    113.569  7.71542e+08      11       1     0
 21578    55.435   -135.018  1.35963e+09      11       1     0
 5391     -6.495    129.367   3.1497e+08      11       1     0
 860      -5.469    153.269 -5.88747e+07       4       0     0
 15797    17.233   -101.250  1.01915e+09      11       1     0
 
 [21068 rows x 6 columns],
        Latitude  Longitude    Timestamp  Source  Status  Type
 13121   24.6020   122.2820  8.38672e+08      11       1     0
 17921   -0.3900   123.1950  1.15118e+09      11       1     0
 3419   -23.0220  -175.112

In [8]:
class ModelBuilder:
    """
    Train, evaluate, persist and reload a RandomForestRegressor.

    Attributes:
        model: the fitted estimator; ``None`` until ``train`` or a successful
            ``load_model``.
        model_path: default path prefix used by ``save_model`` / ``load_model``
            when they are called without an explicit path.
        save: flag supplied by the caller. It is only stored; nothing in this
            class saves automatically.
    """
    def __init__(self, model_path: str = None, save: bool = None):
        self.model = None
        self.model_path = model_path
        self.save = save

    def _model_file(self, path):
        """Build the model file name from ``path`` (or ``self.model_path``); None if neither is set."""
        prefix = self.model_path if path is None else path
        if prefix is None:
            return None
        # The prefix is concatenated as-is, so a directory must end with a
        # path separator (e.g. 'models/').
        return '{}model.joblib'.format(prefix)

    def train(self, X, Y):
        """Fit a RandomForestRegressor with default settings on (X, Y) and keep it in ``self.model``."""
        self.model = RandomForestRegressor().fit(X, Y)

    def predict_test(self, X) -> np.ndarray:
        """Return the predictions of the current model for X."""
        return self.model.predict(X)

    def save_model(self, path: str = None):
        """
        Dump the current model to ``'<path>model.joblib'``.

        Args:
            path: path prefix; defaults to ``self.model_path``.

        Raises:
            ValueError: if no path is given and no ``model_path`` was configured.
        """
        target = self._model_file(path)
        if target is None:
            raise ValueError("save_model needs a path (argument or model_path)")
        joblib.dump(self.model, target)

    def print_accuracy(self, X, Y):
        """
        Return ``self.model.score(X, Y)``.

        Despite its name this method does not print anything; the caller
        decides how to display the returned score.
        """
        return self.model.score(X, Y)

    def load_model(self, path: str = None):
        """
        Load a model previously written by ``save_model``.

        Args:
            path: path prefix; defaults to ``self.model_path``.

        Returns:
            The loaded model (also stored in ``self.model``), or ``None`` when
            no path is available or loading fails. Failures are logged as
            warnings instead of being silently swallowed.
        """
        source = self._model_file(path)
        if source is None:
            logging.warning("load_model: no path given and no model_path configured")
            return None
        try:
            # SECURITY: joblib.load unpickles the file; only load files you trust.
            loaded = joblib.load(source)
        except Exception as exc:
            # Best-effort by design: report the problem and leave self.model untouched.
            logging.warning("load_model: could not load %s (%s)", source, exc)
            return None
        self.model = loaded
        return loaded


In [9]:
# Train on the training split, report the score on the held-out split, then
# persist the fitted model.
# NOTE(review): MODEL_DIR is a machine-specific absolute path; point it at a
# directory that exists on your machine (keep the trailing '/').
MODEL_DIR = '/home/lilian/project_cloud_computing/ml/'

X_train, X_test, y_train, y_test = Fextractor.split(0.1)
m = ModelBuilder()
m.train(X_train, y_train)
# print_accuracy only returns the score, so display it explicitly.
test_score = m.print_accuracy(X_test, y_test)
print("Score on test set: {}".format(test_score))
# Keep the predictions so later cells can inspect them.
y_pred = m.predict_test(X_test)
m.save_model(MODEL_DIR)