In [2]:
#!/usr/bin/env python
# coding: utf-8
import numpy
import sklearn
import dateutil
import sqlite3
from sqlite3 import Error
from sklearn import preprocessing
import pandas as pd
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import platform
import os
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split


class DemandTrain:
    def __init__(self):
        # try:
        #     if platform.system() == 'Windows':
        self.Sql_connection1 = self.create_connection('/opt/apps/scripts/jaarvis_demand_supply/evo.db')

        #     elif platform.system() == 'Linux':
        #         self.Sql_connection1 = self.create_connection('/opt/apps/scripts/jaarvis_demand_supply/evo.db')
        # except Error as e:
        #     print(e)

        self.df = pd.read_sql_query("select * from zones;", self.Sql_connection1)

    def create_connection(self, db_file):
        """ create a database connection to the SQLite database
            specified by the db_file
        :param db_file: database file
        :return: Connection object or None
        """
        try:
            conn = sqlite3.connect(db_file)
            return conn
        except Error as e:
            print(e)
        return None

    def data_cleansing(self):
        self.df = self.df.dropna(how='any')
        self.df.time = self.df.time.astype(str).str[:2].astype(int)
        self.df.time = self.df.time.replace(regex=True, inplace=True, to_replace=r'\D', value=r'')
        self.df.time = [(x or -1) for x in self.df.time]
        self.df.zone = [(x or 'unknown') for x in self.df.zone]
        self.df.year = [(x or 0) for x in self.df.year]
        self.df.month = [(x or 0) for x in self.df.month]
        self.df.week = [(x or 0) for x in self.df.week]
        self.df.day = [(x or 0) for x in self.df.day]
        self.df.booked_vehicles = [(x or 0) for x in self.df.booked_vehicles]
        self.df.free_vehicles = [(x or 0) for x in self.df.free_vehicles]

        self.df = self.df.mask(self.df.eq('None')).dropna()
        self.df = self.df.fillna(0)

        return self.df

    def feature_addition(self):
        # no feature added
        pass

    def feature_elimination(self):
        # no feature deleted
        pass

    def training_data(self):

        self.data_cleansing()
        
        df=self.df
        le = preprocessing.LabelEncoder()
        le = le.fit(df.zone)
        numpy.save('classes.npy', le.classes_)        
        pd.DataFrame({'zone': df['zone'].unique()}).to_csv("zones_list.csv")
        df['zone'] = le.transform(df.zone.get_values())
        X = df[['zone','year', 'month', 'day', 'time']]       
#         X = df[['zone', 'day', 'time']]
        X = X.values
        sc_X = StandardScaler()
        X = sc_X.fit_transform(X)
                
        joblib.dump(sc_X,"scaler.save") 

        demand = df.booked_vehicles.values
        free_vehicles = df.free_vehicles.values

        return X, demand, free_vehicles

    def trainRF(self):
        X, demand, free_vehicles = self.training_data()
        X_train, X_test, y_train, y_test = train_test_split(X, demand, train_size=0.75, test_size=0.25)
        tpot = TPOTRegressor(generations=100, population_size=100,
                         offspring_size=None, mutation_rate=0.9,
                         crossover_rate=0.1,
                         scoring='neg_mean_squared_error', cv=5,
                         subsample=1.0, n_jobs=1,
                         max_time_mins=None, max_eval_time_mins=5,
                         random_state=None, config_dict=None,
                         warm_start=False,
                         memory=None,
                         use_dask=False,
                         periodic_checkpoint_folder=None,
                         early_stop=None,
                         verbosity=0,
                         disable_update_check=False)
        tpot.fit(X_train, y_train)
        print(tpot.score(X_test, y_test))
        tpot.export('tpot_demand_pipeline.py')

# Driver for this cell: building the object loads the ``zones`` table from the
# SQLite database, and trainRF() then cleans the data, runs the TPOT search,
# prints the hold-out score and exports ``tpot_demand_pipeline.py``.
# NOTE(review): with generations=100 and population_size=100 this is presumably
# a very long-running cell — confirm before re-running the notebook.
TrainingObj = DemandTrain()
TrainingObj.trainRF()



-100.99613904012061


In [4]:
# Show the kernel's working directory; the training cell writes its artifacts
# (classes.npy, zones_list.csv, scaler.save, tpot_demand_pipeline.py) using
# relative paths.
pwd()

'/opt/apps/Jaarvis_Demand_Supply'

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
# The file path and separator below look like unfilled template placeholders
# from the TPOT export; replace them before running this cell.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)

# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1).values
target = tpot_data['target'].values
(training_features, testing_features,
 training_target, testing_target) = train_test_split(features, target, random_state=None)

# Average CV score on the training set was:-113.19816539397762
# Steps are listed in execution order: three stacked estimators, then a
# degree-2 polynomial expansion, then the final random forest.
pipeline_steps = [
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.75, tol=0.01)),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            alpha=0.99,
            learning_rate=0.01,
            loss="lad",
            max_depth=6,
            max_features=0.7500000000000001,
            min_samples_leaf=18,
            min_samples_split=13,
            n_estimators=100,
            subsample=0.05,
        )
    ),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestRegressor(
        bootstrap=True,
        max_features=0.8500000000000001,
        min_samples_leaf=18,
        min_samples_split=11,
        n_estimators=100,
    ),
]
exported_pipeline = make_pipeline(*pipeline_steps)

# Fit on the training split and predict the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
