In [1]:
# Importing Required Libraries
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor  
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import seaborn as sns
import warnings
import pandas as pd
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Reading the dataframe using pandas
data = pd.read_csv('Football.csv')

## Data Preprocessing

In [3]:
# Imputing the missing value of region record as the most common value
data['region'] = data['region'].fillna(2.)

In [4]:
data.head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1.0,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2.0,France,0,4,1,1,0


In [5]:
class DataPreprocessing:
    def __init__(self,data):
        
        self.data = data
        self.cols_to_be_dropped = ['name','club_id','age_cat','nationality']
        self.cols_to_be_encoded = ['club','position','position_cat','region']
        
    def addPosCat(self,inputCat):

        if inputCat == 1:
            return('Attackers')
        elif inputCat == 2:
            return('Midfielders')
        elif inputCat == 3:
            return('Defenders')
        else:
            return('Goalkeeper')
        
    def addRegion(self, inpregion):
    
        if inpregion == 1:
            return('England')
        elif inpregion == 2:
            return('EU')
        elif inpregion == 3:
            return('Americans')
        else:
            return('Rest of World')
        
    def columnTypeConversion(self):
        
        # Converting fpl selection into numeric variable 
        self.data['fpl_sel'] = self.data['fpl_sel'].map(lambda x: str(x)[:-1]).astype('float')
        
    def logTransformation(self):
        
        # log transformation on page views variable as it has a high skew
        self.data['page_views'] = self.data['page_views'].apply(np.log)
        
    # Encoding the categorical variables using pandas dummies 
    def dataEncoding(self):

        self.data = pd.get_dummies(self.data, columns = self.cols_to_be_encoded, drop_first = True)
        
    def getProcessedData(self):
        self.data = self.data.drop(self.cols_to_be_dropped, inplace = False, axis = 1)
        self.data['position_cat'] = self.data['position_cat'].apply(self.addPosCat)
        self.data['region'] = self.data['region'].apply(self.addRegion)
        self.columnTypeConversion()
        self.dataEncoding()
        self.logTransformation()
        return self.data

In [6]:
data_obj = DataPreprocessing(data) 
encoded_data = data_obj.getProcessedData()

In [7]:
encoded_data.head()

Unnamed: 0,age,market_value,page_views,fpl_value,fpl_sel,fpl_points,new_foreign,big_club,new_signing,club_Bournemouth,...,position_RB,position_RM,position_RW,position_SS,position_cat_Defenders,position_cat_Goalkeeper,position_cat_Midfielders,region_EU,region_England,region_Rest of World
0,28,65.0,8.373092,12.0,17.1,264,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28,50.0,8.388223,9.5,5.6,167,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,35,7.0,7.332369,5.5,5.9,134,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
3,28,20.0,7.780303,7.5,1.5,122,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
4,31,22.0,6.81564,6.0,0.7,121,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0


### Train Test Split

In [8]:
output_var = 'market_value'
X = encoded_data[encoded_data.columns[~encoded_data.columns.isin([output_var])]]
y = encoded_data[[output_var]]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
x_train = x_train.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [9]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(368, 45)
(93, 45)
(368, 1)
(93, 1)


In [10]:
# performin min max scaling on input data
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## Model Building

### Linear Regression

In [11]:
# Fitting a Linear Regressor

lin_model = LinearRegression()
lin_model.fit(x_train, y_train)

acc_train = lin_model.score(x_train, y_train)
print("R^2 score of training data - " + str(acc_train))

train_pred = lin_model.predict(x_train) 
mse_train = mean_squared_error(y_train,train_pred)
print("Root Mean Square Error on training data - " + str(mse_train**(0.5)))

R^2 score of training data - 0.7984588012213261
Root Mean Square Error on training data - 5.609736514135708


In [12]:
# Function to calculated cross validated scores

def KFoldVerify(model, X, Y):
    cv = KFold(n_splits=5, random_state=1, shuffle=True)
    r2_scores = cross_val_score(model, X, Y, cv= cv, scoring = 'r2')
    return r2_scores.mean()

### Lasso Regression

In [13]:
# Fitting a Lasso Regressor
lasso_model = Lasso(alpha=0.1, max_iter = 10000)
cross_acc_train = KFoldVerify(lasso_model, x_train, y_train)
lasso_model.fit(x_train, y_train)

acc_train = lasso_model.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = lasso_model.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.70800101695429

R^2 score of training data - 0.7691687876039246
Root Mean Square Error of training data - 6.003545560071556


### Ridge Regression

In [14]:
# Fitting a Ridge Regressor
ridge_model = Ridge(alpha=0.1,max_iter=10000)
cross_acc_train = KFoldVerify(ridge_model, x_train, y_train)
ridge_model.fit(x_train, y_train)

acc_train = ridge_model.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = ridge_model.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.719611654798185

R^2 score of training data - 0.7983626715148147
Root Mean Square Error of training data - 5.611074201020374


### Support Vector Regression

In [15]:
# fitting a support vector regressor
SVR_regr = SVR(C = 10000.0, epsilon=0.001, kernel='poly', degree=3)
cross_acc_train = KFoldVerify(SVR_regr, x_train, y_train)
SVR_regr.fit(x_train, y_train)

acc_train = SVR_regr.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = SVR_regr.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.5994242052525666

R^2 score of training data - 0.9999999924906502
Root Mean Square Error of training data - 0.0010828335709398684


### Nearest Neighbour Regression

In [16]:
# fitting a nearest neighbour regressor
K_neigh = KNeighborsRegressor(n_neighbors=10, metric='minkowski')
cross_acc_train = KFoldVerify(K_neigh, x_train, y_train)
K_neigh.fit(x_train, y_train)

acc_train = K_neigh.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = K_neigh.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.4872238958169596

R^2 score of training data - 0.6192309748764891
Root Mean Square Error of training data - 7.710656397402516


### Decision Tree Regression

In [17]:
# fitting a Decision Tree regressor
tree_model = DecisionTreeRegressor(random_state = 0,max_depth=5)  
cross_acc_train = KFoldVerify(tree_model, x_train, y_train)
tree_model.fit(x_train, y_train)

acc_train = tree_model.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = tree_model.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.5754553540643244

R^2 score of training data - 0.8523009976393502
Root Mean Square Error of training data - 4.802300424917221


### Random Forest Regression

In [18]:
# fitting a random forest regressor
Forest_regr = RandomForestRegressor(max_depth=5, random_state=0)
cross_acc_train = KFoldVerify(Forest_regr, x_train, y_train)
Forest_regr.fit(x_train, y_train)

acc_train = Forest_regr.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = Forest_regr.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.7413468474667345

R^2 score of training data - 0.8982920193028906
Root Mean Square Error of training data - 3.9850893247835057


### Gradient Boosted Regression

In [19]:
params = {'n_estimators': 300,
          'max_depth': 2,
          'min_samples_split': 9,
          'learning_rate': 0.1,
          'loss': 'ls'}
gbr_model = GradientBoostingRegressor(**params)
cross_acc_train = KFoldVerify(gbr_model, x_train, y_train)
gbr_model.fit(x_train, y_train)

acc_train = gbr_model.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = gbr_model.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.7666048660006626

R^2 score of training data - 0.9621126001652733
Root Mean Square Error of training data - 2.4322485047256532


## Hyperparameter Tuning

Grid Search is an effective method for adjusting the parameters in supervised learning and improve the generalization performance of a model. With Grid Search, we try all possible combinations of the parameters of interest and find the best ones.

In [20]:
# For Lasso Regression
params = {'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
lasso_reg = GridSearchCV(Lasso(), param_grid = params, cv = 5, scoring = 'r2')
lasso_reg.fit(x_train, y_train)
acc_train = lasso_reg.score(x_train, y_train)

print("Cross Validation score - " + str(lasso_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(lasso_reg.best_estimator_)
lasso_best_params = lasso_reg.best_params_

Cross Validation score - 0.7230786125828159

R^2 score of training data - 0.7977232047794788

Lasso(alpha=0.01)


In [21]:
# For Ridge Regression
params = {'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}
ridge_reg = GridSearchCV(Ridge(), param_grid = params, cv = 5, scoring = 'r2')
ridge_reg.fit(x_train, y_train)
acc_train = ridge_reg.score(x_train, y_train)

print("Cross Validation score - " + str(ridge_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(ridge_reg.best_estimator_)
ridge_best_params = ridge_reg.best_params_

Cross Validation score - 0.7226409819954439

R^2 score of training data - 0.793773950802908

Ridge(alpha=1)


In [22]:
# For Support Vector Regression
params = [{'C': [1,10,100,1000,10000], 'epsilon': [0.0001,0.001,0.01,0.1,1],
          'kernel': ['poly'], 'degree':[2,3,4]},
          {'C': [1,10,100,1000,10000], 'epsilon': [0.0001,0.001,0.01,0.1,1],
          'kernel': ['rbf']}]

svr_reg = GridSearchCV(SVR(),param_grid = params, scoring = 'r2', cv = 5)
svr_reg.fit(x_train,y_train)

acc_train = svr_reg.score(x_train, y_train)

print("Cross Validation score - " + str(svr_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(svr_reg.best_estimator_)
svr_best_params = svr_reg.best_params_

Cross Validation score - 0.6861136480590018

R^2 score of training data - 0.8622677661166436

SVR(C=10, degree=2, epsilon=1, kernel='poly')


In [23]:
# For Nearest Neighbour Regression
params = {'n_neighbors': [4,5,6,7,8,9,10,11,12,13,14,15], 'weights': ['uniform', 'distance']}

knn_reg = GridSearchCV(KNeighborsRegressor(),param_grid = params, scoring = 'r2', cv = 5)
knn_reg.fit(x_train,y_train)

acc_train = knn_reg.score(x_train, y_train)

print("Cross Validation score - " + str(knn_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(knn_reg.best_estimator_)
knn_best_params = knn_reg.best_params_

Cross Validation score - 0.533396046122322

R^2 score of training data - 0.6711338378019723

KNeighborsRegressor(n_neighbors=4)


In [24]:
# For Decision Tree Regression
params  = {'splitter': ['best', 'random'],'min_samples_leaf': [1, 2 ,3],
           'max_depth': [6,8,10,12,14,16,18]}

tree_reg = GridSearchCV(DecisionTreeRegressor(random_state = 0) ,param_grid = params, scoring = 'r2', cv = 5)
tree_reg.fit(x_train,y_train)

acc_train = tree_reg.score(x_train, y_train)

print("Cross Validation score - " + str(tree_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(tree_reg.best_estimator_)
tree_best_params = tree_reg.best_params_

Cross Validation score - 0.6264179158678577

R^2 score of training data - 0.8506215962714291

DecisionTreeRegressor(max_depth=8, min_samples_leaf=3, random_state=0,
                      splitter='random')


In [25]:
# For Random Forest Regression
params = {
    'bootstrap': [True],
    'max_depth': [8, 10, 12, 14, 16],
    'min_samples_leaf': [1, 2, 3],
    'n_estimators': [40, 50, 60, 70],
    'max_features': [0.3, 0.4, 0.5]
}

forest_reg = GridSearchCV(RandomForestRegressor(), param_grid = params, scoring = 'r2', cv = 5)
forest_reg.fit(x_train,y_train)
acc_train = forest_reg.score(x_train, y_train)

print("Cross Validation score - " + str(forest_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()
print(forest_reg.best_estimator_)
forest_best_params = forest_reg.best_params_

Cross Validation score - 0.761284094907761

R^2 score of training data - 0.9212784494728693

RandomForestRegressor(max_depth=8, max_features=0.4, min_samples_leaf=2,
                      n_estimators=50)


In [26]:
# For Gradient Boosted Regression
params = {'n_estimators': [50, 100, 200, 300],
          'max_depth': [8,10,12,14,16],
          'min_samples_split': [1,2,3],
          'subsample':[0.1, 0.2]}

gb_reg = GridSearchCV(GradientBoostingRegressor() ,param_grid = params, scoring = 'r2', cv = 5)
gb_reg.fit(x_train,y_train)

acc_train = gb_reg.score(x_train, y_train)

print("Cross Validation score - " + str(gb_reg.best_score_))
print()

print("R^2 score of training data - " + str(acc_train))
print()

print(gb_reg.best_estimator_)
gb_best_params = gb_reg.best_params_

Cross Validation score - 0.7473475973479823

R^2 score of training data - 0.8629245529812153

GradientBoostingRegressor(max_depth=10, n_estimators=50, subsample=0.1)


### Genetic Algorithm 

In [27]:
class GeneticAlgorithm :
    
    def __init__(self, num_chromosomes, num_genes):
        self.num_chromosomes = num_chromosomes
        self.num_genes = num_genes
        self.population = np.random.uniform(low=-3.0, high=3.0, size=(self.num_chromosomes, self.num_genes))
        
    def crossOver(self, parent1_idx, parent2_idx):
        Pc_threshold = 0.3
        Pc = np.random.uniform(0,1)
        
        # If crossover probability is less than or equal to threshold then do crossover operation
        if Pc <= Pc_threshold:
            
            # The point at which crossover takes place between two parents. Usually, it is at the center.
            crossover_point = np.uint8(self.num_genes/2)
            
            # flipping the second halves of parent chromosomes
            temp = self.population[parent1_idx, crossover_point:].copy()
            self.population[parent1_idx, crossover_point:] = self.population[parent2_idx, crossover_point:]
            self.population[parent2_idx, crossover_point:] = temp.copy()
            
            

    
    def select_mating_pool(self, fitness_val):
    
        # Selecting the best individuals in the current generation as parents 
        # for producing the offspring of the next generation.
        
        select_probs  = fitness_val + 1
        select_probs = select_probs / select_probs.sum()
        fitness_based_selected =  np.random.choice(fitness_val.squeeze(), size = fitness_val.shape[0], 
                                                   replace = False, p = select_probs.squeeze())
        
        for i in range(fitness_based_selected.shape[0]//2):
            parent_fit_1 = fitness_based_selected[(i*2)]
            parent_fit_2 = fitness_based_selected[(i*2) + 1]
            parent1_idx = np.where(fitness_val == parent_fit_1)[0][0]
            parent2_idx = np.where(fitness_val == parent_fit_2)[0][0]
            
            # Doing Cross over operation
            self.crossOver(parent1_idx, parent2_idx)
            
            # Doing Mutation Operation
            self.mutation(parent1_idx, parent2_idx)
            
    
    # Mutation changes a single gene in each offspring randomly.
    def mutation(self, parent1_idx, parent2_idx):
        Mc_threshold = 0.08
        Mc = np.random.uniform(0,1)
        
        # If mutation probability is less than or equal to threshold then do mutation operation
        if Mc <= Mc_threshold:
            
            # The random value to be added to the gene.
            random_value = np.random.uniform(-1.0, 1.0, 1)
            
            # the random gene whose value is to be changed
            gene_idx = np.random.randint(1,self.num_genes + 1, 1)
            self.population[parent1_idx, gene_idx] = self.population[parent1_idx, gene_idx] + random_value
            self.population[parent2_idx, gene_idx] = self.population[parent2_idx, gene_idx] + random_value
            
            
        
    def fitnessEvaluation(self, x_train, y_train):
        fitness = np.zeros((self.population.shape[0],1))
        for index, item in enumerate(self.population):
            kfold = KFold(n_splits=5, random_state=2)
            neigh = KNeighborsRegressor(n_neighbors= knn_best_params['n_neighbors'],
                                        metric = 'wminkowski', metric_params = {'w':item})
            fitness[index] = cross_val_score(neigh, x_train, y_train, cv=kfold, scoring="r2").mean()
        return fitness
        
    def generation(self, x_train, y_train):
        num_generations = 10
        for generation in range(num_generations):
            # Measuring the fitness of each chromosome in the population
            fitness_val = self.fitnessEvaluation(x_train, y_train)
            # Selecting the best parents in the population for mating.
            self.select_mating_pool(fitness_val)
            
        # returning the most fit chromosome and its fitness from the final population
        return self.population[np.argmax(fitness_val)], np.max(fitness_val)

In [29]:
obj = GeneticAlgorithm(20,45)
wts, fitness = obj.generation(x_train, y_train)
print("The fitness value of fittest chromosome in the final population is "+ str(fitness))

The fitness value of fittest chromosome in the final population is 0.6162343237045949


In [30]:
# Using the weights learnt from the Genetic Algorithm in KNN regressor model to improve on its accuracy
neigh = KNeighborsRegressor(n_neighbors=knn_best_params['n_neighbors'],
                            metric = 'wminkowski', metric_params = {'w':wts})

cross_acc_train = KFoldVerify(neigh, x_train, y_train)
neigh.fit(x_train, y_train)

acc_train = neigh.score(x_train, y_train)
print("Cross Validation score - " + str(cross_acc_train))
print()


train_pred = neigh.predict(x_train)
mse_train = mean_squared_error(y_train,train_pred)
print("R^2 score of training data - " + str(acc_train))
print("Root Mean Square Error of training data - " + str(mse_train**(0.5)))

Cross Validation score - 0.607601504088297

R^2 score of training data - 0.7741979074562638
Root Mean Square Error of training data - 5.937785784026205


### Finalized Models

In [31]:
reports = pd.DataFrame()
reports['Models'] = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Support Vector Regression',
                    'KNearestNeighbour Regression', 'DecisionTree Regression', 'RandomForest Regression',
                    'GradientBoosted Regression']

In [32]:
# Creating objects of each regression model with their best hyperparameter settings

lr = LinearRegression()
ls = Lasso(**lasso_best_params)
rdg = Ridge(**ridge_best_params)
svr = SVR(**svr_best_params)
knn = KNeighborsRegressor(n_neighbors= knn_best_params['n_neighbors'],
                          metric = 'wminkowski', metric_params = {'w':wts})
dec_tree = DecisionTreeRegressor(**tree_best_params)
ran_for = RandomForestRegressor(**forest_best_params)
gbr = GradientBoostingRegressor(**gb_best_params)

models_obj = [lr, ls, rdg, svr, knn, dec_tree, ran_for, gbr]

In [33]:
# Calculating Cross Validation scores of best models
cv = KFold(n_splits= 5,random_state= 1, shuffle=True)
cross_val_scores = list(map(lambda model: round(cross_val_score(model,x_train, y_train, 
                                                                scoring = 'r2', cv = cv).mean(),2), models_obj))
reports['Cross validated R^2 score'] = cross_val_scores

In [34]:
# function to calculate r2 score and rmse on training data
def scores_calculation(model):
    
    model.fit(x_train, y_train)
    r2_score = round(model.score(x_train, y_train),2)
    train_pred = model.predict(x_train)
    rmse = round(mean_squared_error(y_train, train_pred)**0.5, 2)
    return r2_score, rmse

scores = list(map(scores_calculation, models_obj))
reports['Training R^2 scores'] = list(zip(*scores))[0]
reports['Training RMSE'] = list(zip(*scores))[1]

In [35]:
reports

Unnamed: 0,Models,Cross validated R^2 score,Training R^2 scores,Training RMSE
0,Linear Regression,0.72,0.8,5.61
1,Lasso Regression,0.72,0.8,5.62
2,Ridge Regression,0.72,0.79,5.67
3,Support Vector Regression,0.7,0.86,4.64
4,KNearestNeighbour Regression,0.61,0.77,5.94
5,DecisionTree Regression,0.61,0.87,4.43
6,RandomForest Regression,0.76,0.92,3.45
7,GradientBoosted Regression,0.72,0.87,4.48


## Conclusion
Clearly the best performing model is RandomForestRegression which has highest cross validation score

### Saving the best model and scaler 

In [40]:
import pickle
pickle.dump(ran_for, open('./model/model.pkl', 'wb'))
pickle.dump(scaler, open('./model/scaler.pkl', 'wb'))

In [39]:
acc = ran_for.score(x_test, y_test)
print('R^2 score on test data - '+str(acc))

R^2 score on test data - 0.8702729321930357
