In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings(action= 'ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the read-only Kaggle input
# directory, then print one path per line.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition training file into `df` and preview its first ten rows.
TRAIN_CSV = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head(10)

In [None]:
df.shape

In [None]:
df.info()

### Relation between missing values and SalePrice

In [None]:
# Fraction of missing values per column, computed once for the whole frame
# instead of re-scanning each column inside the loops below.
missing_fraction = df.isnull().mean()

# Columns that contain at least one missing value.
features_with_na = [feature for feature in df.columns if missing_fraction[feature] > 0]
print(len(features_with_na))

for feature in features_with_na:
    # Scale to percent *before* rounding: rounding the fraction and then
    # multiplying by 100 re-introduces floating-point noise such as
    # 5.500000000000001 in the printed value.
    print(feature, '-', np.round(missing_fraction[feature] * 100, 1), '%')

In [None]:
# Columns in which more than 6% of the rows are missing.
imp_features = [column for column in df.columns if df[column].isnull().mean() > 0.06]


### Relation between missing (NaN) and non-missing values
* We compare the median `SalePrice` rather than the mean, because this dataset contains outliers and the median is robust to them.

In [None]:
for feature in imp_features:
    # Swap the column for a 0/1 flag: 1 where the value was missing, 0 otherwise.
    missing_flag = np.where(df[feature].isnull(), 1, 0)
    flagged = df.assign(**{feature: missing_flag})

    # Median SalePrice of the "present" (0) and "missing" (1) groups.
    median_price = flagged.groupby(feature)['SalePrice'].median()
    median_price.plot.bar()
    plt.title(feature)
    plt.show()

# for feature in imp_features:
#     df1=df.copy()
#     df1[feature] = np.where(df1[feature].isnull(),1,0)
    
#     fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10, 5))
#     df1.groupby(feature)['SalePrice'].median().plot.bar(ax=ax_before)
#     df1.groupby(feature)['SalePrice'].mean().plot.bar(ax=ax_after)
#     ax_before.set(title=feature+'_median')
#     ax_after.set(title=feature+'_mean')
#     plt.show()

#### Numerical Variables

In [None]:
df.dtypes.value_counts()

In [None]:
# A column counts as numerical when its dtype is anything other than object ('O').
num_features = []
for column in df.columns:
    if df[column].dtypes != 'O':
        num_features.append(column)
print('numerical varable: ', len(num_features) )

In [None]:
# Temporal columns are recognised by 'Year' or 'Yr' appearing in the column name.
yr_features = [name for name in num_features if ('Yr' in name) or ('Year' in name)]
yr_features

#### Visualize the year features

In [None]:
# Median sale price for each year of sale, drawn through the Axes interface.
median_price_by_year = df.groupby('YrSold')['SalePrice'].median()
ax = median_price_by_year.plot()
ax.set_xlabel('Yr_Sold')
ax.set_ylabel('Sale_Price')
ax.set_title('yr_sold vs Sale_price')


* The median house price is `decreasing` as the year of sale increases; that's weird.
* Let's look at the other year *features*.

In [None]:
for feature in yr_features:
    # YrSold is the reference year, so there is nothing to compare it against.
    if feature == 'YrSold':
        continue

    # Years elapsed between the event recorded in `feature` and the sale.
    # Computed as a standalone Series; no copy of the whole frame is needed.
    years_before_sale = df['YrSold'] - df[feature]

    plt.scatter(years_before_sale, df['SalePrice'])
    # Label every figure: the loop emits several scatter plots in a row and
    # without labels they cannot be told apart.
    plt.xlabel('YrSold - ' + feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()


### Differentiating numerical variables into discrete variables and continuous variables

In [None]:
# Year columns and the Id column belong to neither group.
excluded = yr_features + ['Id']

# One pass over the numerical columns: fewer than 25 distinct values
# (NaN counts as a value) means discrete, otherwise continuous.
discrete_features, continuous_features = [], []
for feature in num_features:
    if feature in excluded:
        continue
    n_values = len(df[feature].unique())
    if n_values < 25:
        discrete_features.append(feature)
    else:
        continuous_features.append(feature)

print('Total numerical variables: ', len(num_features))
print('discrete variables: ', len(discrete_features))
print('continuous variables:', len(continuous_features))
print('year features: ', len(yr_features))

#### see relation between discrete features and SalePrice

In [None]:
list(enumerate(discrete_features))

In [None]:
for feature in discrete_features:
    # One bar per distinct value of the feature: the median SalePrice of that group.
    median_price = df.groupby(feature)['SalePrice'].median()
    median_price.plot.bar()
    plt.title(feature)
    plt.show()

* We can see there are some features that are related with Sales Price

In [None]:
for feature in continuous_features:
    # Histogram (25 bins) of the raw values of each continuous feature.
    column_values = df[feature]
    sns.displot(column_values, bins=25)
    plt.title(feature)
    plt.show()

* The majority of the continuous features do not follow a Gaussian distribution. Let's apply a transformation to those features.

## Transforming Variables 

#### will be using Logarithmic transformation

In [None]:
# log(SalePrice) is identical for every plot, so take it once from the
# untouched frame. Reading both axes from `df` also avoids logging SalePrice
# twice when the loop reaches the SalePrice column itself.
log_sale_price = np.log(df['SalePrice'])

for feature in continuous_features:
    # log(0) is -inf, so any column containing a zero is skipped.
    if (df[feature] == 0).any():
        continue

    # Regression plot of log(feature) against log(SalePrice).
    sns.regplot(x=np.log(df[feature]), y=log_sale_price)
    plt.xlabel(feature)
    plt.ylabel('Sales_Price')
    plt.title(feature)
    plt.show()

## Outliers (only works for continuous features)

In [None]:
from scipy.stats import skew

# Box plot and sample skewness of the raw values of every continuous feature
# that has no zero entries (the same set of columns that is log-transformed
# later in the notebook). The previous version built a log-transformed copy
# of the column and then never used it; that dead work is removed.
skewed = []
for feature in continuous_features:
    if (df[feature] == 0).any():
        continue

    # Drop missing rows first: skew() returns nan as soon as one NaN is
    # present, which is what happened for LotFrontage.
    values = df[feature].dropna()
    feature_skew = skew(values)

    sns.boxplot(x=values)
    print(feature,': ', feature_skew)
    skewed.append(feature_skew)
    plt.title(feature)
    plt.show()

## Handling Outliers

* LotFrontage is treated as normally distributed --> outliers are values beyond mean ± 3 SD
* LotArea, 1stFlrSF and GrLivArea are skewed --> outliers are values below Q1 − 1.5·IQR or above Q3 + 1.5·IQR


#### Handling Outliers for LotFrontage (Normally distributed)

In [None]:
df['LotFrontage'].describe()

In [None]:
# Three-sigma limits for LotFrontage. Mean and standard deviation are taken
# once and reused. The name `uppper_boundary` (three p's) is kept because the
# capping cell below refers to it.
lot_frontage_mean = df['LotFrontage'].mean()
lot_frontage_std = df['LotFrontage'].std()

uppper_boundary = lot_frontage_mean + 3 * lot_frontage_std
lower_boundary = lot_frontage_mean - 3 * lot_frontage_std

# Separate print statements: `print(a), print(b), print(c)` is a tuple
# expression, so the cell also displayed a stray `(None, None, None)`.
print(lower_boundary)
print(uppper_boundary)
print(lot_frontage_mean)


In [None]:
# Cap LotFrontage at the upper three-sigma limit; missing values stay missing.
df['LotFrontage'] = df['LotFrontage'].clip(upper=uppper_boundary)
sns.displot(df['LotFrontage'])

#### Handling Outliers for LotArea (skewed)

In [None]:
df['LotArea'].describe()

In [None]:
# Tukey fences for LotArea: quartiles are computed once and reused.
q1 = df['LotArea'].quantile(0.25)
q3 = df['LotArea'].quantile(0.75)
IQR = q3 - q1

lower_bridge = q1 - (IQR*1.5)
upper_bridge = q3 + (IQR*1.5)

# Separate print statements: `print(a), print(b)` is a tuple expression, so
# the cell also displayed a stray `(None, None)`.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Clamp LotArea into [lower_bridge, upper_bridge]. The previous version
# assigned `upper_bridge` to the values *below* the lower fence, turning the
# smallest lots into the largest ones. The column is cast to float first
# because the fences are generally non-integer and the column is integer-typed.
df['LotArea'] = df['LotArea'].astype(float).clip(lower=lower_bridge, upper=upper_bridge)
sns.displot(df['LotArea'])


#### Handling Outliers for 1stFlrSF (skewed)

In [None]:
df['1stFlrSF'].describe()

In [None]:
# Tukey fences for 1stFlrSF: quartiles are computed once and reused.
q1 = df['1stFlrSF'].quantile(0.25)
q3 = df['1stFlrSF'].quantile(0.75)
IQR = q3 - q1

lower_bridge = q1 - (IQR*1.5)
upper_bridge = q3 + (IQR*1.5)

# Separate print statements: `print(a), print(b)` is a tuple expression, so
# the cell also displayed a stray `(None, None)`.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Clamp 1stFlrSF into [lower_bridge, upper_bridge]. The previous version
# assigned `upper_bridge` to the values *below* the lower fence. The column is
# cast to float first because the fences are generally non-integer and the
# column is integer-typed.
df['1stFlrSF'] = df['1stFlrSF'].astype(float).clip(lower=lower_bridge, upper=upper_bridge)
sns.displot(df['1stFlrSF'])

#### Handling Outliers for GrLivArea (skewed)


In [None]:
df['GrLivArea'].describe()

In [None]:
# Tukey fences for GrLivArea: quartiles are computed once and reused.
q1 = df['GrLivArea'].quantile(0.25)
q3 = df['GrLivArea'].quantile(0.75)
IQR = q3 - q1

lower_bridge = q1 - (IQR*1.5)
upper_bridge = q3 + (IQR*1.5)

# Separate print statements: `print(a), print(b)` is a tuple expression, so
# the cell also displayed a stray `(None, None)`.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Clamp GrLivArea into [lower_bridge, upper_bridge]. The previous version
# selected rows by GrLivArea but wrote the cap into the '1stFlrSF' column, so
# GrLivArea was never capped and 1stFlrSF was corrupted; it also used
# `upper_bridge` for the low side. The column is cast to float first because
# the fences are generally non-integer and the column is integer-typed.
df['GrLivArea'] = df['GrLivArea'].astype(float).clip(lower=lower_bridge, upper=upper_bridge)
sns.displot(df['GrLivArea'])

# Categorical Variables


In [None]:
# Categorical columns are the ones stored with the object ('O') dtype.
categorical_features = []
for column in df.columns:
    if df[column].dtypes == 'O':
        categorical_features.append(column)
len(categorical_features)

* First, focus on the cardinality of each categorical feature, i.e. how many distinct categories the feature has.

In [None]:
# Cardinality of every categorical column (NaN counts as its own category).
for feature in categorical_features:
    n_categories = len(df[feature].unique())
    print(feature,': ',n_categories)

#### Relation between categorical variables and the dependent variable

In [None]:
 for feature in categorical_features:
     # One bar per category: the median SalePrice of the houses in that category.
     category_median = df.groupby(feature)['SalePrice'].median()
     category_median.plot.bar()
     plt.xlabel(feature)
     plt.ylabel('Sales Price')
     plt.title(feature)
     plt.show()

# FEATURE ENGINEERING

In [None]:
df.shape

### Always split data into train/test before applying feature engineering to avoid data leakage.
* First do the feature engineering for the train dataset, then apply the same procedure to the test dataset.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of `df` as `test`; the fixed random_state makes the split repeatable.
# NOTE(review): the outlier caps applied to `df` in the cells above were derived from all rows,
# i.e. before this split, so the held-out rows already influenced them.
train,test = train_test_split(df,test_size=0.2,random_state=42)

In [None]:
train.shape, test.shape

# Feature Engineering for train dataset

## Handling missing Values
### 1. For categorical features

In [None]:
# Object-typed training columns that contain at least one missing value.
cat_nan_features = []
for column in train.columns:
    is_object = train[column].dtypes == 'O'
    if is_object and train[column].isnull().sum() > 0:
        cat_nan_features.append(column)
len(cat_nan_features)

In [None]:
# Share of missing rows per categorical column, as a percentage with two decimals.
for feature in cat_nan_features:
    missing_pct = np.round(train[feature].isnull().mean()*100,2)
    print('{}: {}% missing values'.format(feature,missing_pct))


In [None]:
def replace_missing(df,nan_features):
    """Return a copy of `df` with NaN in the given columns replaced by 'missing'.

    Parameters
    ----------
    df : DataFrame to read from; it is not modified.
    nan_features : list of column names whose missing values are filled.

    Returns
    -------
    A new DataFrame; columns not listed in `nan_features` are copied unchanged.
    """
    result = df.copy()
    filled_columns = result[nan_features].fillna('missing')
    result[nan_features] = filled_columns
    return result

In [None]:
# Rebind `train` to a copy in which the categorical gaps carry the label 'missing',
# then show the remaining null count per column as a check.
train = replace_missing(train,cat_nan_features)
train[cat_nan_features].isnull().sum()

In [None]:
train[cat_nan_features].head()

### 2. Handling missing values for numerical features

In [None]:
# Non-object training columns that contain at least one missing value.
num_nan_features = []
for column in train.columns:
    is_numeric = train[column].dtypes != 'O'
    if is_numeric and train[column].isnull().sum() > 0:
        num_nan_features.append(column)
len(num_nan_features)

In [None]:
# Share of missing rows per numerical column, as a percentage with two decimals.
for feature in num_nan_features:
    missing_pct = np.round(train[feature].isnull().mean()*100,2)
    print('{}: {}% missing values'.format(feature,missing_pct))

#### Creates new cols to capture nan value before replacing it with median/mode

In [None]:
for feature in num_nan_features:
    median = train[feature].median()

    # Record where the gaps were *before* filling them; afterwards an imputed
    # row is indistinguishable from a genuine median value.
    train[feature+"_nan"] = np.where(train[feature].isnull(),1,0)

    # Assign the filled column back instead of `train[feature].fillna(...,
    # inplace=True)`: the in-place form acts on a temporary Series and leaves
    # `train` unchanged under pandas copy-on-write.
    train[feature] = train[feature].fillna(median)

# Check: every count should now be zero.
train[num_nan_features].isnull().sum()

### Handling Temporal Variables (date/time variables)

In [None]:
# Replace each calendar year with the number of years between it and the sale.
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
for year_column in year_columns:
    train[year_column] = train['YrSold'].sub(train[year_column])

In [None]:
train[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()

### Applying a log transformation to skewed features

In [None]:
# Natural log of the skewed columns (the target included), applied to all of them in one step.
skewed_features = ['LotFrontage',  'LotArea',  '1stFlrSF', 'GrLivArea', 'SalePrice']
train[skewed_features] = np.log(train[skewed_features])

### Handling rare categorical features
#### Categories that occur in no more than 1% of the observations are replaced with a single `Rare_cat` label

In [None]:
for feature in categorical_features:
    # Share of the training rows that falls into each category of the feature.
    category_share = train.groupby(feature)['SalePrice'].count() / len(train)

    # Categories covering more than 1% of the rows are kept as they are;
    # every other value is replaced by the common label 'Rare_cat'.
    frequent_categories = category_share[category_share > 0.01].index
    is_frequent = train[feature].isin(frequent_categories)
    train[feature] = train[feature].where(is_frequent, 'Rare_cat')

In [None]:
train.head()

### Using label encoding
First sort the categories of each feature by the mean SalePrice of the category, then label each category with its rank in that order.

In [None]:
for feature in categorical_features:
    # Categories ordered from lowest to highest mean SalePrice ...
    mean_price = train.groupby(feature)['SalePrice'].mean().sort_values()
    # ... and numbered 0, 1, 2, ... in that order.
    rank_of_category = dict(zip(mean_price.index, range(len(mean_price))))
    train[feature] = train[feature].map(rank_of_category)

In [None]:
train.head()

## Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler as MMS

# Every column except the identifier and the target is scaled.
scaled_features = [feature for feature in train.columns if feature not in ['Id','SalePrice'] ]
scaler = MMS()
# Fit on the training rows only; the fitted scaler is shown as the cell output.
scaler.fit(train[scaled_features])

In [None]:
# Scaled feature matrix as a frame with the original column names (fresh 0..n-1 index).
scaled_values = scaler.transform(train[scaled_features])
scaled_frame = pd.DataFrame(scaled_values, columns=scaled_features)

# Id and target keep their values; the index is reset so the rows line up with scaled_frame.
id_and_target = train[['Id','SalePrice']].reset_index(drop=True)
train_data = pd.concat([id_and_target, scaled_frame], axis=1)

In [None]:
train_data.head(10)

In [None]:
# Features are every column except Id and the target; the target is SalePrice.
non_feature_columns = ['Id','SalePrice']
x_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['SalePrice']
x_train.shape , y_train.shape

In [None]:
# Persist the processed training frame (Id, SalePrice and the scaled features) without the index column.
train_data.to_csv('train_x.csv', index=False)

## Feature Engineering for Test dataset

In [None]:
# Repeat the training feature-engineering steps on the held-out rows.
#
# NOTE(review): every statistic in this cell (imputation medians, the 1%
# rare-category cut-off, the category -> rank mapping and the min/max of the
# scaler) is re-estimated from the test rows. The train-fitted values are not
# kept by the cells above, so they cannot be reused from here; as a result the
# same category or raw value can be encoded differently in train and test.
# Confirm whether the training cells should store their mappings for reuse.

# 1. Categorical gaps -> 'missing'
cat_nan_features = [feature for feature in test.columns if test[feature].dtypes=='O' and test[feature].isnull().sum()>0]
test = replace_missing(test,cat_nan_features)

# 2. Numerical gaps -> median, plus a 0/1 indicator column per imputed feature
num_nan_features = [feature for feature in test.columns if test[feature].dtypes!='O' and test[feature].isnull().sum()>0]

for feature in num_nan_features:
    median=test[feature].median()

    test[feature+"_nan"] = np.where(test[feature].isnull(),1,0)
    # Assign the filled column back instead of `fillna(..., inplace=True)`:
    # the in-place form acts on a temporary Series and leaves `test`
    # unchanged under pandas copy-on-write.
    test[feature] = test[feature].fillna(median)

# 3. Calendar years -> years elapsed before the sale
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    test[feature] = test['YrSold']-test[feature]

# 4. Natural log of the skewed columns (target included)
skewed_features = ['LotFrontage',  'LotArea',  '1stFlrSF', 'GrLivArea', 'SalePrice']

for feature in skewed_features:
    test[feature] = np.log(test[feature])

# 5. Categories covering at most 1% of the rows -> 'Rare_cat'
for feature in categorical_features:
    temp = test.groupby(feature)['SalePrice'].count()/len(test)
    temp_df = temp[temp>0.01].index
    test[feature] = np.where( test[feature].isin(temp_df), test[feature], 'Rare_cat' )

# 6. Categories -> rank by mean SalePrice
for feature in categorical_features:
    labels_ordered = test.groupby(feature)['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    test[feature]=test[feature].map(labels_ordered)

# 7. Min-max scaling of everything except Id and the target
scaled_features = [feature for feature in test.columns if feature not in ['Id','SalePrice'] ]
scaler = MMS()
scaler.fit(test[scaled_features])

test_data = pd.concat([test[['Id','SalePrice']].reset_index(drop=True),
                     pd.DataFrame(scaler.transform(test[scaled_features]), columns=scaled_features)], axis=1)

In [None]:
# Build the test matrices from `test_data` (the scaled frame), mirroring how
# x_train / y_train are built from `train_data`. The previous version took
# them from `test`, so the models were evaluated on unscaled features while
# having been trained on scaled ones.
x_test = test_data.drop(['Id','SalePrice'], axis=1)
y_test = test_data['SalePrice']
x_test.shape , y_test.shape

## Feature Selection



In [None]:
x_train.head()

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
# SelectFromModel selects the features whose coefficient in the fitted model is non-zero

In [None]:
# Feature selector built around a Lasso with alpha=0.005 and a fixed seed.
model = SelectFromModel(Lasso(alpha=0.005, random_state=0))
# Remember the seed (random_state); the same value has to be used for the test dataset.
# Fit on the training features and target only; the fitted selector is shown as the cell output.
model.fit(x_train,y_train)

#### get_support():- True indicates, that feature is important and that should be used

In [None]:
model.get_support()

Let's print total features and selected features

In [None]:
# Boolean mask over the training columns: True where the selector keeps the feature.
support_mask = model.get_support()
selected_features = x_train.columns[support_mask]
selected_features

In [None]:
# Gather the three counts first, then report them.
n_total_features = x_train.shape[1]
n_selected_features = len(selected_features)
n_zero_coefficients = np.sum(model.estimator_.coef_ == 0)

print('Total features: {}'.format(n_total_features))
print('selected features: {}'.format(n_selected_features))
print("features with coeffecient zero: {}".format(n_zero_coefficients))

In [None]:
# Restrict both feature matrices to the selected columns and take the
# targets from the processed frames.
train_x, test_x = x_train[selected_features], x_test[selected_features]
train_y, test_y = train_data['SalePrice'], test_data['SalePrice']

train_x.shape , train_y.shape

In [None]:
train_x.corr()

* We look at the correlation only after removing some features, because it is very difficult to read a correlation matrix for all 81 features together.

In [None]:
# Pairwise correlations of the selected features, drawn as an annotated heat map.
correlations = train_x.corr()
plt.figure(figsize=(18,18))
sns.heatmap(correlations, annot=True, cmap='RdYlGn')

## Visualize Important Features

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt

# Use a dedicated name: rebinding `model` would replace the fitted
# SelectFromModel that the feature-selection cells above read from, so
# re-running those cells afterwards would fail or give different results.
# The fixed random_state makes the importance ranking repeatable.
importance_model = ExtraTreesRegressor(random_state=0)
importance_model.fit(train_x,train_y)

# Five most important features according to the fitted trees.
feat_importances = pd.Series(importance_model.feature_importances_, index=train_x.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()

# Model Creation

## 1. Linear Regression


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Ordinary least squares on the selected training features, then predictions for the test rows.
lin_reg = LinearRegression().fit(train_x, train_y)
y_pred = lin_reg.predict(test_x)

The coefficients of this model are not of much use, as other models perform better than linear regression.

In [None]:
# One row per selected feature with its fitted linear-regression coefficient.
coef_df = pd.DataFrame({'Coefficients': lin_reg.coef_}, index=test_x.columns)
coef_df

In [None]:
from sklearn import metrics

# Cross-validate on the training split. cross_val_score refits a clone of the
# estimator on every fold, so passing the test split (as before) trained
# models on the held-out rows and left nothing untouched for the final check.
linear_score = cross_val_score(lin_reg,train_x,train_y,cv=5)
print('Cross Val Score: ', linear_score.mean())

# Recompute the predictions here: `y_pred` is overwritten by every later
# model cell, so relying on the global could silently score another model.
y_pred = lin_reg.predict(test_x)

# Hold-out metrics of the model that was fitted on the training split.
print('R2 score:',r2_score(test_y, y_pred))
print('Mean Absolute Error:', mean_absolute_error(test_y, y_pred))

linear_MSE = mean_squared_error(test_y, y_pred)
print('Mean Squared Error:',linear_MSE)

linear_RMSE = np.sqrt(linear_MSE)
print('Root Mean Squared Error:', linear_RMSE )


## 2. Ridge Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge


# Shuffled 3-fold splitter with a fixed seed; also reused by later search cells.
kf = KFold(shuffle=True, random_state=0, n_splits=3)

# Polynomial feature expansion followed by ridge regression.
estimator = Pipeline([("polynomial_features", PolynomialFeatures()),
                      ("ridge_regression", Ridge())])

# Grid over the polynomial degree and the ridge penalty strength.
params = {
    'polynomial_features__degree': [1, 2, 3],
    'ridge_regression__alpha': [1e-15,1e-10,1e-8,1e-3,0.005,1e-2,0.05,0.1,0.5,1,5,10,20,30,35,40]
}

grid = GridSearchCV(estimator, params, cv=kf)

grid.fit(train_x,train_y)
print('best score: {}'.format(grid.best_score_))
# This line prints the winning parameter combination; it was previously
# labelled 'best score' as well, which made the two lines indistinguishable.
print('best params: {}'.format(grid.best_params_))


* Best value of alpha for Ridge is 0.1

In [None]:
ridge = Ridge(alpha=0.1)

# Fit on the training split and predict the held-out rows.
ridge.fit(train_x,train_y)
y_pred = ridge.predict(test_x)

# Cross-validate on the training split. cross_val_score refits a clone on
# every fold, so passing the test split (as before) trained models on the
# held-out rows.
ridge_score = cross_val_score(ridge,train_x,train_y,cv=10)
print('Cross Val Score: ', ridge_score.mean())

# Hold-out RMSE of the model fitted on the training split.
ridge_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', ridge_RMSE)



## 3. LASSO Regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over the Lasso penalty strength, using the `kf` splitter from the Ridge cell.
lasso=Lasso()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,0.005,1e-2,0.05,0.1,0.5,1,5,10,20,30,35,40]}
lasso_regressor=GridSearchCV(lasso,parameters,cv=kf)

# NOTE(review): this search runs on x_train / y_train (all scaled features), while the final
# Lasso below is fitted on train_x (the selected features only) — confirm this is intended.
lasso_regressor.fit(x_train,y_train)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

* Best value of alpha for Lasso is 0.001

In [None]:
lasso = Lasso(alpha=0.001)

# Fit on the training split and predict the held-out rows.
lasso.fit(train_x,train_y)
y_pred = lasso.predict(test_x)

# Cross-validate on the training split. cross_val_score refits a clone on
# every fold, so passing the test split (as before) trained models on the
# held-out rows.
lasso_score = cross_val_score(lasso,train_x,train_y,cv=10)
print('Cross Val Score: ', lasso_score.mean())

# Hold-out RMSE of the model fitted on the training split.
lasso_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', lasso_RMSE)


### Compare Linear/Ridge/Lasso

In [None]:
# (message template, cross-validation scores, hold-out RMSE) for each linear model.
linear_model_rows = [
    ('Linear Regression score: {} , RMSE: {}', linear_score, linear_RMSE),
    ('Ridge Regression score: {} , RMSE: {}', ridge_score, ridge_RMSE),
    ('Lasso Regression score: {} , RMSE: {}', lasso_score, lasso_RMSE),
]
for template, cv_scores, rmse in linear_model_rows:
    print(template.format(cv_scores.mean(), rmse))

#### Lasso Regression performs better than Linear and Ridge Regression
* It has the highest CV score and the lowest RMSE.

## 4. K-Nearest-Neighbour Regressor


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Mean 10-fold cross-validation score for every k in 1..49.
# The search runs on the training split: choosing k on the test split (as
# before) tunes the model on the very rows it is evaluated on afterwards.
k_values = range(1,50)
accuracy=[]
for n in k_values:
    knn=KNeighborsRegressor(n_neighbors=n)
    result= cross_val_score(knn, train_x, train_y, cv=10)
    accuracy.append(result.mean())

# First k with the highest mean score. Using argmax instead of a running
# maximum that starts at 0 still reports a valid k when every score is negative.
k = k_values[int(np.argmax(accuracy))]

plt.figure(figsize=(10,8))
axes = plt.axes()
axes.grid()
plt.plot(k_values, accuracy, color='blue', linestyle=':',
        marker='o', markerfacecolor='red',markersize=10 )
plt.title('accuracy vs K-value')
plt.xlabel('K-value')
plt.xticks(range(0,50,2))
plt.ylabel('accuracy')

print('Best K-value: {}'.format(k))

In [None]:
# NOTE(review): n_neighbors is hard-coded to 3 rather than taken from the
# `k` found by the search cell above — confirm the two still agree.
knn=KNeighborsRegressor(n_neighbors=3)
knn.fit(train_x,train_y)
y_pred = knn.predict(test_x)

# Cross-validate on the training split. cross_val_score refits a clone on
# every fold, so passing the test split (as before) trained models on the
# held-out rows.
knn_score = cross_val_score(knn,train_x,train_y,cv=10)
print('Cross Val Score: ', knn_score.mean())

# Hold-out RMSE of the model fitted on the training split.
knn_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', knn_RMSE)



## 5. SVM Regressor

In [None]:
from sklearn.svm import SVR

# Grid search over the SVR kernel and polynomial degree, using the `kf` splitter from the Ridge cell.
svm_regressor = SVR()
# NOTE(review): per the scikit-learn SVR documentation `degree` only applies to the 'poly' kernel,
# so the four 'rbf' candidates should be identical fits — confirm against the installed version.
parameters = {'kernel':['poly','rbf'] , 'degree':[2,3,4,5] }
svm_reg = GridSearchCV(svm_regressor,parameters,cv=kf)

svm_reg.fit(train_x,train_y)
print('Best Parameters: {}'.format(svm_reg.best_params_))
print('Best SCore: {}'.format(svm_reg.best_score_))

In [None]:
svm_regressor = SVR(kernel='rbf', degree=2)
svm_regressor.fit(train_x,train_y)
y_pred = svm_regressor.predict(test_x)

# Cross-validate on the training split. cross_val_score refits a clone on
# every fold, so passing the test split (as before) trained models on the
# held-out rows.
SVM_score = cross_val_score(svm_regressor,train_x,train_y,cv=10)
print('Cross Val Score: ', SVM_score.mean())

# Hold-out RMSE of the model fitted on the training split.
SVM_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', SVM_RMSE)

## 6. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Default-parameter forest; it is only referenced by the commented-out randomized search below
# and is replaced by a configured forest in the final Random Forest cell.
rf = RandomForestRegressor()

In [None]:
 #Randomized Search CV

# Number of trees in random forest
# Number of trees in the forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]

# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# the string 'auto' was deprecated and later removed from scikit-learn, so a
# search over it fails on current versions.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Search space handed to the (currently commented-out) RandomizedSearchCV.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
#                                n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = 1)

# rf_random.fit(train_x,train_y)

In [None]:
# print(rf_random.best_params_)
# print('score: {}' .format(rf_random.best_score_))

#### Best parameters for Random forest are :
* 'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20


In [None]:
# NOTE(review): these hyper-parameters differ from the "best parameters"
# quoted in the markdown above (600 trees, min_samples_split=5, max_depth=20)
# — confirm which set is intended.
# random_state is fixed so that the scores below are repeatable between runs.
rf = RandomForestRegressor(n_estimators=500, min_samples_split=2,
                           min_samples_leaf=1, max_features='sqrt',
                           max_depth=15, random_state=0)
rf.fit(train_x,train_y)
y_pred = rf.predict(test_x)

# Cross-validate on the training split. cross_val_score refits a clone on
# every fold, so passing the test split (as before) trained models on the
# held-out rows.
RF_score = cross_val_score(rf,train_x,train_y,cv=10)
print('Cross Val Score: ', RF_score.mean())

# Hold-out RMSE of the model fitted on the training split.
RF_RMSE = np.sqrt(mean_squared_error(test_y, y_pred))
print('Root Mean Squared Error:', RF_RMSE)

In [None]:
# (message template, cross-validation scores, hold-out RMSE) for every model, in the order they were fitted.
model_rows = [
    ('Linear Regression score: {} , RMSE: {}', linear_score, linear_RMSE),
    ('Ridge Regression score: {} , RMSE: {}', ridge_score, ridge_RMSE),
    ('Lasso Regression score: {} , RMSE: {}', lasso_score, lasso_RMSE),
    ('KNN Regression score: {} , RMSE: {}', knn_score, knn_RMSE),
    ('SVM Regression score: {} , RMSE: {}', SVM_score, SVM_RMSE),
    ('Random Forest score: {} , RMSE: {}', RF_score, RF_RMSE),
]
for template, cv_scores, rmse in model_rows:
    print(template.format(cv_scores.mean(), rmse))