In [None]:
import pandas as pd
import numpy as np
 
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

#For evaluation
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw listings file and report its dimensions.
raw_path = 'airbnb.csv'
df = pd.read_csv(raw_path, low_memory=False)
print(df.shape)
df.dtypes
# All cleaning happens on a copy so the frame read from disk stays available.
airbnb = df.copy()

In [None]:
airbnb.head()
# Restrict the analysis to listings whose city is 'NYC'.
is_nyc = airbnb['city'] == 'NYC'
airbnb = airbnb.loc[is_nyc]

In [None]:
# Drop rows where any of the size features is exactly 0: one zero factor makes the product 0.
# A row with a missing value has a NaN product, NaN != 0 is True, so such rows are kept here.
size_product = airbnb.beds * airbnb.accommodates * airbnb.bathrooms * airbnb.bedrooms
airbnb = airbnb.loc[size_product != 0]
# Keep only rows with a strictly positive target value.
has_valid_price = airbnb.log_price > 0
airbnb = airbnb.loc[has_valid_price]

In [None]:
# Cleaning dataset columns

In [None]:
# Columns judged unhelpful for price prediction (identifiers, free text, dates, ...).
unhelpful_columns = [
    'id', 'description', 'host_has_profile_pic', 'name', 'thumbnail_url',
    'first_review', 'host_since', 'last_review', 'city', 'zipcode',
    'host_response_rate',
]
airbnb.drop(columns=unhelpful_columns, inplace=True)

In [None]:
# Check correlation to keep right features.
# The frame still holds string columns at this point (room_type, amenities, ...), and
# DataFrame.corr() raises on those in pandas >= 2.0, so correlate numeric/bool columns only.
numeric_features = airbnb.select_dtypes(include=['number', 'bool'])
corrs = numeric_features.corr()
plt.subplots(figsize=(8,6))
sns.heatmap(corrs, vmax=0.8, square=True)

In [None]:
# Count missing values per column.
airbnb.isna().sum()
# review_scores_rating has too many nulls to be useful, so remove the whole column.
airbnb = airbnb.drop(columns=['review_scores_rating'])

In [None]:
# Fill remaining nulls for baths with mean value.
# Assign the filled Series back to the column: calling fillna(inplace=True) on
# airbnb['bathrooms'] acts on an intermediate Series, which is deprecated in recent pandas
# and leaves the frame unchanged when copy-on-write is enabled.
bathrooms_mean = airbnb['bathrooms'].mean()
airbnb['bathrooms'] = airbnb['bathrooms'].fillna(bathrooms_mean)
airbnb.isnull().sum()

In [None]:
# Listings without a neighbourhood cannot be assigned a neighbourhood level later; drop them.
airbnb = airbnb.dropna(subset=['neighbourhood'])
airbnb.isna().sum()

In [None]:
# Missing bed / bedroom counts are treated as 0.
for column in ('beds', 'bedrooms'):
  airbnb[column] = airbnb[column].fillna(0)
airbnb.isna().sum()

In [None]:
# Feature engineering

In [None]:
# Replace each binary category with its integer category code.
# Note: cat.codes encodes a missing value as -1, so nulls in these columns no longer
# show up in the null count below.
binary_columns = ['cleaning_fee', 'host_identity_verified', 'instant_bookable']
for column in binary_columns:
  airbnb[column] = airbnb[column].astype('category').cat.codes
# Make sure there are no more nulls
airbnb.isna().sum()

In [None]:
# Two frames are kept from here on: `airbnb` and `airbnb_dummies`, the latter carrying
# one-hot encoded categoricals for models that need numeric inputs.
airbnb.room_type.value_counts()
# One-hot encode room_type, leaving out the first level as the reference category.
room_dummies = pd.get_dummies(airbnb['room_type'], prefix='room', drop_first=True)
airbnb_dummies = pd.concat([airbnb, room_dummies], axis=1)

In [None]:
# room_type has been encoded; remove the raw column from both frames.
for frame in (airbnb, airbnb_dummies):
  frame.drop(columns=['room_type'], inplace=True)

In [None]:
# Convert property_type categorical
airbnb.property_type.value_counts()
# Based on value counts, fold the rarer property types into broader groups.
# Key = group name, value = raw property_type labels that are renamed to that group.
# Labels not listed here are left as they are.
property_type_map = {
    'Apartment': ['Condominium', 'Loft', 'Serviced apartment', 'Guest suite'],
    'House': ['Vacation home', 'Villa', 'Townhouse', 'In-law', 'Casa particular'],
    'Hostel': ['Dorm', 'Hostel', 'Guesthouse'],
    'Hotel': ['Boutique hotel', 'Bed & Breakfast'],
    'Timeshare': ['Timeshare'],
    'Other': ['Island', 'Castle', 'Yurt', 'Hut', 'Chalet', 'Treehouse',
              'Earth House', 'Tipi', 'Cave', 'Train', 'Parking Space', 'Lighthouse',
              'Tent', 'Boat', 'Cabin', 'Camper/RV', 'Bungalow'],
}
# Invert the map: raw label -> group name.
property_type_group = {}
for group_name, raw_types in property_type_map.items():
    for raw_type in raw_types:
        property_type_group[raw_type] = group_name

airbnb['property_group'] = airbnb['property_type'].replace(property_type_group)
airbnb.drop(columns='property_type', inplace=True)
# Same grouping on the dummy frame, then one-hot encode it (first level left out).
airbnb_dummies['property_group'] = airbnb_dummies['property_type'].replace(property_type_group)
property_dummies = pd.get_dummies(airbnb_dummies['property_group'], prefix='property', drop_first=True)
airbnb_dummies = pd.concat([airbnb_dummies, property_dummies], axis=1)

In [None]:
# The property dummies replace both the raw and the grouped property columns.
airbnb_dummies.drop(columns=['property_type', 'property_group'], inplace=True)

In [None]:
# Check cancellation policy
airbnb.cancellation_policy.value_counts()
# Based on value counts, the two super-strict policies are negligible: drop those rows
# from both frames so they stay row-aligned.
negligible_policies = ['super_strict_30', 'super_strict_60']
airbnb = airbnb[~airbnb.cancellation_policy.isin(negligible_policies)]
airbnb_dummies = airbnb_dummies[~airbnb_dummies.cancellation_policy.isin(negligible_policies)]
# One-hot encode the remaining policies, leaving out the first level as the reference.
cancellation_dummies = pd.get_dummies(
    airbnb_dummies['cancellation_policy'], prefix='cancellation', drop_first=True
)
airbnb_dummies = pd.concat([airbnb_dummies, cancellation_dummies], axis=1)

In [None]:
# cancellation_policy is now represented by its dummy columns.
airbnb_dummies = airbnb_dummies.drop(columns='cancellation_policy')

In [None]:
# Convert bed_type categorical
airbnb['bed_type'].value_counts()
# bed_type is reduced to a binary flag: real bed or not.
def bed_type_func(row):
  """Return 1 when the row's 'bed_type' entry equals 'Real Bed', otherwise 0."""
  return 1 if row.loc['bed_type'] == 'Real Bed' else 0

# Flag listings with a real bed (1) versus any other bed type (0).
# A vectorised comparison gives the same 0/1 values as applying bed_type_func to every row,
# without the per-row Python call, and it also works on an empty frame.
airbnb['real_bed'] = (airbnb['bed_type'] == 'Real Bed').astype(int)
airbnb.drop('bed_type',axis=1,inplace=True)
airbnb_dummies['real_bed'] = (airbnb_dummies['bed_type'] == 'Real Bed').astype(int)
airbnb_dummies.drop('bed_type',axis=1,inplace=True)

In [None]:
# Use lat, long to find distance to prime locations such as times square, train station
def _min_max_scale(values):
  """Rescale a Series linearly so that its minimum maps to 0 and its maximum to 1."""
  return (values - values.min()) / (values.max() - values.min())

# Each frame is scaled using its own min/max.
for frame in (airbnb, airbnb_dummies):
  frame['latitude_north'] = _min_max_scale(frame.latitude)
  frame['longitude_east'] = _min_max_scale(frame.longitude)

In [None]:
# Reference points as (latitude, longitude), keyed by the feature column they produce.
# NOTE(review): column names suggest Times Square, Grand Central and Wall Street - confirm.
landmarks = {
    'dist_t_squre': (40.758896, -73.985130),
    'dist_gc_train': (40.752655, -73.977295),
    'dist_w_street': (40.706005, -74.008827),
}

# Straight-line (Euclidean) distance computed directly on the coordinate values.
for frame in (airbnb, airbnb_dummies):
    for column, (landmark_lat, landmark_lon) in landmarks.items():
        frame[column] = np.sqrt(
            (landmark_lat - frame['latitude'])**2 + (landmark_lon - frame['longitude'])**2
        )

In [None]:
# The raw coordinates are replaced by the scaled and distance features; drop them.
for frame in (airbnb, airbnb_dummies):
  frame.drop(columns=['latitude', 'longitude'], inplace=True)

In [None]:
# Use neighbourhood to create levels based on cost per room.
# Listings with 0 bedrooms (missing counts were filled with 0 above) would give an infinite
# price per room, and one such listing turns its whole neighbourhood mean into inf.
# Treat a 0 bedroom count as missing instead so the group mean skips those listings.
bedrooms_nonzero = airbnb['bedrooms'].replace(0, np.nan)
airbnb['price_per_room'] = airbnb['log_price'] / bedrooms_nonzero
# Mean price per room for every neighbourhood, sorted ascending.
nhood_avg_price = airbnb.groupby('neighbourhood')['price_per_room'].mean().sort_values()

In [None]:
# Neighbourhoods whose mean is infinite or missing get the average of the other neighbourhoods.
nhood_avg_price = nhood_avg_price.replace(np.inf, np.nan)
nhood_avg_price = nhood_avg_price.fillna(nhood_avg_price.mean())
# One row per neighbourhood, single column 'price_per_room'.
nhood_class_df = nhood_avg_price.to_frame()
def neigbourhood_class(row):
  """Map a row's 'price_per_room' to a neighbourhood level from 1 (cheapest) to 4.

  Levels 1-3 cover the ranges [0, 3.683610], (3.683610, 3.868928] and
  (3.868928, 4.194452]; anything else (higher, negative or NaN) is level 4.
  """
  price = row['price_per_room']
  if 0 <= price <= 3.683610:
    return 1
  if 3.6836100 < price <= 3.868928:
    return 2
  if 3.868928 < price <= 4.194452:
    return 3
  return 4
  
# Classify every neighbourhood, keep only the level, and attach it to both frames
# by matching their 'neighbourhood' column against the lookup's index.
nhood_class_df['neigbourhood_level'] = nhood_class_df.apply(neigbourhood_class, axis=1)
nhood_class_df = nhood_class_df.drop(columns='price_per_room')
airbnb = airbnb.join(nhood_class_df, on='neighbourhood')
airbnb_dummies = airbnb_dummies.join(nhood_class_df, on='neighbourhood')

In [None]:
# neighbourhood is now represented by neigbourhood_level; drop the raw column.
for frame in (airbnb, airbnb_dummies):
  frame.drop(columns=['neighbourhood'], inplace=True)

In [None]:
import plotly.express as px
# The raw 'latitude'/'longitude' columns were dropped from `airbnb` earlier in the notebook,
# so plot the min-max scaled coordinates that are still present. The scaling is linear,
# so the spatial layout of the listings is unchanged.
fig = px.scatter(airbnb, x='longitude_east', y='latitude_north',
                 color='neigbourhood_level', width=800, height=600)
fig.update_layout(xaxis_title="longitude (min-max scaled)",
                  yaxis_title="latitude (min-max scaled)")
fig.show()

In [None]:
# Handle list of amenities
# Each cell is one string: drop its first and last character, split on commas and trim
# quotes / brackets / spaces from every token.
l = [[word.strip('[" ]') for word in row[1:-1].split(',')] for row in airbnb['amenities']]
# Sorted list instead of a set: gives a stable column order, and newer pandas releases
# refuse a set as `columns`.
cols = sorted({word for row in l for word in row})
print(cols)
# Build every 0/1 indicator row first and create the frame once; filling an empty frame
# cell by cell through .loc is prohibitively slow for tens of thousands of rows.
# Rows are numbered 0..n-1 in the order of `airbnb`.
amenities_df = pd.DataFrame(
    [[int(col in present) for col in cols] for present in map(set, l)],
    columns=cols,
)

In [None]:
# Group together the similar ones: every group column counts how many of its member
# amenities a listing offers.
# 'Cooking basics' used to be added twice to the kitchen group; it is counted once here.
# NOTE(review): 'Wide entryway' sits in electric_tech although it reads like an
# accessibility amenity - left where it was, confirm the intended group.
amenity_groups = {
    'kitchen': [
        'Kitchen', 'Breakfast', 'Cooking basics', 'BBQ grill', 'Oven', 'Coffee maker',
        'Microwave', 'Refrigerator', 'Dishwasher',
    ],
    'accesibility': [
        'Free parking on premises', 'Wide clearance to bed', 'smooth pathway to front door',
        'Ground floor access', 'Lake access', 'Wheelchair accessible',
        'Wide clearance to shower & toilet', 'Wide hallway clearance', 'Wide doorway',
        'Accessible-height toilet', 'Step-free access', 'Well-lit path to entrance',
        'Waterfront', 'Free parking on street', 'Disabled parking spot',
        'Accessible-height bed', 'Private entrance', 'Elevator',
    ],
    'electric_tech': [
        'Wide entryway', 'Air conditioning', 'Ethernet connection', 'Cable TV', 'Internet',
        'EV charger', 'Baby monitor', 'TV', 'Wireless Internet', 'Pocket wifi', 'Washer',
        'Dryer', 'Keypad', 'Game console', 'Washer / Dryer', 'Hair dryer',
    ],
    'facilities': [
        'Private living room', 'Air purifier', 'Handheld shower head', 'Hot water kettle',
        'Extra pillows and blankets', 'Hot tub', 'Pets live on this property', 'Heating',
        'Dishes and silverware', 'Patio or balcony', 'Bed linens', 'First aid kit', 'Crib',
        'Flat', 'Laptop friendly workspace', 'Buzzer/wireless intercom', 'Firm mattress',
        'Iron', 'Changing table', 'Hangers', 'Roll-in shower with chair', 'Gym',
        'Outlet covers', 'Essentials', 'Private bathroom', 'Baby bath', 'Bathtub', 'Shampoo',
        'Beachfront', 'Single level home', 'Hot water', 'High chair',
        'Bathtub with shower chair', 'Pool', 'Fixed grab bars for shower & toilet',
        'Room-darkening shades', 'Beach essentials', 'Garden or backyard',
    ],
    'kids_friendly': [
        'Babysitter recommendations', 'Family/kid friendly', 'Children’s books and toys',
        'Children’s dinnerware',
    ],
    'security': [
        'Window guards', 'Stair gates', 'Fireplace guards', 'Doorman',
        'Carbon monoxide detector', 'Smoke detector', 'Table corner guards',
        'Fire extinguisher', 'Lock on bedroom door', 'Smart lock', 'Lockbox',
    ],
    'services': [
        'Ski in/Ski out', 'Cleaning before checkout', 'Long term stays allowed',
        'Other pet(s)', 'Cat(s)', 'Self Check-In', '24-hour check-in', 'Host greets you',
        'Luggage dropoff allowed', 'Pack ’n Play/travel crib', 'Pets allowed',
        'Suitable for events', 'Safety card', 'Indoor fireplace', 'Dog(s)', 'Smoking allowed',
    ],
}
# reindex(..., fill_value=0) makes an amenity that never occurs in the filtered data
# contribute 0 instead of raising KeyError; astype(int) keeps the sums numeric even if
# the indicator frame has object dtype.
amenities_group_df = pd.DataFrame({
    group: amenities_df.reindex(columns=members, fill_value=0).astype(int).sum(axis=1)
    for group, members in amenity_groups.items()
})
amenities_group_df.describe()

In [None]:
# Join it to original data.
# amenities_group_df is numbered 0..n-1 in the row order of `airbnb`, so both frames are
# renumbered the same way and joined on the index. Resetting the index (instead of adding
# a 'join_key' column) keeps an artificial row counter out of the model features.
assert len(airbnb) == len(amenities_group_df), "amenity rows do not match airbnb rows"
assert len(airbnb_dummies) == len(amenities_group_df), "amenity rows do not match airbnb_dummies rows"
airbnb = airbnb.reset_index(drop=True)
airbnb_cleaned = airbnb.join(amenities_group_df)
airbnb_dummies = airbnb_dummies.reset_index(drop=True)
airbnb_dummies_cleaned = airbnb_dummies.join(amenities_group_df)

In [None]:
# The raw amenities string is replaced by the grouped counts; drop it from both results.
for frame in (airbnb_cleaned, airbnb_dummies_cleaned):
  frame.drop(columns=['amenities'], inplace=True)

In [None]:
# Trying sentiment analysis
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
# The descriptions are read from a separate file; note that this rebinds `df`.
df = pd.read_csv('train.csv')
df.shape
df["description"].value_counts()
df.loc[df["description"] == ""]

# Clean and process texts: English stop words as a set for fast membership tests.
english_stopwords = stopwords.words("english")
stopwords_list = set(english_stopwords)
# Characters stripped from every word. The backslash is written as an escaped "\\" because
# a bare "\," is an invalid escape sequence (a warning today, an error in future Pythons).
punctuations = """!()-![]{};:,+'"\\,<>./?@#$%^&*_~Â""" #List of punctuation to remove
# Translation table built once instead of once per word.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations)

def descriptionParse(description):
    """Remove the characters in `punctuations` from every whitespace-separated word.

    Each cleaned word keeps a trailing space and the words are joined with a space,
    so the words come out separated by two spaces. Non-string input (for example
    the NaN pandas uses for a missing description) yields an empty string.
    """
    if not isinstance(description, str):
        return ""
    return " ".join(word.translate(_PUNCTUATION_TABLE) + " " for word in description.split())
  
def clean_description(description):
    """Lower-case a description and keep only alphabetic, non-stop-word tokens.

    Tokens are split on whitespace. Each one is lower-cased before it is checked against
    `stopwords_list`, so capitalised stop words such as "The" are removed as well.
    Returns the surviving tokens joined by single spaces.
    """
    clean_words = []
    for w in description.split():
        lowered = w.lower()
        if lowered.isalpha() and lowered not in stopwords_list:
            clean_words.append(lowered)
    return " ".join(clean_words)

from sklearn.feature_extraction.text import TfidfVectorizer

# Strip punctuation, then remove stop words / non-alphabetic tokens.
# The cleaned text overwrites the "description" column.
parsed_descriptions = df["description"].apply(descriptionParse)
df["description"] = parsed_descriptions.apply(clean_description)

# TF-IDF is fitted on the first 7000 cleaned descriptions only.
docs = df['description'].iloc[:7000].tolist()
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features=20000)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# polarity_scores() takes a text and returns a dict of scores, so it is applied to the
# description text (the TF-IDF sparse matrix has no .apply). The 'compound' entry is
# stored as the single numeric rating per row.
df['rating'] = df['description'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
df.tail(3)

# Label by the sign of the compound score.
df['sentiment'] = df['rating'].apply(lambda x: 'positive' if x >0 else 'neutral' if x==0 else 'negative')
df.head(4)

def sentiment(rating):
  """Encode a sentiment label: 'negative' -> 0, 'positive' -> 2, anything else -> 1."""
  if rating == 'negative':
    return 0
  return 2 if rating == 'positive' else 1
# Encode the sentiment labels. Encoding the description text instead made every row fall
# through to the default value 1.
df['sentiment'] = df['sentiment'].apply(sentiment)

# Earlier result (computed from the description text): mostly 1's & didn't improve model
# performance when combined with airbnb_dummies, leave out for now.

In [None]:
# Sanity check before modelling: remaining nulls and column dtypes.
airbnb_dummies_cleaned.isna().sum()
airbnb_dummies_cleaned.dtypes

In [None]:
# Split data for model
# Features are everything except the target. The 'join_key' helper column (a plain row
# counter used to attach the amenity groups) is dropped too when present, since it
# carries no information about the listing.
X = airbnb_dummies_cleaned.drop(columns=['log_price']).drop(columns=['join_key'], errors='ignore')
y = airbnb_dummies_cleaned.log_price

# Splitting data to train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=700)

# With scaler: learn mean / variance on the training split only and reuse them for the
# test split. Re-fitting on the test data would scale it differently from what the
# models were trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Ordinary least-squares baseline fitted on the scaled training split.
linear_model_1 = LinearRegression()
linear_model_1.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the linear model on both splits (MSE and R^2, rounded to 4 decimals).
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_test_predm1 = linear_model_1.predict(X_test)
y_train_predm1 = linear_model_1.predict(X_train)

train_mse = round(mean_squared_error(y_train, y_train_predm1), 4)
test_mse = round(mean_squared_error(y_test, y_test_predm1), 4)
train_r2 = round(r2_score(y_train, y_train_predm1), 4)
test_r2 = round(r2_score(y_test, y_test_predm1), 4)

print("Training MSE:", train_mse)
print("Validation MSE:", test_mse)
print("\nTraining r2:", train_r2)
print("Validation r2:", test_r2)

In [None]:
# Predicted vs actual for the linear model; the red line marks perfect predictions.
ax = sns.scatterplot(x=y_test, y=y_test_predm1)
ax.plot(y_test, y_test, 'r')
ax.set_title('Actual vs Predicted price linear regression')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

In [None]:
# Lasso model: fit one model per regularisation strength and report its scores.
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

alpha = np.linspace(0.01,0.4,10)
# The loop over `alpha` was missing (`i` was undefined and the prints were indented as a
# loop body). After the loop, `lasso` and the prediction arrays belong to the last alpha.
for i in range(len(alpha)):
    lasso = Lasso(alpha = alpha[i])
    lasso.fit(X_train,y_train)
    y_train_predm2 = lasso.predict(X_train)
    y_test_predm2 = lasso.predict(X_test)
    print("alpha = ", round(alpha[i], 4))
    print("Training MSE:", round(mean_squared_error(y_train, y_train_predm2),4))
    print("Validation MSE:", round(mean_squared_error(y_test, y_test_predm2),4))
    print("\nTraining r2:", round(r2_score(y_train, y_train_predm2),4))
    print("Validation r2:", round(r2_score(y_test, y_test_predm2),4))

In [None]:
# Predicted vs actual for the Lasso model. Uses the Lasso predictions (y_test_predm2);
# the plot previously showed the linear-regression predictions under the Lasso title.
ax = sns.scatterplot(x=y_test, y=y_test_predm2)
ax.plot(y_test, y_test, 'r')
ax.set(title='Actual vs Predicted price Lasso regression')
plt.xlabel('Actual')
plt.ylabel('Predicted')

In [None]:
# RandomForest, GradientBoosting
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Running model random forest.
# random_state is fixed (same seed as the train/test split) so the reported scores are
# reproducible across runs.
rfm = RandomForestRegressor(
          max_depth = 10,
          n_jobs = -1,
          n_estimators = 10,
          random_state = 700
)

# Fit the model on training data
rfm.fit(X_train, y_train)

# Predict on the training split
y_train_predm2 = rfm.predict(X_train)

# Validate on the held-out split
y_test_predm2 = rfm.predict(X_test)

print("Training MSE:", round(mean_squared_error(y_train, y_train_predm2),4))
print("Validation MSE:", round(mean_squared_error(y_test, y_test_predm2),4))
print("\nTraining r2:", round(r2_score(y_train, y_train_predm2),4))
print("Validation r2:", round(r2_score(y_test, y_test_predm2),4))

In [None]:
# Predicted vs actual for the random forest; the red line marks perfect predictions.
ax = sns.scatterplot(x=y_test, y=y_test_predm2)
ax.plot(y_test, y_test, 'r')
ax.set_title('Actual vs Predicted price Random Forest')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

In [None]:
# Running model Gradient boosting.
# random_state is fixed (same seed as the train/test split) so the scores are reproducible.
gbr = GradientBoostingRegressor(random_state=700)

# Fit the model on training data
gbr.fit(X_train, y_train)

# Predict on the training split
y_train_predm3 = gbr.predict(X_train)

# Validate on the held-out split
y_test_predm3 = gbr.predict(X_test)

print("Training MSE:", round(mean_squared_error(y_train, y_train_predm3),4))
print("Validation MSE:", round(mean_squared_error(y_test, y_test_predm3),4))
print("\nTraining r2:", round(r2_score(y_train, y_train_predm3),4))
print("Validation r2:", round(r2_score(y_test, y_test_predm3),4))

In [None]:
# Predicted vs actual for gradient boosting; the red line marks perfect predictions.
ax = sns.scatterplot(x=y_test, y=y_test_predm3)
ax.plot(y_test, y_test, 'r')
ax.set_title('Actual vs Predicted price gradient boosting regression')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

In [None]:
# KNN model: try k = 1..11 and report the scores for each k.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# X_train / X_test are already standardised by the scaler in the split cell; the
# Xs_train / Xs_test names used before are not defined anywhere in this notebook.
for num in range(1, 12):
    knn_model = KNeighborsRegressor(n_neighbors=num).fit(X_train, y_train)
    y_test_predm4 = knn_model.predict(X_test)
    y_train_predm4 = knn_model.predict(X_train)
    print("k = ", num)
    print("Training MSE:", round(mean_squared_error(y_train, y_train_predm4),4))
    print("Validation MSE:", round(mean_squared_error(y_test, y_test_predm4),4))
    print("\nTraining r2:", round(r2_score(y_train, y_train_predm4),4))
    print("Validation r2:", round(r2_score(y_test, y_test_predm4),4))

In [None]:
# Neural network model 1 sklearn
from sklearn.neural_network  import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# random_state fixes the weight initialisation so the scores are reproducible.
mlp = MLPRegressor(activation='relu', max_iter=1000, random_state=700)
# X_train / X_test are already standardised by the scaler in the split cell; the
# Xs_train / Xs_test names used before are not defined anywhere in this notebook.
mlp.fit(X_train, y_train)
y_test_predm5 = mlp.predict(X_test)
y_train_predm5 = mlp.predict(X_train)
print("Training MSE:", round(mean_squared_error(y_train, y_train_predm5),4))
print("Validation MSE:", round(mean_squared_error(y_test, y_test_predm5),4))
print("\nTraining r2:", round(r2_score(y_train, y_train_predm5),4))
print("Validation r2:", round(r2_score(y_test, y_test_predm5),4))

In [None]:
# Predicted vs actual for the sklearn neural network; the red line marks perfect predictions.
ax = sns.scatterplot(x=y_test, y=y_test_predm5)
ax.plot(y_test, y_test, 'r')
ax.set_title('Actual vs Predicted price neural network')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

In [None]:
# Neural network model 2 keras
from keras import models, layers, Input, Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import BatchNormalization
from keras import optimizers
from keras import callbacks

In [None]:
# Fully connected regression network: 128 -> 256 -> 128 ReLU units, one linear output.
n_features = X_train.shape[1]
nn2 = models.Sequential([
    layers.Dense(128, input_shape=(n_features,), activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='linear'),
])
nn2.compile(loss='mean_squared_error',
            optimizer='adam',
            metrics=['mean_squared_error'])

# Feature matrices are passed to Keras as float32 arrays.
X2 = np.asarray(X_train).astype('float32')
X3 = np.asarray(X_test).astype('float32')

# 10% of the training rows are held out by Keras for per-epoch validation.
nn2.fit(X2, y_train, epochs=100, batch_size=256, validation_split=0.1)

y_test_predm6 = nn2.predict(X3)
y_train_predm6 = nn2.predict(X2)

train_mse = round(mean_squared_error(y_train, y_train_predm6), 4)
test_mse = round(mean_squared_error(y_test, y_test_predm6), 4)
train_r2 = round(r2_score(y_train, y_train_predm6), 4)
test_r2 = round(r2_score(y_test, y_test_predm6), 4)
print("Training MSE:", train_mse)
print("Validation MSE:", test_mse)
print("\nTraining r2:", train_r2)
print("Validation r2:", test_r2)

In [None]:
# Xgboost Regressor model
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

xgboost_model = xgb.XGBRegressor()

# Fit the model on training data
xgboost_model.fit(X_train, y_train)

# Applying k-Fold Cross Validation on the full (unscaled) feature frame.
# With no `scoring` argument cross_val_score uses the estimator's own score(), which for
# a regressor is R^2 - so report it as R^2 rather than as an accuracy percentage.
kfold = KFold(n_splits=10)
results = cross_val_score(xgboost_model, X, y, cv=kfold)
print("Cross-validated r2: %.4f (+/- %.4f)" % (results.mean(), results.std()))

# Predict on the training split
predict_train = xgboost_model.predict(X_train)

# Validate on the held-out split
predict_val = xgboost_model.predict(X_test)

print("\nTraining MSE:", round(mean_squared_error(y_train, predict_train),4))
print("Validation MSE:", round(mean_squared_error(y_test, predict_val),4))
print("\nTraining r2:", round(r2_score(y_train, predict_train),4))
print("Validation r2:", round(r2_score(y_test, predict_val),4))

# Predicted vs actual; the red line marks perfect predictions.
ax = sns.scatterplot(x=y_test, y=predict_val)
ax.plot(y_test, y_test, 'r')
ax.set(title='Actual vs Predicted price XGBoost regression')
plt.xlabel('Actual')
plt.ylabel('Predicted')