# Airbnb price prediction in New York City

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Loading and exploring Dataset

The data used in this notebook is publicly available at [Inside Airbnb](http://insideairbnb.com/get-the-data.html).
We use only the New York City listings for the year 2019.

In [None]:
# Load the 2019 NYC listings; the CSV is expected next to this notebook.
airbnb_df = pd.read_csv("AB_NYC_2019.csv")

In [None]:
# Preview the first five listings.
airbnb_df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
airbnb_df.describe()

As expected, none of the price values is negative, so there is no obvious data-collection error in that column. The mean price is about 152 dollars and the median price is 106 dollars.

### Check for the null values in each column

In [None]:
# Column dtypes and non-null counts, as a first look at where values are missing.
airbnb_df.info()

In [None]:
# Number of missing values per column.
airbnb_df.isna().sum()

As we can see above, some features have missing values. We will address this later.

## Data Visualization

In [None]:
# Number of listings per neighbourhood group.
# The column is passed by keyword: seaborn >= 0.12 makes x/y keyword-only, so
# a positional Series is no longer interpreted as the x variable.
plt.figure(figsize=(10,10))
sns.countplot(x='neighbourhood_group', data=airbnb_df, palette="viridis")
plt.title('Neighbourhood Group');

This shows that most Airbnb listings in New York are in Brooklyn and Manhattan.

In [None]:
# Number of listings per room type.
# The column is passed by keyword: seaborn >= 0.12 makes x/y keyword-only, so
# a positional Series is no longer interpreted as the x variable.
plt.figure(figsize=(10,10))
sns.countplot(x='room_type', data=airbnb_df, palette="viridis")
plt.title('Room type');

Broken down by neighbourhood group (see the next plot), entire homes/apartments are the most common listing type in Manhattan, while in Brooklyn private rooms and entire homes/apartments are listed in nearly equal numbers.

In [None]:
# Listings per neighbourhood group, split by room type.
# Columns are passed by keyword (required by seaborn >= 0.12), and the title
# now differs from the plain room-type plot so the two figures can be told apart.
plt.figure(figsize=(10,10))
sns.countplot(x='neighbourhood_group', hue='room_type', data=airbnb_df, palette="viridis")
plt.title('Room type by neighbourhood group');

In [None]:
def create_price_plot(airbnb_df, type_room,
                      neighbourhood_groups=('Brooklyn', 'Bronx', 'Staten Island', 'Queens', 'Manhattan'),
                      percentile_range=(25, 90)):
    """Plot price percentiles of one room type, one line per neighbourhood group.

    For every neighbourhood group the prices of listings whose ``room_type``
    equals ``type_room`` are reduced to integer-truncated percentiles and drawn
    as a line on the current matplotlib axes.

    Parameters
    ----------
    airbnb_df : pandas.DataFrame
        Listings with at least the columns ``neighbourhood_group``,
        ``room_type`` and ``price``. The frame is only read, never modified.
    type_room : str
        Value of ``room_type`` to plot (it is also used in the title).
    neighbourhood_groups : iterable of str, optional
        Groups to draw, in legend order. Defaults to the five NYC boroughs.
    percentile_range : tuple of (int, int), optional
        Inclusive range of percentiles on the x axis. Defaults to 25..90.

    Notes
    -----
    * Rows are selected with a boolean mask and ``.loc``; the previous
      ``.iloc[subset.index]`` treated index labels as positions and was only
      correct for an untouched 0..n-1 index.
    * Groups without any listing of ``type_room`` are skipped instead of
      failing on ``int(nan)``.
    * The former in-place ``fillna`` of ``reviews_per_month`` was removed: it
      did not affect the plot and, depending on the pandas version, could
      modify the caller's DataFrame.
    """
    first_percentile, last_percentile = percentile_range
    percentiles = np.arange(first_percentile, last_percentile + 1)

    plt.title('Prices of ' + type_room, fontsize=15, color='Red')
    sns.set_style("darkgrid")

    has_room_type = airbnb_df['room_type'] == type_room
    ax = None
    for group in neighbourhood_groups:
        in_group = airbnb_df['neighbourhood_group'] == group
        prices = airbnb_df.loc[has_room_type & in_group, 'price']
        # Infinite prices would distort the quantiles; treat them as missing.
        prices = prices.replace([np.inf, -np.inf], np.nan).dropna()
        if prices.empty:
            continue  # nothing to draw for this group

        # One vectorised quantile call; astype(int) truncates like int() did.
        percentile_prices = prices.quantile(percentiles / 100).astype(int)
        ax = sns.lineplot(x=pd.Series(percentiles),
                          y=pd.Series(percentile_prices.to_numpy()),
                          label=group)

    if ax is not None:
        ax.set(xlabel='Percentiles', ylabel='Percentile Prices in U.S $')

In [None]:
# Price percentiles of shared rooms, one line per neighbourhood group.
create_price_plot(airbnb_df, type_room='Shared room')

The plot above compares shared-room prices across neighbourhood groups: shared rooms are cheapest in the Bronx and most expensive in Manhattan.

In [None]:
# Price percentiles of private rooms, one line per neighbourhood group.
create_price_plot(airbnb_df, type_room='Private room')

In [None]:
# Price percentiles of entire homes/apartments, one line per neighbourhood group.
create_price_plot(airbnb_df, type_room='Entire home/apt')

### Map of New York

<img src="nyc_map.png">

In [None]:
# Listing locations coloured by neighbourhood group.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
# plt.show() replaces plt.ioff(), which only toggled interactive mode and was
# being used to hide the cell's text output.
plt.figure(figsize=(10,6))
sns.scatterplot(x='longitude', y='latitude', hue='neighbourhood_group', data=airbnb_df)
plt.title('Listings by neighbourhood group')
plt.show()

The above plot shows the location of every listing, coloured by neighbourhood group. We can see that most Airbnb listings are in Brooklyn and Manhattan.

In [None]:
# Listing locations coloured by room type.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
# plt.show() replaces plt.ioff(), which only toggled interactive mode and was
# being used to hide the cell's text output.
plt.figure(figsize=(10,6))
sns.scatterplot(x='longitude', y='latitude', hue='room_type', data=airbnb_df)
plt.title('Listings by room type')
plt.show()

The above plot shows the room type of each Airbnb listing in NYC.

In [None]:
# Listing locations coloured by the availability_365 column.
# x/y are passed by keyword: seaborn >= 0.12 rejects positional x, y.
# plt.show() replaces plt.ioff(), which only toggled interactive mode and was
# being used to hide the cell's text output.
plt.figure(figsize=(10,6))
sns.scatterplot(x='longitude', y='latitude', hue='availability_365', data=airbnb_df, palette="viridis")
plt.title('Listings by availability_365')
plt.show()

## Modelling

#### Drop unnecessary columns

In [None]:
# Remove columns that are not used as model features.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# keeps the cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
airbnb_df = airbnb_df.drop(
    columns=['host_id','host_name','latitude','longitude','neighbourhood','last_review'],
    errors='ignore',
)
# examining the changes
airbnb_df.head(5)

#### Encode categorical columns

In [None]:
# Work on a copy so the un-encoded frame stays available for plotting.
airbnb_en_df = airbnb_df.copy()
# Integer-encode the categorical columns that are present; codes follow the
# order in which each category first appears.
categorical_columns = [
    name for name in airbnb_en_df.columns
    if name in ('neighbourhood_group', 'room_type')
]
for name in categorical_columns:
    codes, _ = airbnb_en_df[name].factorize()
    airbnb_en_df[name] = codes
airbnb_en_df.head()

In [None]:
# Fill missing reviews_per_month with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected Series is a chained in-place update, which is deprecated and
# does not modify the DataFrame under pandas copy-on-write.
reviews_per_month_mean = airbnb_en_df['reviews_per_month'].mean()
airbnb_en_df['reviews_per_month'] = airbnb_en_df['reviews_per_month'].fillna(reviews_per_month_mean)
airbnb_en_df.isnull().sum()

In [None]:
# Keep only listings that have a name.
# .copy() makes the filtered frame independent, so adding columns to it later
# does not trigger SettingWithCopyWarning or write to a temporary slice.
airbnb_en_df = airbnb_en_df[airbnb_en_df['name'].notna()].copy()
airbnb_en_df.isnull().sum()

#### Multicollinearity

In [None]:
# Kendall rank correlation between the numeric columns.
# Non-numeric columns (e.g. the listing name) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and raises on string data.
numeric_columns_df = airbnb_en_df.select_dtypes(include='number')
corr = numeric_columns_df.corr(method='kendall')
plt.figure(figsize=(18,12))
sns.heatmap(corr, annot=True)
plt.title('Kendall correlation of numeric features');

### Multiple Linear Regression

In [None]:
# Eigenvalues of the correlation matrix; values close to zero would indicate
# near-linear dependence between features.
# The matrix is symmetric, so eigvalsh is used: it returns real eigenvalues in
# ascending order, and the unused eigenvectors are no longer computed.
multicollinearity = np.linalg.eigvalsh(corr.to_numpy())
multicollinearity

None of the eigenvalues of the correlation matrix is close to zero, which means that there is no severe multicollinearity in the data.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Feature columns are selected by position.
# NOTE(review): presumably these are all columns except the id, name and price
# columns — confirm against the column order shown by airbnb_en_df.head().
feature_positions = [2, 3, 5, 6, 7, 8, 9]
x = airbnb_en_df.iloc[:, feature_positions]
y = airbnb_en_df['price']
# Hold out 10% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=123
)

In [None]:
# Ordinary least squares on the raw price.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. Unregularised least squares is invariant to
# feature scaling, so dropping it leaves the predictions and R^2 unchanged.
reg = LinearRegression()
reg.fit(x_train, y_train)

In [None]:
# R^2 of the linear model on the held-out rows.
y_pred = reg.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

#### Why is this not a good fit?

In [None]:
from scipy.stats import norm
# Price histogram with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; it is kept here
# so the figure stays identical.
fig, ax = plt.subplots(figsize=(10,10))
sns.distplot(airbnb_df['price'], fit=norm, ax=ax)
ax.set_title("Price Distribution Plot", size=15, weight='bold')

In [None]:
# Log-transform the price; the +1 keeps a price of 0 finite (log(1) == 0).
shifted_price = airbnb_en_df['price'] + 1
airbnb_en_df['price_log'] = np.log(shifted_price)
sns.distplot(airbnb_en_df['price_log'], fit=norm)

In [None]:
# Same positional feature selection as for the raw-price model, but the target
# is now the log-transformed price.
# NOTE(review): presumably these are all columns except the id, name and price
# columns — confirm against the column order shown by airbnb_en_df.head().
feature_positions = [2, 3, 5, 6, 7, 8, 9]
x = airbnb_en_df.iloc[:, feature_positions]
y = airbnb_en_df['price_log']
# Hold out 10% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=353
)

In [None]:
# Ordinary least squares on the log-transformed price.
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. Unregularised least squares is invariant to
# feature scaling, so dropping it leaves the predictions and R^2 unchanged.
reg = LinearRegression()
reg.fit(x_train, y_train)

In [None]:
# Predict the log-price for the held-out rows.
y_pred=reg.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the log-price linear model on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

### Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
# Degree-2 polynomial feature expansion followed by ordinary least squares.
degree = 2
polyreg = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(),
)
polyreg.fit(x_train, y_train)

In [None]:
# Predict the log-price for the held-out rows with the polynomial pipeline.
y_pred=polyreg.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the polynomial model on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

## Conclusion
In this notebook, I tried to predict prices with different regression models. In the next workshop, we will use a large dataset to illustrate classification algorithms.