In [1]:
#Business Understanding

#1. Can the price of an Airbnb listing be predicted after eliminating missing values in the data?

#2. Can the price of an Airbnb listing be better predicted after imputing the mean price for
#   missing prices in the data?

#3. Can the price of an Airbnb listing be better predicted after imputing the median price for
#   missing prices in the data?

#4. What are some possible reasons for the linear regression model performing so poorly?


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

#Data Understanding
#In the rows displayed below, available is 't' or 'f' and price is a '$'-prefixed
#string that is blank on the rows where available is 'f'.
df = pd.read_csv('calendar.csv') #reads in the calendar data set
df

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,
...,...,...,...,...
1393565,10208623,2016-12-29,f,
1393566,10208623,2016-12-30,f,
1393567,10208623,2016-12-31,f,
1393568,10208623,2017-01-01,f,


In [2]:
#Prepare Data
#NOTE: plain assignment does not copy; drop_missing_dummy and df are the same object here.
drop_missing_dummy = df #second name for df, later used to find the median of price

In [3]:
#Displays the frame; at this point it is still the same object as df.
drop_missing_dummy

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,
...,...,...,...,...
1393565,10208623,2016-12-29,f,
1393566,10208623,2016-12-30,f,
1393567,10208623,2016-12-31,f,
1393568,10208623,2017-01-01,f,


In [4]:
#Prepare Data
#dropna() returns a new frame, so df keeps all of its rows; with default arguments
#a row is dropped when any of its columns is missing.
drop_missing_dummy = drop_missing_dummy.dropna() #drops rows with missing values

In [5]:
drop_missing_dummy #displays drop_missing_dummy to confirm that the rows with NaNs are gone

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
9,241032,2016-01-13,t,$85.00
10,241032,2016-01-14,t,$85.00
14,241032,2016-01-18,t,$85.00
...,...,...,...,...
1393207,10208623,2016-01-06,t,$87.00
1393208,10208623,2016-01-07,t,$87.00
1393211,10208623,2016-01-10,t,$87.00
1393212,10208623,2016-01-11,t,$87.00


In [6]:
#Prepare Data
def remove_dollar_signs(dollar):
    """Strip '$' and ',' from a price string so it can be cast to float.

    Values that are not strings (for example NaN for a missing price) are
    returned unchanged. The result for a string is still a string; the
    caller is responsible for the numeric conversion.
    """
    if not isinstance(dollar, str):
        return dollar
    # Deleting both characters in one pass is equivalent to two chained replace() calls.
    return dollar.translate(str.maketrans('', '', '$,'))
#Convert the '$1,234.00'-style price strings to floats in both frames.
#drop_missing_dummy is the result of dropna(); assigning a column into it is what
#pandas reports as "A value is trying to be set on a copy of a slice". Building a
#new frame with assign() and rebinding the name gives the same data without
#writing into that intermediate object.
drop_missing_dummy = drop_missing_dummy.assign(
    price=drop_missing_dummy['price'].apply(remove_dollar_signs).astype('float')
)
#df comes straight from read_csv, so a plain column assignment is fine here;
#missing prices stay NaN because remove_dollar_signs passes non-strings through.
df['price'] = df['price'].apply(remove_dollar_signs).astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [7]:
#Prepare Data
drop_missing_dummy['price'].median() #finds median of the price column (rows with missing values were dropped above)

109.0

In [8]:
#Prepare Data
#The fill value is the median of drop_missing_dummy, i.e. of the rows that had no
#missing values; every NaN price in df receives that single value.
df['price'] = df['price'].fillna(drop_missing_dummy['price'].median()) #replaces missing prices with the median price

In [9]:
#Displays df to check that price is numeric and the missing prices were filled.
df

Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,85.0
1,241032,2016-01-05,t,85.0
2,241032,2016-01-06,f,109.0
3,241032,2016-01-07,f,109.0
4,241032,2016-01-08,f,109.0
...,...,...,...,...
1393565,10208623,2016-12-29,f,109.0
1393566,10208623,2016-12-30,f,109.0
1393567,10208623,2016-12-31,f,109.0
1393568,10208623,2017-01-01,f,109.0


In [10]:
#Prepare Data
encode_df = df #NOTE: same object as df (no copy), so every change below is also visible through df
date_index = pd.DatetimeIndex(encode_df['date']) #parses the date strings once; reused for both new columns
encode_df['year'] = date_index.year #year taken from the date column
encode_df['month'] = date_index.month #month number taken from the date column
encode_df.replace(['f','t'], [0,1], inplace = True) #replaces 'f' with 0 and 't' with 1 wherever a cell equals one of them

In [11]:
#Displays the frame with the new year and month columns; encode_df is still the same object as df here.
encode_df

Unnamed: 0,listing_id,date,available,price,year,month
0,241032,2016-01-04,1,85.0,2016,1
1,241032,2016-01-05,1,85.0,2016,1
2,241032,2016-01-06,0,109.0,2016,1
3,241032,2016-01-07,0,109.0,2016,1
4,241032,2016-01-08,0,109.0,2016,1
...,...,...,...,...,...,...
1393565,10208623,2016-12-29,0,109.0,2016,12
1393566,10208623,2016-12-30,0,109.0,2016,12
1393567,10208623,2016-12-31,0,109.0,2016,12
1393568,10208623,2017-01-01,0,109.0,2017,1


In [12]:
#Prepare Data
#drop() returns a new frame, so from this line on encode_df is a separate object
#and df keeps the listing_id, date and available columns.
encode_df = encode_df.drop(columns=['listing_id', 'date', 'available']) #removes the old date column, listing_id column, and available column

In [13]:
#Displays the modelling frame: price is the response, year and month are the remaining columns.
encode_df

Unnamed: 0,price,year,month
0,85.0,2016,1
1,85.0,2016,1
2,109.0,2016,1
3,109.0,2016,1
4,109.0,2016,1
...,...,...,...
1393565,109.0,2016,12
1393566,109.0,2016,12
1393567,109.0,2016,12
1393568,109.0,2017,1


In [14]:
#Data Modeling
def linear_model(encode_df, test_size=.3, rand_state=42):
    """Fit a linear regression that predicts ``price`` from the other columns.

    Parameters
    ----------
    encode_df : pandas.DataFrame
        Frame containing a ``price`` column, used as the response. Every
        other column is used as an explanatory variable.
    test_size : float, default .3
        Forwarded to ``train_test_split`` as ``test_size``.
    rand_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state`` so the split
        is reproducible.

    Returns
    -------
    test_score : float
        R-squared of the fitted model on the test split.
    train_score : float
        R-squared of the fitted model on the training split.
    lm_model : LinearRegression
        The fitted estimator.
    X_train, X_test, y_train, y_test
        The four pieces returned by ``train_test_split``.
    """
    X = encode_df.drop(columns=['price'])
    y = encode_df['price']

    # Forward the caller's arguments instead of fixed literals, so the
    # function's parameters actually control the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=rand_state
    )

    # The normalize= argument was removed from LinearRegression in
    # scikit-learn 1.2 and raises TypeError there. Ordinary least squares
    # predictions do not depend on rescaling the features, so the R-squared
    # values are unaffected by leaving it out.
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)

    # Score the model on both splits
    test_score = r2_score(y_test, lm_model.predict(X_test))
    train_score = r2_score(y_train, lm_model.predict(X_train))

    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test


#Test your function with the above dataset
#The defaults (test_size=.3, rand_state=42) give a 70/30 split seeded with 42.
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = linear_model(encode_df)

In [15]:
#Evaluate Results
#Print training and testing score
#The training score fills the first placeholder and the test score the second.
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

The rsquared on the training data was 0.001785930759599319.  The rsquared on the test data was 0.0019697451271587507.
