# Linear Regression on Airbnb Dataset
This is an Airbnb dataset with 29 columns. The aim of this linear regression is to predict the price of a room (the `log_price` column) from the given features.

# Load Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import seaborn as sns
import statsmodels.api as sm
import statsmodels as statm
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from plotnine import *


# Load File

In [None]:
filedata= pd.read_csv('train.csv')

# Exploratory Data Analysis(EDA)

## File structure and content

In [None]:
filedata.head(5)

In [None]:
len(filedata.columns)

In [None]:
filedata.shape

In [None]:
filedata.info()

In [None]:
# Report the name of every column that contains at least one missing value.
has_missing = filedata.isnull().any()
for column_name in has_missing[has_missing].index:
    print(column_name)

In [None]:
#fills missing data with 0s
#GO BACK TO THIS, 0 may not be best fill for all missing data
filedata=filedata.fillna(0)

In [None]:
#summary stats on each of the numeric columns
filedata.describe()

In [None]:
#check all the statistics
filedata.describe(include='all')

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64','uint8']
numericdataX = filedata.select_dtypes(include=numerics)
x= numericdataX['accommodates']
sns.distplot(x);

In [None]:
x= numericdataX.iloc[:,1]
sns.distplot(x);

In [None]:
ggplot(filedata, aes(x='room_type')) + geom_bar(fill = "red")

In [None]:
ggplot(filedata, aes(x='city')) + geom_bar(fill = "green")

In [None]:
#check categorical data
filedata.describe(include=['O'])

In [None]:
#check numeric data
filedata.describe()

In [None]:
filedata.columns

# Linear Assumptions

## Assumption that data points are linearly distributed

In [None]:
# Linearity check: for each numeric column at positions 1-9, fit a one-feature
# linear regression against log_price and draw the fitted line over a scatter
# of the raw observations.
regressor = linear_model.LinearRegression()
# The target is the same for every feature, so build it once outside the loop.
y = np.array(filedata['log_price']).reshape(-1, 1)
for i in range(1, 10):
    feature_name = numericdataX.columns[i]
    x = np.array(numericdataX.iloc[:, i]).reshape(-1, 1)
    regressor.fit(x, y)
    # One full-size figure per feature. Placing a single axes into a 10x1
    # subplot grid of a fresh figure would squash each plot into a thin strip.
    plt.figure(figsize=(8, 5))
    plt.scatter(x, y, color='blue', alpha=0.1, label='observations')
    plt.plot(x, regressor.predict(x), color="red", label='linear fit')
    plt.xlabel(feature_name)
    plt.ylabel('log_price')
    # The artists above carry labels, so the legend has entries to show.
    plt.legend()

## Q-Q plot


In [None]:
# Q-Q plot of the numeric column at position 6 against a normal distribution,
# with a regression line fitted through the points (line='r').
sm.qqplot(numericdataX.iloc[:, 6], line='r')

In [None]:
# Q-Q plot of the numeric column at position 1 against a normal distribution,
# with a regression line fitted through the points (line='r').
sm.qqplot(numericdataX.iloc[:, 1], line='r')

In [None]:
# Q-Q plot of the numeric column at position 9 against a normal distribution,
# with a regression line fitted through the points (line='r').
sm.qqplot(numericdataX.iloc[:, 9], line='r')

# Linear Regression

### Functions

In [None]:
def checkCorrelation(data):
    """
    Draw an annotated heatmap of the pairwise correlations in ``data``.

    :param data: DataFrame of features; ``data.corr()`` is what gets plotted
    :return: None; the heatmap is drawn on a new 20x20 matplotlib figure
    """
    plt.figure(figsize=(20, 20))
    correlations = data.corr()
    heatmap_options = dict(
        linewidths=0.25,
        vmax=1.0,
        square=True,
        cmap="BuGn_r",
        linecolor='w',
        annot=True,
    )
    sns.heatmap(correlations, **heatmap_options)

In [None]:
#return Model 
def data_model(xdata):
    """
    Fit an ordinary least squares model of ``log_price`` on the given predictors.

    The target is always read from the module-level ``filedata['log_price']``.
    If ``xdata`` is a DataFrame that itself contains a ``log_price`` column,
    that column is removed before fitting so the target is never used to
    predict itself.

    :param xdata: independent variable dataset, row-aligned with ``filedata``
    :return: fitted statsmodels OLS results object
    """
    target_name = 'log_price'

    # Guard against target leakage: regressing log_price on a frame that still
    # holds log_price yields a meaningless "perfect" fit.
    predictors = xdata
    if isinstance(xdata, pd.DataFrame):
        predictors = xdata.drop(columns=[target_name], errors='ignore')

    # statsmodels OLS does not include an intercept unless a constant column is added.
    X = sm.add_constant(predictors)
    y = filedata[[target_name]]

    # Fit the linear model
    return sm.OLS(y, X).fit()

In [None]:
def data_summary(xdata):
    """
    Fit the regression on ``xdata`` and return its summary table.

    :param xdata: independent variable dataset, passed straight to ``data_model``
    :return: the ``summary()`` of the fitted model
    """
    return data_model(xdata).summary()

In [None]:
def crossValidationError(data, cv=5, target='log_price'):
    """
    Estimate the RMSE of a linear regression by k-fold cross validation.

    The mean squared error is computed on each fold, its square root is taken
    per fold, and the per-fold RMSE values are averaged.

    :param data: DataFrame holding the predictor columns and the target column
    :param cv: cross-validation setting forwarded to ``cross_val_score`` (default 5 folds)
    :param target: name of the target column inside ``data``
    :return: float, root mean squared error averaged over the folds
    """
    # Plain float arrays: passing ``dtype=pd.Series`` is not a real dtype and
    # only produced object arrays that sklearn then had to convert back.
    X = np.asarray(data.drop(columns=[target]), dtype=float)
    Y = np.asarray(data[target], dtype=float)
    fold_mse = cross_val_score(
        linear_model.LinearRegression(),
        X,
        Y,
        cv=cv,
        scoring=make_scorer(mean_squared_error),
    )
    return np.sqrt(fold_mse).mean()

In [None]:
#Checking correlation in data
checkCorrelation(numericdataX)

In [None]:
# According to the correlation matrix, latitude, longitude, number_of_reviews and
# review_scores_rating have little impact on log_price (their correlation is poor),
# so drop them, together with id, from the dataset.
numericdataX=numericdataX.drop(['id','number_of_reviews',
       'review_scores_rating','latitude',
       'longitude' ], axis=1)

In [None]:
# buid model and check summary
data_summary(numericdataX)

In [None]:
# bathrooms, bedrooms and beds are also correlated with accommodates,
# so keep only accommodates.
numericdataX = numericdataX.drop(['bathrooms','bedrooms','beds'], axis=1)

In [None]:
# buid model and check summary
data_summary(numericdataX)

In [None]:
crossValidationError(numericdataX)

## Handling Categorical Variables

In [None]:
filedata.room_type.value_counts()

In [None]:
# One-hot encode room_type (dropping its first level) and append the
# resulting indicator columns to the feature frame.
numericdataX = pd.concat(
    [
        numericdataX,
        pd.get_dummies(filedata['room_type'], prefix='room_type', drop_first=True),
    ],
    axis=1,
)

In [None]:
numericdataX

In [None]:
filedata.bed_type.value_counts()

In [None]:
numericdataX=pd.concat([numericdataX,filedata['bed_type']], axis=1)
numericdataX=pd.get_dummies(numericdataX,columns=['bed_type'],drop_first=True)

In [None]:
filedata.cancellation_policy.value_counts()

In [None]:
numericdataX=pd.concat([numericdataX,filedata['cancellation_policy']], axis=1)
numericdataX=pd.get_dummies(numericdataX,columns=['cancellation_policy'],drop_first=True)

In [None]:
filedata.city.value_counts()

In [None]:
numericdataX=pd.concat([numericdataX,filedata['city']], axis=1)
numericdataX=pd.get_dummies(numericdataX,columns=['city'],drop_first=True)

In [None]:
# Show the class balance. The call is not the last expression of the cell,
# so it must be printed explicitly to appear in the output.
print(filedata.instant_bookable.value_counts())
# Append instant_bookable and replace it with indicator columns (first level dropped).
numericdataX=pd.concat([numericdataX,filedata['instant_bookable']], axis=1)
numericdataX=pd.get_dummies(numericdataX,columns=['instant_bookable'],drop_first=True)

# Multicolinearity

In [None]:
checkCorrelation(numericdataX)

In [None]:
data_summary(numericdataX.drop(['log_price'],axis=1))

In [None]:
# Show the category frequencies. The call is not the last expression of the
# cell, so it must be printed explicitly to appear in the output.
print(filedata.property_type.value_counts())
# Append property_type and replace it with indicator columns (first level dropped).
numericdataX=pd.concat([numericdataX,filedata['property_type']], axis=1)
numericdataX=pd.get_dummies(numericdataX,columns=['property_type'],drop_first=True)

In [None]:
data_summary(numericdataX)

In [None]:
crossValidationError(numericdataX)

In [None]:
# P value of bed type has poor P value
numericdataX = numericdataX.loc[:, ~numericdataX.columns.str.startswith('bed_type_')]

In [None]:
data_summary(numericdataX.drop(['log_price'],axis=1))

In [None]:
crossValidationError(numericdataX)

# Interaction Term

In [None]:
filedata.columns

In [None]:
# Main effects (bedrooms, beds, bathrooms) plus their three-way product
# as an interaction term; then inspect the fit on these four columns alone.
interactionDF = filedata[['bedrooms', 'beds', 'bathrooms']].copy()
interactionDF['bed*bathroom*bedrooms'] = (
    interactionDF['bedrooms'] * interactionDF['beds'] * interactionDF['bathrooms']
)
data_summary(interactionDF)

In [None]:
numericdataX= pd.concat([numericdataX,interactionDF],axis=1)
data_summary(numericdataX)

In [None]:
# Review score, review count and their product as an interaction term;
# then inspect the fit on these three columns alone.
interactionDF1 = filedata[['review_scores_rating', 'number_of_reviews']].copy()
interactionDF1['reiew_score*Number'] = (
    interactionDF1['review_scores_rating'] * interactionDF1['number_of_reviews']
)
data_summary(interactionDF1)

In [None]:
numericdataX= pd.concat([numericdataX,interactionDF1],axis=1)

In [None]:
data_summary(numericdataX)

In [None]:
crossValidationError(numericdataX)

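## As we can see above, adding the interaction terms lowers our cross-validation error, and we are getting perfect R2, AIC and BIC values.
## Note that a perfect R2 is a warning sign: it is what OLS reports when the target `log_price` is itself among the predictors, so check the columns passed to `data_summary` before trusting these values.
## This is the model we were looking for.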

## Thank you!

## Let's start with H2O

In [None]:

##import h2o
##from h2o.automl import H2OAutoML
##h2o.init()

In [None]:
numericdataX

## Classification with Trees

## A.Classification with Logistic Regression
## B. Classification with Trees
               1. Bagging based tree algorithm (Random Forest)
               2. Boosting based tree algorithm (GradientBoosting)


## A.Classification with Logistic Regression

In [None]:
filedata

In [None]:
mean_log= np.mean(numericdataX['log_price'])

In [None]:
# Work on an independent copy: the target column is binarised in place in the
# following cells, and a plain assignment would only alias numericdataX and
# overwrite its continuous log_price values.
classificationData = numericdataX.copy()

In [None]:
classificationData.loc[ classificationData['log_price'] <= mean_log, 'log_price'] = 0

In [None]:
classificationData.loc[ classificationData['log_price'] > mean_log, 'log_price'] = 1

In [None]:
classificationDataY= classificationData['log_price']
classificationDataX=classificationData.drop(['log_price'],axis=1)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(classificationDataX, classificationDataY, test_size = 0.2,random_state=0)

In [None]:
classifier= LogisticRegression()

In [None]:
classifier.fit(X_train,y_train)

In [None]:
y_pred = classifier.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name so the imported confusion_matrix
# function is not shadowed and stays callable afterwards.
logistic_confusion = confusion_matrix(y_test, y_pred)
print(logistic_confusion)

6127+ 5905 correct predictions and 1468+1323 incorrect predictions.

Compute precision, recall, F-measure and support

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.

The F-beta score weights the recall more than the precision by a factor of beta. beta = 1.0 means recall and precision are equally important.

The support is the number of occurrences of each class in y_test.

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Interpretation: Of the entire test set, 81% of the price classes were predicted correctly.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

def rocAucCurve(classifier):
    """
    Plot the ROC curve of a fitted binary classifier on the test split.

    Reads the module-level ``X_test`` and ``y_test``. Both the curve and the
    area under it are computed from the predicted probability of class 1, so
    the number in the legend matches the curve that is drawn. The legend is
    labelled with the classifier's class name.

    :param classifier: fitted estimator exposing ``predict_proba``
    :return: None; the figure is saved as 'Log_ROC' and shown
    """
    model_name = type(classifier).__name__
    positive_scores = classifier.predict_proba(X_test)[:, 1]
    # AUC from scores, not from hard 0/1 predictions: labels collapse the curve
    # to a single operating point and understate the area.
    roc_auc = roc_auc_score(y_test, positive_scores)
    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    plt.figure()
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (model_name, roc_auc))
    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

In [None]:
rocAucCurve(classifier)

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. 
The dotted line represents the ROC curve of a purely random classifier;
a good classifier stays as far away from that line as possible (toward the top-left corner).

## B. Classification with Trees
The best way to think about hyperparameters is as the settings of an algorithm that can be adjusted to optimize performance.
Hyperparameter tuning relies more on experimental results than on theory, so the best way to determine the optimal settings is to try many different combinations and evaluate the performance of each model.

Hyperparameters:
  - n_estimators = number of trees in the forest
  - max_features = max number of features considered for splitting a node
  - max_depth = max number of levels in each decision tree
  - min_samples_split = min number of data points placed in a node before the node is split
  - min_samples_leaf = min number of data points allowed in a leaf node
  - bootstrap = method for sampling data points (with or without replacement)

 1. Bagging based tree algorithm (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier  
classifierDT = RandomForestClassifier()  
classifierDT.fit(X_train, y_train) 

In [None]:
y_pred = classifierDT.predict(X_test)  

Evaluating the Algorithm

In [None]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

75% correct Prediction

In [None]:
rocAucCurve(classifierDT)

### Hyper Parameters Tuning

Random Hyperparameter Grid
To use RandomizedSearchCV, we first need to create a parameter grid to sample from during fitting:
    [More Details](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [None]:
#Let us see what default parameters our model used
print('Parameters currently in use:\n')
pprint(classifierDT.get_params())

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4],
    'min_samples_split': [8, 10],
    'n_estimators': [100, 200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
grid_search.best_params_

In [None]:
# Build the tuned forest directly from the grid search's winning combination
# rather than re-typing the values by hand, so it cannot drift from the search result.
random1 = RandomForestClassifier(**grid_search.best_params_)

In [None]:
random1.fit(X_train,y_train)
y_pred = random1.predict(X_test)  
print(confusion_matrix(y_test, y_pred))  

In [None]:
print(classification_report(y_test, y_pred)) 
rocAucCurve(random1)

After tuning the hyperparameters we got an AUC of 0.81, compared with the untuned Random Forest above.

 ### 2. Boosting based tree algorithm (GradientBoosting)
 Let's consider another set of parameters for managing boosting:

- learning_rate
This determines the impact of each tree on the final outcome (step 2.4). GBM works by starting with an initial estimate which is updated using the output of each tree. The learning parameter controls the magnitude of this change in the estimates.
Lower values are generally preferred as they make the model robust to the specific characteristics of tree and thus allowing it to generalize well.
Lower values would require higher number of trees to model all the relations and will be computationally expensive.
- n_estimators
The number of sequential trees to be modeled (step 2)
Though GBM is fairly robust at higher numbers of trees, it can still overfit at some point. Hence, this should be tuned using CV for a particular learning rate.
- subsample
The fraction of observations to be selected for each tree. Selection is done by random sampling.
Values slightly less than 1 make the model robust by reducing the variance.
Typical values ~0.8 generally work fine but can be fine-tuned further.
Apart from these, there are certain miscellaneous parameters which affect overall functionality:

- loss
It refers to the loss function to be minimized in each split.
It can have various values for classification and regression case. Generally the default values work fine. Other values should be chosen only if you understand their impact on the model.
- init
This affects initialization of the output.
This can be used if we have made another model whose outcome is to be used as the initial estimates for GBM.
- random_state
The random number seed so that same random numbers are generated every time.
This is important for parameter tuning. If we don’t fix the random number, then we’ll have different outcomes for subsequent runs on the same parameters and it becomes difficult to compare models.
It can potentially result in overfitting to a particular random sample selected. We can try running models for different random samples, which is computationally expensive and generally not used.
- verbose
The type of output to be printed when the model fits. The different values can be:
  - 0: no output generated (default)
  - 1: output generated for trees in certain intervals
  - greater than 1: output generated for all trees
- warm_start
This parameter has an interesting application and can help a lot if used judiciously.
Using this, we can fit additional trees on previous fits of a model. It can save a lot of time and you should explore this option for advanced applications
- presort 
 Select whether to presort data for faster splits.
It makes the selection automatically by default but it can be changed if needed.




In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

In [None]:
y_pred = gb.predict(X_test)  
print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred)) 
rocAucCurve(gb)

In [None]:
# Sweep the learning rate of a small gradient-boosted classifier and print the
# score on the training split and on the held-out split for each value.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): %.3f" % gb.score(X_train, y_train))
    print("Accuracy score (validation): %.3f" % gb.score(X_test, y_test))
    print()

In [None]:
# Refit the boosted classifier with learning_rate=0.75.
# NOTE(review): this comment previously read "Learning rate 1 is good", which does
# not match the value used below; confirm the intended rate against the sweep results.
gb_op = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.75, max_features=2, max_depth = 2, random_state = 0)
gb_op.fit(X_train,y_train)

In [None]:
y_pred = gb_op.predict(X_test)  
print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred)) 
rocAucCurve(gb_op)

## Comparison of Classification Models
Let's compare precision and AUC:
1. Logistic Regression
  - Precision : 81%
  - AUC : 0.81
2. Random Forest
  - Precision : 81%
  - AUC : 0.81
3. Gradient Boosting 
 - Precision : 77%
 - AUC : 0.77

According to this comparison, Logistic Regression and Random Forest work best on the given data for classification.

## Linear Regression

### Regression with Trees

In [None]:
Y= filedata['log_price']

In [None]:
numericdataX= numericdataX.drop(['log_price'],axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(numericdataX,Y, test_size=0.2, random_state=0)  

In [None]:
from sklearn.ensemble import RandomForestRegressor  
regressor = RandomForestRegressor()  
regressor.fit(X_train, y_train)  

In [None]:
y_pred = regressor.predict(X_test) 

In [None]:
df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})  
df


Evaluating the Algorithm
To evaluate performance of the regression algorithm, the commonly used metrics are mean absolute error, mean squared error, and root mean squared error. The Scikit-Learn library contains functions that can help calculate these values for us. To do so, use this code from the metrics package:


In [None]:
def regression_Metrics(y_test, y_pred):
    """
    Print MAE, MSE and RMSE of the predictions against the true values.

    :param y_test: true target values
    :param y_pred: predicted target values
    :return: None; the three metrics are printed
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print('Mean Absolute Error:', mae)
    print('Mean Squared Error:', mse)
    # RMSE is the square root of the MSE computed above
    print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
regression_Metrics(y_test,y_pred)

## Hyper Parameter Tuning

In [None]:
#Let us see what default parameters our model used
print('Parameters currently in use:\n')
pprint(regressor.get_params())

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4],
    'min_samples_split': [8, 10],
    'n_estimators': [100, 200]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Build the tuned forest from the grid search's winning combination. The
# previously hand-typed min_samples_leaf=10 was not among the searched values.
regressor1 = RandomForestRegressor(**grid_search.best_params_)

In [None]:
regressor1.fit(X_train,y_train)
y_pred = regressor1.predict(X_test) 

In [None]:
regression_Metrics(y_test,y_pred)

## Boosting based tree algorithm (GradientBoosting)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)

In [None]:
y_pred = gb.predict(X_test)  

In [None]:
regression_Metrics(y_test,y_pred)

In [None]:
# Sweep the learning rate of a small gradient-boosted regressor. For a
# regressor, score() reports the coefficient of determination (R^2), not an
# accuracy, so the printed labels say so.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingRegressor(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("R^2 score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("R^2 score (validation): {0:.3f}".format(gb.score(X_test, y_test)))
    print()

In [None]:
gb_op = GradientBoostingRegressor(n_estimators=20, learning_rate = 1, max_features=2, max_depth = 2, random_state = 0)
gb_op.fit(X_train,y_train)
y_pred = gb_op.predict(X_test)

In [None]:
regression_Metrics(y_test,y_pred)

## Comparison of different Approaches:

1.  Linear Regression:
Root Mean Squared Error: 0.4721355501041085

2.  Regression with trees:
    - Random Forest
        -  Mean Absolute Error: 0.3681282451600619
        - Mean Squared Error: 0.2388446162343491
        - Root Mean Squared Error: 0.48871731730556583

    -  Boosting 
         - Mean Absolute Error: 0.39340469697589975
         - Mean Squared Error: 0.2670819491835599
         - Root Mean Squared Error: 0.5167997186372685



According to this comparison, plain linear regression performs best on the given data (it has the lowest root mean squared error).