In [1]:
# Import modules: numerical/dataframe tooling, plotting libraries, and the
# scikit-learn pieces used below (splitting, linear models, R^2 metric,
# polynomial feature expansion and pipelines).
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation or
# convergence warnings raised by later cells will not be shown either.
warnings.filterwarnings('ignore')

### Load the dataset

- Load the train data and, using your knowledge of pandas, explore its statistical properties, such as the correlation between the columns of the dataset.

In [2]:
# Code starts here
# Read the training set from the working directory.
train = pd.read_csv('train.csv')

# Preview the first rows (head() returns five rows by default) and the shape.
first_rows = train.head()
print(first_rows)
print("The shape of the data is:", train.shape)

# Per-column summary statistics.
print("Statistical properties of the data is:")
summary_stats = train.describe()
print(summary_stats)

# Pairwise correlation between the columns.
print("Correlation of the data is:")
correlation_matrix = train.corr()
print(correlation_matrix)
# Code ends here.

     Id  Rooms  Type    Price  Method  SellerG  Distance  Postcode  Bathroom  \
0   124      3     2   995000       1      135       6.3      3143         2   
1  5905      2     0  1080000       1      155       2.0      3066         1   
2  5963      3     0   725000       1      196      17.9      3082         2   
3  3521      4     0  1330000       1      251       8.0      3016         3   
4  3738      3     0   620000       1      221      12.4      3060         1   

   Car  Landsize  BuildingArea  YearBuilt  CouncilArea  Longtitude  \
0    2         0         100.0       1998           26   145.01600   
1    0         0          94.0       1890           31   144.98863   
2    2       602         129.0       1980           29   145.06912   
3    2       217         266.0       2003           10   144.87960   
4    2       604         190.0       1970           23   144.96860   

   Regionname  Propertycount  
0           5           4836  
1           2           4553  
2    

## Model building

- Separate the features and target and then split the train data into train and validation set.
- Apply different models of your choice, then predict on the validation data and compute the `r2_score` for this prediction (the target `Price` is continuous, so a regression metric is used rather than `accuracy_score`).
- Try improving upon the `r2_score` using different regularization techniques.

In [3]:
# Code starts here
# Target is the 'Price' column; every other column is used as a feature.
# NOTE(review): 'Id' stays in the feature matrix here, and the test-set cell
# predicts on a frame that also contains 'Id' -- keep the two in sync.
y = train['Price']
x = train.drop(columns='Price')

# Hold out 10% of the rows for validation; the seed is fixed for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=8,
)

# Baseline: ordinary least-squares linear regression.
regressor = LinearRegression().fit(x_train, y_train)
y_pred = regressor.predict(x_val)

# R^2 of the baseline on the validation split.
r2 = r2_score(y_true=y_val, y_pred=y_pred)
print("r2 for linear regression is:", r2)

# Code ends here.

r2 for linear regression is: 0.7202581063053639


In [4]:
# Lasso regression with default constructor arguments, fitted on the
# training split.
lasso = Lasso().fit(x_train, y_train)

# Predict on the validation split and report the R^2 score.
lasso_pred = lasso.predict(x_val)
r2_lasso = r2_score(y_true=y_val, y_pred=lasso_pred)
print(r2_lasso)

0.7202576425557382


In [5]:
# Ridge regression with default constructor arguments, fitted on the
# training split.
ridge = Ridge().fit(x_train, y_train)

# Predict on the validation split and report the R^2 score.
ridge_pred = ridge.predict(x_val)
r2_ridge = r2_score(y_true=y_val, y_pred=ridge_pred)
print(r2_ridge)

0.7201192177706333


In [6]:
# Pipeline: expand the features to degree-2 polynomial terms, then fit an
# ordinary linear regression on the expanded features.
polynomial_model = make_pipeline(
    PolynomialFeatures(degree=2),
    LinearRegression(),
)

# Train the whole pipeline on the training split.
polynomial_model.fit(x_train, y_train)

# Evaluate on the validation split with R^2.
y_pred_poly = polynomial_model.predict(x_val)
r2_poly = r2_score(y_true=y_val, y_pred=y_pred_poly)
print(r2_poly)

0.7832482039057346


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [9]:
# Code starts here
# Load the unlabeled test set and report its dimensions.
test = pd.read_csv('test.csv')
print(test.shape)

# Keep the identifiers so they can be written next to the predictions.
Id = test['Id']

# Applying the same transformation on test data: feed the model exactly the
# columns it was fitted on, in the same order. Selecting by x_train.columns
# raises a KeyError if test.csv is missing a training feature, instead of
# silently predicting on misaligned columns.
test_features = test[x_train.columns]

# Predict on the test data using the polynomial pipeline, which had the
# highest validation R^2 of the models tried in the cells shown here.
y_pred_test = polynomial_model.predict(test_features)

# Create a sample submission file
final_submission = pd.DataFrame({'Id': Id, 'Price': y_pred_test})

# Round predicted prices to the nearest whole number before the integer cast;
# a bare astype(int) would truncate toward zero and bias every price down.
final_submission['Price'] = final_submission['Price'].round().astype(int)
final_submission['Id'] = final_submission['Id'].astype(int)

# Convert the sample submission file into a csv file
final_submission.to_csv('final_submission.csv', index=False)

# Code ends here.

(2049, 16)
