### Importing the libraries and datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import skew,norm
from scipy.stats.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test2 = pd.read_csv('sample_submission.csv')

In [2]:
train.fillna(method='ffill', inplace=True)
test.fillna(method='ffill', inplace=True)

In [3]:
#Creating a dataframe containing only numeric columns
train = train.select_dtypes(include = ['number'])

In [4]:
#Creating a dataframe containing only numeric columns
test = test.select_dtypes(include = ['number'])

In [5]:
y_train = train.iloc[:,-1].values

In [6]:
train.drop("SalePrice", axis = 1, inplace = True)

In [7]:
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [8]:
# Feature Scaling the dataframe
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train1 = sc.fit_transform(train)
test1 = sc.fit_transform(test)
y_train1 = sc.fit_transform(y_train.reshape(-1,1))

In [9]:
#checking for infinite values
np.all(np.isfinite(train1))

True

In [10]:
#Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 5)
dataset_pca = pca.fit_transform(train1)
testset_pca = pca.fit_transform(test1)
explained_variance = pca.explained_variance_ratio_

In [11]:
type(dataset_pca)

numpy.ndarray

### Using a Simple Linear Regession model to predict

In [12]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(dataset_pca,y_train1)

# Predicting the Test set results
y_pred = reg.predict(testset_pca)
y_pred = sc.inverse_transform(y_pred)

### Using a Polynomial Linear Regession model

In [13]:
# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 3)
train_poly = poly_reg.fit_transform(dataset_pca)
poly_reg.fit(train_poly, y_train1)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(train_poly, y_train)

# Predicting the Test set results
y_pred2 = lin_reg_2.predict(poly_reg.fit_transform(testset_pca))
y_pred2 = sc.inverse_transform(y_pred2)

### Using Support Vector Regression

In [14]:
# Fitting SVR to the dataset
from sklearn.svm import SVR
reg2 = SVR(kernel = 'rbf')
reg2.fit(dataset_pca, y_train1)

# Predicting a new result
y_pred3 = reg2.predict(testset_pca)
y_pred3 = sc.inverse_transform(y_pred3)

### Using Random Forest Regression

In [15]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
reg3 = RandomForestRegressor(n_estimators = 10, random_state = 0)
reg3.fit(dataset_pca, y_train1)

# Predicting a new result
y_pred4 = reg3.predict(testset_pca)
y_pred4 = sc.inverse_transform(y_pred4)

### Exporting the predicted values into CSV Files

In [16]:
id = np.arange(1461,2920).reshape(1459,1)
y_pred3 = y_pred3.reshape(1459,1)
y_pred4 = y_pred4.reshape(1459,1)
combined = np.concatenate((id, y_pred),axis=1)
#combined2 = np.concatenate((id, y_pred2),axis=1)
combined3 = np.concatenate((id, y_pred3),axis=1)
combined4 = np.concatenate((id, y_pred4),axis=1)


In [181]:
combined4

array([[  1461. , 122900. ],
       [  1462. , 176840. ],
       [  1463. , 189120.2],
       ...,
       [  2917. , 169130. ],
       [  2918. , 124910. ],
       [  2919. , 186490. ]])

In [182]:
df1 = pd.DataFrame(data=combined)
# df2 = pd.DataFrame(data=combined2)
df3 = pd.DataFrame(data=combined3)
df4 = pd.DataFrame(data=combined4)
df1.rename(columns={0:'Id',1:'SalePrice'}, inplace = True)
df3.rename(columns={0:'Id',1:'SalePrice'}, inplace = True)
df4.rename(columns={0:'Id',1:'SalePrice'}, inplace = True)

In [183]:
df4.head()

Unnamed: 0,Id,SalePrice
0,1461.0,122900.0
1,1462.0,176840.0
2,1463.0,189120.2
3,1464.0,195320.0
4,1465.0,165449.0


In [185]:
df4.to_csv("submission4.csv",index=False)

In [184]:
df4.Id = df4.Id.astype('int32')