**Importing Libraries**

In [1]:
import seaborn as sns#For data visualisation 

#GPU Libraries
import cudf as pd
import cupy as cp
import cuml
from cuml import LinearRegression
from cuml.linear_model import LinearRegression
from cuml import Ridge
from cuml.linear_model import Ridge
from cuml.model_selection import train_test_split
from cuml.linear_model import Lasso
from cuml.ensemble import RandomForestRegressor

**Loading Data**

In [2]:
# Read the competition's train and test CSV files (`pd` is the cuDF alias imported above).
train_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

In [4]:
# Dimensions of the test frame as (rows, columns).
test_df.shape

In [5]:
# Preview the first rows of the training data.
train_df.head()

In [6]:
# Preview the first rows of the test data.
test_df.head()

In [7]:
# Summary statistics for the training columns.
train_df.describe()

In [8]:
# Summary statistics for the test columns.
test_df.describe()

In [9]:
# Print the column overview (names and dtypes) of the training frame.
train_df.info()

In [10]:
# Print the column overview (names and dtypes) of the test frame.
test_df.info()

In [11]:
# Remember how many rows each dataset has; these counts are used later to
# separate the frames again after they have been concatenated.
num_train = len(train_df)
num_test = len(test_df)

In [12]:
# Number of rows in the training data.
num_train

In [13]:
# Number of rows in the test data.
num_test

**Data Preprocessing**

In [14]:
# Separate the regression target from the predictors: keep SalePrice as Y
# and continue with a training frame that no longer contains it.
Y = train_df['SalePrice']
train_df = train_df.drop(columns=['SalePrice'])

In [15]:
# Per-column count of missing values in the training data
# (the test data is counted in the following cell).
train_df.isnull().sum()

In [16]:
# Per-column count of missing values in the test data.
test_df.isnull().sum()

In [17]:
# Visualise where the training data has nulls: convert the cuDF frame to
# pandas and draw its boolean null mask as a heatmap.
train1 = train_df.to_pandas()
sns.heatmap(train1.isnull())

In [18]:
# Same null-mask heatmap for the test data (converted to pandas first).
test1 = test_df.to_pandas()
sns.heatmap(test1.isnull())

In [19]:
# Split the training columns by dtype: "object" columns are treated as
# categorical, everything else as numerical, and report how many of each exist.
column_dtypes = train_df.dtypes
is_object_column = column_dtypes == "object"

numerical_feats = column_dtypes[~is_object_column].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = column_dtypes[is_object_column].index
print("Number of Categorical features: ", len(categorical_feats))

In [20]:
# Print the value distribution of every categorical feature,
# with a row of '#' characters separating consecutive features.
separator = '#'*50
for catg in categorical_feats:
    counts = train_df[catg].value_counts()
    print(counts)
    print(separator)

**Handling Null Values** - The dataset has a large number of missing values, which are handled as follows:

1. Drop all features that have more than 50% null values.
2. Fill categorical features with their mode.
3. Fill numeric features with their mean.

In [21]:
# Distribution of LotFrontage values in the training data before imputation.
train_df['LotFrontage'].value_counts()

In [22]:
# Impute missing LotFrontage entries in the training data with the column mean,
# then show the resulting value distribution.
train_lot_frontage_mean = train_df['LotFrontage'].mean()
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(train_lot_frontage_mean)
train_df['LotFrontage'].value_counts()

In [23]:
# Impute missing LotFrontage entries in the test data with the test column's
# own mean, then show the resulting value distribution.
test_lot_frontage_mean = test_df['LotFrontage'].mean()
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(test_lot_frontage_mean)
test_df['LotFrontage'].value_counts()

In [24]:
# Impute missing values in the training data.
#
# Columns in `train_mode_cols` are filled with their most frequent value and
# columns in `train_mean_cols` with their mean. The lists replace one
# hand-written statement per column; columns, order and fill strategy are the
# same as before.
# NOTE(review): a few mode-filled columns (e.g. MasVnrArea, BsmtFullBath,
# BsmtHalfBath) look numeric rather than categorical — confirm mode is the
# intended strategy for them.
train_mode_cols = [
    'BsmtCond', 'BsmtQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'MasVnrType', 'MasVnrArea', 'BsmtExposure',
    'BsmtFinType2', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinType1',
    'SaleType', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional',
]
train_mean_cols = [
    'GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF',
]

for col in train_mode_cols:
    # mode() returns a Series; its first entry is used as the fill value.
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

for col in train_mean_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mean())

In [25]:
# Impute missing values in the test data, using statistics computed on the
# test data itself.
#
# Columns in `test_mode_cols` are filled with their most frequent value and
# columns in `test_mean_cols` with their mean. The lists replace one
# hand-written statement per column; columns, order and fill strategy are the
# same as before. MSZoning is filled here but not in the training cell.
# NOTE(review): a few mode-filled columns (e.g. MasVnrArea, BsmtFullBath,
# BsmtHalfBath) look numeric rather than categorical — confirm mode is the
# intended strategy for them.
test_mode_cols = [
    'BsmtCond', 'BsmtQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'MasVnrType', 'MasVnrArea', 'BsmtExposure',
    'BsmtFinType2', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
    'BsmtFinType1', 'SaleType', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual',
    'Functional',
]
test_mean_cols = [
    'GarageCars', 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF',
]

for col in test_mode_cols:
    # mode() returns a Series; its first entry is used as the fill value.
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

for col in test_mean_cols:
    test_df[col] = test_df[col].fillna(test_df[col].mean())


In [26]:
# Remove the columns judged too sparse to be useful, together with Id,
# which carries no predictive information.
train_dropped_cols = ['Alley','PoolQC','Fence','MiscFeature','GarageYrBlt','Id']
train_df = train_df.drop(columns=train_dropped_cols)

In [27]:
# Drop the same set of columns from the test data so both frames keep matching features.
test_dropped_cols = ['Alley','PoolQC','Fence','MiscFeature','GarageYrBlt','Id']
test_df = test_df.drop(columns=test_dropped_cols)

In [28]:
# Training frame dimensions after the column drop.
train_df.shape

In [29]:
# Test frame dimensions after the column drop.
test_df.shape

In [30]:
# Re-draw the null-mask heatmap for the training data now that the imputation
# and column drops are done. `train1` (a pandas copy) is reused by a later cell.
train1 = train_df.to_pandas()
sns.heatmap(train1.isnull(), cmap='plasma')

In [31]:
# Re-draw the null-mask heatmap for the test data after imputation and column drops.
test1 = test_df.to_pandas()
sns.heatmap(test1.isnull(), cmap='plasma')

In [32]:
# Per-column count of nulls still present in the training data.
train_df.isnull().sum()

In [33]:
# True if at least one null is still present anywhere in the training data.
train_df.isnull().any().any()


In [34]:
# List the columns that still contain nulls. This reads `train1`, the pandas
# copy of the training data created for the heatmap above.
train1.columns[train1.isnull().any()].tolist()

In [35]:
# Fill every remaining null in the training data with the value from the
# previous row (forward fill / 'pad').
# NOTE(review): a null in the very first row has no previous value to copy —
# the check in the next cell confirms whether any null survived.
train_df=train_df.fillna(method ='pad')

In [36]:
# Confirm whether the training data still contains any null after the forward fill.
train_df.isnull().any().any()

In [37]:
# Confirm whether the test data still contains any null.
test_df.isnull().any().any()

In [38]:
# Stack the training rows on top of the test rows so that the categorical
# features of both sets get one-hot encoded with an identical set of columns.
frames_to_stack = [train_df, test_df]
final_df = pd.concat(frames_to_stack, axis=0)

In [39]:
# Dimensions of the combined train + test frame.
final_df.shape

In [40]:
# Preview the combined frame before encoding.
final_df.head()

In [41]:
# One-hot encode the categorical columns of the combined frame.
final_df=pd.get_dummies(final_df)

In [42]:
# Dimensions of the combined frame after one-hot encoding.
final_df.shape

In [43]:
# Preview the encoded frame.
final_df.head()

In [44]:
# Split the encoded frame back into its training and test parts.
# final_df was built as [train rows, test rows], so the first `num_train` rows
# are the training data and everything from position `num_train` onward is the
# test data. (Slicing the test part from `num_test+1` only lands on that
# boundary when the training set has exactly one more row than the test set.)
train_df = final_df.iloc[:num_train,:]
test_df = final_df.iloc[num_train:,:]

In [45]:
# Dimensions of the encoded training frame.
train_df.shape

In [46]:
# Dimensions of the encoded test frame.
test_df.shape

In [47]:
# X is the feature matrix used for modelling (the encoded training frame).
X=train_df

In [48]:
# Dimensions of the feature matrix.
X.shape

In [49]:
# Length of the target vector; it should match the number of rows in X.
Y.shape

**Model Creation-Linear Regression**

1) With Train Test Split

In [50]:
# Hold out 29% of the labelled rows for evaluation; the fixed random_state
# is passed so the split uses a set seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.29,
    random_state=42,
)

In [51]:
# Null check on the full feature matrix.
X.isnull().any().any()

In [52]:
# Null check on the full target vector.
Y.isnull().any().any()

In [53]:
# Null check on the training split of the features.
X_train.isnull().any().any()

In [54]:
# Null check on the hold-out split of the features.
X_test.isnull().any().any()

In [55]:
# Null check on the training split of the target.
y_train.isnull().any().any()

In [56]:
# Null check on the hold-out split of the target.
y_test.isnull().any().any()

In [57]:
# Create a linear regression model (intercept fitted, no normalisation,
# 'svd-jacobi' solver) and fit it.
# NOTE(review): the model is fitted on the full X/Y, not on X_train/y_train
# from the split above, so it has also seen the X_test rows — confirm this is
# intended (e.g. to use all labelled data for the submission).
linreg = LinearRegression(fit_intercept = True, normalize = False, algorithm = 'svd-jacobi')
split=linreg.fit(X,Y)

In [58]:
# Report R-squared of the fitted linear model on the training split,
# with features and target both cast to float32 before scoring.
train_features_f32 = X_train.astype('float32')
train_target_f32 = y_train.astype('float32')
train_r2 = linreg.score(train_features_f32, train_target_f32)
print("R-Squared Value for Training Set: {:.3f}".format(train_r2))

In [59]:
# Dimensions of the frame that will be passed to predict().
test_df.shape

In [60]:
# Predict with the fitted linear model on the competition test data.
y_pred_split=linreg.predict(test_df)

In [61]:
# Display the linear-regression predictions.
y_pred_split

In [62]:
# Number of predictions produced.
y_pred_split.shape

In [63]:
# Four of the five linear regression algorithms are evaluated below.
algorithm = ['svd', 'eig', 'svd-qr', 'svd-jacobi']

# The fifth algorithm, 'qr', is left out: running it fails with an error saying it cannot handle missing values, even though the earlier cells found no missing values in the dataset and all the other algorithms run.

In [64]:
# Evaluation metrics for each algorithm listed in `algorithm`.
# Every model is fitted on the training split and scored on the hold-out split.
# MSE and MAE are computed on float64 values so that the predictions are not
# truncated to whole numbers before the errors are measured; the R2 score
# keeps its float32 cast.
for i in algorithm:
    print("Algorithm:")
    print(i)
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = i)
    reg = lr.fit(X_train,y_train)
    preds = reg.predict(X_test)
    y_test_f64 = y_test.astype('float64')
    preds_f64 = preds.astype('float64')
    print("MSE:")
    print(cuml.metrics.regression.mean_squared_error(y_test_f64,preds_f64))
    print("R2 Score:")
    print(cuml.metrics.regression.r2_score(y_test.astype('float32'),preds.astype('float32')))
    print("MAE:")
    print(cuml.metrics.regression.mean_absolute_error(y_test_f64,preds_f64))



In [65]:
# Load the sample submission file; it is reused as the template for every
# submission written below. Note this path is relative, unlike the absolute
# paths used when loading the train/test data.
sample_sub = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_sub.head()

In [66]:
# SalePrice column of the template before it is overwritten.
sample_sub['SalePrice']

In [67]:
# Overwrite the template's SalePrice with the linear-regression predictions
# and write the result as the first submission file.
sample_sub['SalePrice'] = y_pred_split

sample_sub.to_csv('submission1.csv', index=False)


In [68]:
# SalePrice column after it has been replaced with predictions.
sample_sub['SalePrice']

**Optional**

**1)Ridge Regression**

In [69]:
# Ridge regression with the "eig" solver and a regularisation strength of 1e-5,
# fitted on the full X/Y and used to predict the competition test data.
alpha = cp.array([1e-5])
ridge1 = Ridge(alpha = alpha, fit_intercept = True, normalize = False,
solver = "eig")
model1=ridge1.fit(X,Y)
y_pred_ridge1=model1.predict(test_df)
    

In [70]:
# Display the predictions of the "eig" ridge model.
y_pred_ridge1

In [71]:
# Ridge regression with the "svd" solver and the same regularisation strength (1e-5),
# fitted on the full X/Y and used to predict the competition test data.
alpha = cp.array([1e-5])
ridge2 = Ridge(alpha = alpha, fit_intercept = True, normalize = False,
solver = "svd")
model2=ridge2.fit(X,Y)
y_pred_ridge2=model2.predict(test_df)

In [72]:
# Display the predictions of the "svd" ridge model.
y_pred_ridge2

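There is another ridge regression solver, tried below under the name `cvd`, but it fails with an error saying that solver `cvd` is not supported. (cuML documents the coordinate-descent solver as `cd`, so `cvd` is probably a misspelling of that name.)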

In [73]:
# Disabled experiment: a third ridge model using solver "cvd", which fails
# because that solver name is not supported.
# NOTE(review): cuML's Ridge documentation names the coordinate-descent solver
# "cd"; "cvd" is presumably a typo for it — confirm before re-enabling.
# alpha = cp.array([1e-5])
# ridge3 = Ridge(alpha = alpha, fit_intercept = True, normalize = False,
# solver = "cvd")
# model3=ridge3.fit(X,Y)
# y_pred_ridge3=model3.predict(test_df)

In [74]:
# Disabled together with the ridge3/model3 experiment: `model3` only exists
# if the commented-out cell before this one is re-enabled.
# print("R-Squared Value for Training Set: {:.3f}".format(model3.score(X.astype('float32'), Y.astype('float32'))))

In [75]:
# Submission: ridge regression with the "eig" solver.
sample_sub['SalePrice'] = y_pred_ridge1

sample_sub.to_csv('submission2.csv', index=False)


In [76]:
# Submission: ridge regression with the "svd" solver.
sample_sub['SalePrice'] = y_pred_ridge2

sample_sub.to_csv('submission3.csv', index=False)


**2)Lasso Regression**

In [77]:
# Lasso regression with alpha = 0.1, fitted on the full X/Y and used to
# predict the competition test data.
ls = Lasso(alpha = 0.1)
result_lasso = ls.fit(X, Y)
y_pred_lasso=result_lasso.predict(test_df)

In [78]:
# Display the lasso predictions.
y_pred_lasso

In [79]:
# Submission: lasso regression.
sample_sub['SalePrice'] = y_pred_lasso

sample_sub.to_csv('submission4.csv', index=False)


Trying RandomForestRegressor purely to improve model accuracy for the Kaggle competition (not part of the assignment).

In [80]:
# Random forest with 250 trees, fitted on the full X/Y cast to float32.
# A fixed random_state (the same seed as the train/test split) is passed so
# that re-running the notebook does not build a differently seeded forest.
# NOTE(review): the cuML documentation says the seed alone may not guarantee
# bit-identical results — confirm whether n_streams=1 is also wanted.
forest_regressor = RandomForestRegressor(n_estimators = 250, random_state = 42)
forest_regressor.fit(X.values.astype('float32'), Y.astype('float32'))

In [81]:
# Predict with the random forest on the competition test data.
# NOTE(review): the model was fitted on float32 values while test_df is passed
# without a cast — presumably the library converts it; confirm.
y_pred_random= forest_regressor.predict(test_df)

In [82]:
# Display the random-forest predictions.
y_pred_random

In [83]:
# Report R-squared of the random forest on the full training data,
# with features and target both cast to float32 before scoring.
all_features_f32 = X.astype('float32')
all_target_f32 = Y.astype('float32')
forest_train_r2 = forest_regressor.score(all_features_f32, all_target_f32)
print("R-Squared Value for Training Set: {:.3f}".format(forest_train_r2))

In [84]:
# Submission: random forest.
sample_sub['SalePrice'] = y_pred_random

sample_sub.to_csv('submission5.csv', index=False)
