## **Importing the libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## **Loading the dataset and collecting information**

In [None]:
# Read the competition's training and test CSV files into DataFrames.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
# Preview the first five rows instead of displaying the entire test DataFrame.
df_test.head()

In [None]:
# Preview the first five rows instead of displaying the entire training DataFrame.
df_train.head()

In [None]:
# Show the training column labels, then how many of them there are.
train_columns = df_train.columns
print(train_columns)
print(len(train_columns), "fetures present in training dataset")


In [None]:
# Show the test column labels, then how many of them there are.
test_columns = df_test.columns
print(test_columns)
print(len(test_columns), "fetures present in testing dataset")


In [None]:
# Report the (rows, columns) shape of each dataset, test first and then train.
for frame, label in (
    (df_test, "shape of testing dataset"),
    (df_train, "shape of training dataset"),
):
    print(frame.shape, label)

In [None]:
# Print a concise summary of the training data: dtypes and non-null counts per column.
df_train.info()

In [None]:
# Print a concise summary of the test data: dtypes and non-null counts per column.
df_test.info()

In [None]:
# Descriptive statistics of the training data.
df_train.describe()

## Data Pre-Processing

Check whether the datasets contain duplicate rows.

In [None]:
# Number of rows in the training data that are exact duplicates of an earlier row.
df_train.duplicated().sum()

In [None]:
# Number of rows in the test data that are exact duplicates of an earlier row.
df_test.duplicated().sum()

Check whether the datasets have null values.

In [None]:
# Per-column null counts for the training data, a three-line separator,
# then the per-column null counts for the test data.
print(df_train.isnull().sum())
print('\n'.join(['-----------'] * 3))
print(df_test.isnull().sum())

Sort the columns by their number of null values, highest first.

In [None]:
# Null count per training column, largest first, printed as a Markdown table.
train_null_counts = df_train.isnull().sum().sort_values(ascending=False)
dataframe = train_null_counts.to_frame()
print(dataframe.to_markdown())

In [None]:
# Null count per test column, largest first, printed as a Markdown table.
test_null_counts = df_test.isnull().sum().sort_values(ascending=False)
dataframe_test = test_null_counts.to_frame()
print(dataframe_test.to_markdown())

In [None]:
# Percentage of missing values in each training column.
n_train_rows = df_train.shape[0]
null = df_train.isnull().sum() / n_train_rows * 100
null

In [None]:
# Training null percentages, largest first, printed as a Markdown table.
dataframe_null = null.sort_values(ascending=False).to_frame()
print(dataframe_null.to_markdown())

Drop the columns in which more than 50% of the values are null.


In [None]:
# Labels of the training columns that are more than 50% null, and a new
# training frame without those columns (df_train itself is left untouched).
mostly_null_mask = null > 50
col_to_drop = null[mostly_null_mask].keys()
train_df = df_train.drop(columns=col_to_drop)


In [None]:
# Percentage of missing values in each test column.
n_test_rows = df_test.shape[0]
null_test = df_test.isnull().sum() / n_test_rows * 100
null_test

In [None]:
# Test null percentages, largest first, printed as a Markdown table.
dataframe_null_test = null_test.sort_values(ascending=False).to_frame()
print(dataframe_null_test.to_markdown())

In [None]:
# Labels of the test columns that are more than 50% null.
# NOTE(review): col_to_drop_test is computed here but not used by the drop
# below, which removes col_to_drop (the list derived from the training data).
# Dropping the training list keeps test_df's columns aligned with train_df;
# confirm that this is the intended behavior.
col_to_drop_test= null_test[null_test>50].keys()
test_df=df_test.drop(col_to_drop, axis=1)

In [None]:
# Number of columns left in the training frame after the drop.
len(train_df.columns)

In [None]:
# Number of columns left in the test frame after the drop.
len(test_df.columns)

In [None]:
# Labels of the training columns that still contain at least one null value.
train_df.columns[train_df.isnull().any()]

In [None]:
# How many training columns still contain at least one null value.
len(train_df.columns[train_df.isnull().any()])

# Checking the skew of each column to decide how to fill its null values

In [None]:
# Skewness of each numeric training column. numeric_only=True skips the
# object (string) columns; without it, pandas >= 2.0 raises on them because
# the default no longer drops non-numeric columns silently.
train_df.skew(numeric_only=True)

# Fill the missing values

In [None]:
# Replacement value for each numeric column that has nulls: the column mean
# for LotFrontage and MasVnrArea, the column median for GarageYrBlt.
numeric_fill_values = {
    'LotFrontage': train_df['LotFrontage'].mean(),
    'MasVnrArea': train_df['MasVnrArea'].mean(),
    'GarageYrBlt': train_df['GarageYrBlt'].median(),
}
for column, fill_value in numeric_fill_values.items():
    train_df[column] = train_df[column].fillna(fill_value)

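For numerical data:

*   If the skew value is positive, use the mean to fill the missing values.
*   If the skew value is negative, use the median to fill the missing values.
*   If the skew value is zero, use either the mean or the median to fill the missing values.

Note: a common alternative rule of thumb is to prefer the median whenever a column is noticeably skewed in either direction, because the mean is pulled toward the long tail.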

              

In [None]:
# Categorical columns whose nulls are replaced by the column's most frequent
# value (the first entry returned by mode()).
categorical_cols_to_fill = [
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
]

for col in categorical_cols_to_fill:
    # The >50%-null drop earlier in the notebook is data-driven, so a column
    # listed here may already have been removed; skip it instead of raising
    # a KeyError.
    if col not in train_df.columns:
        continue
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])

For categorical columns, use the mode (the most frequent value) to fill the missing values.




In [None]:
# Total number of null cells remaining in the training frame
# (expected to be 0 once the imputation cells have run).
train_df.isnull().values.sum()

In [None]:
# Fill every null in the test frame with its column's most frequent value
# (the first row of mode() holds one mode per column).
test_column_modes = test_df.mode().iloc[0]
test_df = test_df.fillna(test_column_modes)

In [None]:
# Total number of null cells remaining in the test frame.
test_df.isnull().values.sum()

## **Correlation**

In [None]:
# Pairwise correlation of the numeric training columns. train_df still holds
# object (string) columns at this point, and pandas >= 2.0 raises if corr()
# is called on them, so the numeric columns are selected explicitly.
train_df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric training columns.
# The numeric columns are selected explicitly because corr() raises on
# object (string) columns in pandas >= 2.0.
plt.figure(figsize=(18,16))
sns.heatmap(train_df.select_dtypes(include='number').corr(),annot=True,linewidth=.7,)

In [None]:
# Correlation matrix of the numeric training columns only; corr() raises on
# object (string) columns in pandas >= 2.0, so they are filtered out first.
corr = train_df.select_dtypes(include='number').corr()

# Labels of every column whose absolute correlation with SalePrice exceeds
# 0.50. SalePrice itself is part of this index and is removed later, when
# the feature matrix is built.
high_corr_features = corr.index[abs(corr['SalePrice']) > 0.50]
print(f'highly correlated feature:\n', high_corr_features)
print(f'No. of highly correlated features:', len(high_corr_features))

In [None]:
# Annotated correlation heatmap restricted to the highly correlated features.
high_corr_matrix = train_df[high_corr_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(high_corr_matrix, annot=True, linewidth=2)

# Convert categorical data into numerical data

In [None]:
# One-hot encode the categorical training columns. drop_first=True omits the
# first level of each categorical column. Note: this rebinds train_df.
train_df=pd.get_dummies(train_df,drop_first=True)
print(f'Train shape: {train_df.shape}')

In [None]:
# One-hot encode the categorical test columns. drop_first=True omits the
# first level of each categorical column. Note: this rebinds test_df.
# NOTE(review): train and test are encoded separately, so their dummy columns
# may differ; only the columns selected later need to exist in both.
test_df=pd.get_dummies(test_df,drop_first=True)
print(f'Test shape: {test_df.shape}')

In [None]:
# Number of object-dtype columns left in the train and test frames (both are
# expected to be 0 after one-hot encoding). A cell only displays its last
# expression, so the two counts are returned together as a (train, test) tuple.
(
    len(train_df.select_dtypes(include='object').columns),
    len(test_df.select_dtypes(include='object').columns),
)

## **Data Standardization and Splitting the Data**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: the highly correlated columns without the target itself.
feature_cols = high_corr_features.drop('SalePrice')
X = train_df[feature_cols]

# Target, kept as a single-column DataFrame.
y = train_df[['SalePrice']]

In [None]:
# Keep only the highly correlated feature columns in the test frame
# ('SalePrice' is removed from the label list first). Note: this rebinds
# test_df, so the cell depends on the test_df produced by the cells above.
test_df=test_df[high_corr_features.drop('SalePrice')]

In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
# Shapes of the training and held-out feature matrices.
print( X_train.shape, X_test.shape)

In [None]:
# Scaler that standardizes each feature (zero mean, unit variance) once fitted.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then reuse those training
# statistics (per-feature mean and standard deviation) for the held-out split
# and the competition test set. Calling fit_transform on each set would
# standardize every set with its own statistics, so the model would be
# evaluated and used on features scaled differently from the ones it was
# trained on.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test_df = scaler.transform(test_df)

## **Training the Model**

In [None]:
from sklearn.linear_model import LinearRegression

Create an instance of a LinearRegression() model named `lr`.

In [None]:
# Linear regression model with default settings.
lr = LinearRegression()

In [None]:
# Fit the model on the scaled training features and the SalePrice target.
lr.fit(X_train,y_train)

Print out the coefficients of the model.

In [None]:
# One fitted coefficient per input feature.
print('Coefficients: \n', lr.coef_)

In [None]:
# Predictions for the held-out split, used for evaluation below.
pred = lr.predict(X_test)

In [None]:
# Scatter plot of the actual held-out targets against the model's predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')

In [None]:
from sklearn import metrics

# Error metrics on the held-out split; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

## **Predicting Test Data**

In [None]:
# Predict sale prices for the scaled competition test features; the last
# line displays the shape of the resulting prediction array.
prediction=lr.predict(test_df)
prediction.shape

In [None]:
# Print the predicted values.
print(prediction)

In [None]:
# Id column taken from the originally loaded test data, for the submission file.
ids = df_test['Id']

In [None]:
# Print the Id values.
print(ids)

In [None]:
# Single-column frame holding the test Ids.
# NOTE(review): Final is reassigned by the next cell and is not read anywhere
# in the visible notebook; this assignment appears to be dead code.
Final = pd.DataFrame({"Id":ids})

In [None]:
# Single-column frame holding the flattened predictions.
# NOTE(review): this overwrites the previous value of Final, and Final is not
# read anywhere in the visible notebook (the submission uses Final_sub).
Final = pd.DataFrame({"Sales Price":prediction.flatten()})

In [None]:
# Submission table: each test Id paired with its predicted SalePrice.
# flatten() turns the prediction array into a 1-D array for the column.
Final_sub = pd.DataFrame({"Id":ids,"SalePrice":prediction.flatten()})

In [None]:
# Preview the first 10 rows of the submission table.
Final_sub.head(10)

In [None]:
# Write the submission file with only the Id and SalePrice columns.
# index=False keeps the row index out of the CSV; the previous value, the
# string 'None', is non-empty and therefore truthy, so the index was still
# written as an extra unnamed first column.
Final_sub.to_csv('submission.csv', index=False)

# **COMPLETED**