In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the training CSV, then split it into the target column (y) and
# every other column (X).
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [None]:
# Count the missing entries of each column of X and show the counts as a bar chart.
missing_vals = (
    X.isnull()
    .sum()
    .rename_axis('column')
    .reset_index(name='no. of missing values')
)

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(
    x=missing_vals['column'],
    y=missing_vals['no. of missing values'],
    palette='crest',
    ax=ax,
)
ax.set_title('Analysis of column-wise missing values')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of null values')
# Draw the x tick labels rotated by -90 degrees.
ax.tick_params(axis='x', labelrotation=-90)
plt.show()

# Dropping the columns with a high number of null values

In [None]:
# Remove the columns singled out for their high number of missing values, then
# one-hot encode the categorical columns that remain.
# DataFrame.drop returns a new frame rather than modifying X, so its result must be
# assigned back; without the assignment the listed columns survive into the encoding.
high_null_columns = ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
# errors='ignore' keeps this cell re-runnable once X no longer holds these columns.
X = X.drop(columns=high_null_columns, errors='ignore')
X = pd.get_dummies(data=X)

# Encoding categorical variables and imputing missing values

In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN in X using the 'mean' strategy of SimpleImputer. The transformed
# values are wrapped in a new DataFrame that carries over the column labels and
# index X had before imputation.
cols = X.columns
idx = X.index
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=cols, index=idx)

# Finding and plotting top features based on correlation

In [None]:
# Rank every feature except 'Id' by the absolute correlation coefficient between
# its values and the target y, strongest first, and keep the first k of them.
k = len(df.columns)  # k is the number of columns in the raw frame df


def _abs_corr_with_target(feature):
    """Return the absolute correlation coefficient between one feature column and y."""
    return np.abs(np.corrcoef(feature, y)[0, 1])


correlation_values = X.drop(columns=['Id']).apply(_abs_corr_with_target)
sorted_features = correlation_values.sort_values(ascending=False)
# 'Id' is placed at the front of the selection, followed by the k top-ranked features.
selected_features = ['Id', *sorted_features.index[:k]]

In [None]:
# Bar chart of the k largest absolute correlations held in sorted_features.
top_k = sorted_features.iloc[:k]

fig, ax = plt.subplots(figsize=(18, 6))
sns.barplot(x=top_k.index, y=top_k, palette='crest', ax=ax)
# Draw the x tick labels rotated by -90 degrees.
ax.tick_params(axis='x', labelrotation=-90)
ax.set_xlabel('Features')
ax.set_ylabel('Absolute Correlation')
ax.set_title("Top 'K' features and their Absolute Correlation to Target")
fig.tight_layout()
plt.show()

# Training the linear regression model

In [None]:
# Restrict X to the selected columns, hold out 20% of the rows with a fixed
# random_state, and fit a LinearRegression model on the remaining rows.
# NOTE(review): selected_features starts with 'Id', so the identifier column is
# one of the regressors the model is fit on — confirm that this is intended.
X = X.loc[:, selected_features]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
lin_regressor = LinearRegression().fit(X_tr, y_tr)
lin_regressor

In [345]:
# Predict SalePrice for the held-out rows and write an Id/SalePrice table to
# submission.csv without the index column.
# NOTE(review): X_te is the 20% hold-out produced by train_test_split on the
# training frame, so these Ids are training rows — confirm whether a separate
# competition test file was meant to be scored here instead.
test_predictions = lin_regressor.predict(X_te)
id_column = X_te['Id'].to_numpy().astype(np.int32)
submission = pd.DataFrame({
    'Id': id_column,
    'SalePrice': test_predictions,
})
submission.to_csv('submission.csv', index=False)
