In [1]:
#Importing the needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer,QuantileTransformer
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

In [2]:
model = LinearRegression()
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
lbl = LabelEncoder()

In [3]:
#importing dataset (training and testing data)
training_data0 = pd.read_csv("C:/Users/KATE/dsn/house_train_data.csv")
Atesting_data0 = pd.read_csv("C:/Users/KATE/dsn/house_test_data.csv")
training_data0.shape
Atesting_data0.shape

(1459, 80)

In [4]:
# extracting the target variable from the training dataset
training_Y = training_data0['SalePrice']
training_Y.shape

(1460,)

In [None]:
# combining both data for easy transformation.
training_data = pd.concat([training_data0, Atesting_data0],ignore_index=True,join_axes=[training_data0.columns])
training_data.shape

(2919, 81)

In [None]:
#checking missing percentage
all_data_na = (training_data.isnull().sum() / len(training_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio
PoolQC,99.657417
MiscFeature,96.402878
Alley,93.216855
Fence,80.438506
SalePrice,49.982871
FireplaceQu,48.646797
LotFrontage,16.649538
GarageYrBlt,5.447071
GarageFinish,5.447071
GarageQual,5.447071


In [None]:
'''# HANDLING MISSING VALUES '''
#deleting features with non values greateer than 10% and SalePrice column
training_data_dropna = training_data.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','LotFrontage','SalePrice'], axis=1)

In [None]:
#imputing those with less than 10% missing values with the most frequent values in their columns
training_imp =imputer.fit_transform(training_data_dropna)
#putting a dataframe 
training_imp2 = pd.DataFrame(training_imp, columns=training_data_dropna.columns)
#print('training_imp2.head: \n',training_imp2.head())

In [None]:
''' rechecking for missing values '''
Null = pd.DataFrame(training_imp2.isnull().sum().sort_values(ascending=False))
Null.columns = ['Null Count']
Null.index.name = 'Feature' 
#print('number of missing values in training_imp: \n', Null.head(20))

In [None]:
#dropping id column
training_X = training_imp2.drop(['Id'],axis = 1)

In [None]:
'''        FEATURE ENGINEERING'''
#Extracting categorical features
categorical_features = training_X.select_dtypes(include=[object])
#print('columns of categorical_features.columns',categorical_features.columns)
#to check the uniquesness of each columns
#print(categorical_features.head().describe())

In [None]:
#to extract the numeric features of the dataset
numeric_features = training_data.select_dtypes(include=[np.number])
#print('legth of numeric_features.columns',numeric_features.columns)

In [None]:
'''LABEL ECODING OF CATEGORIZED FEATURES''' 
for column in categorical_features.columns:
    lbl = LabelEncoder() 
    training_X[column] = lbl.fit_transform(training_X[column].values) 
print('training_X after labelling: \n',training_X.iloc[0:10,0:5], training_X.shape)

training_X after labelling: 
    MSSubClass  MSZoning  LotArea  Street  LotShape
0           5         3      619       1         3
1           0         3      895       1         3
2           5         3     1266       1         0
3           6         3      883       1         0
4           5         3     1670       1         0
5           4         3     1650       1         0
6           0         3     1002       1         3
7           5         3     1074       1         0
8           4         4      251       1         3
9          15         3      405       1         3 (2919, 73)


In [None]:
''' ONE HOT ENCODING OF CATEGORIZED FEATURES'''
col_num = []
for col in categorical_features.columns:
    col_num += [training_X.columns.get_loc(col)]
for num in col_num:    
    onehotencoder = OneHotEncoder(categorical_features=[num])
    training_X = pd.DataFrame(onehotencoder.fit_transform(training_X).toarray())
print('training_X after onehotencoder: \n',training_X.iloc[0:10,0:5],training_X.shape)

In [None]:
''' TO GET DUMMIES'''  
training_X = pd.get_dummies(training_X)
#print('training_X after dummies: \n',training_X.iloc[0:10,0:5], training_X.shape)

In [None]:
'''SPLITTING THE TRAIN.CSV AND TEST DATA.CSV '''
train_sets = training_X.loc[0:1459,:]
print(train_sets.shape)
test_sets = training_X.loc[1460:2919,:]
print(test_sets.shape)

In [None]:
'''SPLITTING TRAIN DATASET FOR MODEL VALIDATION '''
X_train, X_test, y_train, y_test = train_test_split(train_sets, np.log1p(training_Y), test_size=0.3, random_state=42)
print(len(X_train),len(X_test),len(y_train),len(y_test))
#print('X_train before normalized:\n', X_train.iloc[0:10,0:10], X_train.shape)

In [None]:
''' NORMALIZING THE X_TRAIN AND X_TEST
NOTE: When Normalizing, you fit and transform the training data  but only transform the testing data'''
#using Standardization
normalizer = Normalizer()
X_train_n = normalizer.fit_transform(X_train)
X_test_n = normalizer.transform(X_test)
print('X_train_n after normalized:\n',pd.DataFrame(X_train_n[0:10,0:10]), X_train_n.shape)

In [None]:
#fitting the model on the normalized train X and y train
model_fit = model.fit(X_train_n, y_train)
#predicting the target on  normalized test data
model_pred = pd.DataFrame(np.exp(model.predict(X_test_n)))
#print(model_pred[0:10])
#print('RMSE is : \n', np.sqrt(mean_squared_error(y_test, model_pred)))

In [None]:
''' visualizing'''
plt.scatter(y_test, model_pred, alpha= .7)
plt.xlabel('Predicted price')
plt.ylabel(' Actual Price')
plt.title('Linear Regression Model')

In [None]:
'''PREDICTING THE TARGET VALUES OF TESTING DATA.CSV'''

normalizer = Normalizer()
test_sets_n = normalizer.transform(test_sets)
final_model_pred = np.exp(model.predict(test_sets_n))
final_price_pred = pd.DataFrame(final_model_pred, columns =['SalePrice'], index = Atesting_data0['Id'])
print(final_price_pred[0:10])
#final_price_pred.to_csv ('price_pred_submission4.csv')