In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# Load Data (both train and test data)

In [2]:
# Locations of the competition files (relative to the notebook's working directory).
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '../input/house-prices-advanced-regression-techniques/test.csv'

# df holds the training data, df_test the test data.
df, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Names of the columns pandas loaded with dtype object in the training data;
# the rest of the notebook treats these as the categorical columns.
categorical_cols = df.select_dtypes(include='object').columns

# Process total dataframe

In [3]:
# Stack the training features (without the SalePrice target) on top of the
# test data. The combined frame is used further down to fit the one-hot
# encoder. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; df and df_test themselves are left untouched.
df_total = pd.concat(
    [df.drop(columns=['SalePrice']), df_test],
    ignore_index=True,
)

# Change NaN in categorical columns to NA.
# The filled columns are assigned back as a block: calling
# fillna(inplace=True) on df_total[i] operates on a temporary object and is
# not guaranteed to update df_total (it does not under pandas Copy-on-Write).
df_total[categorical_cols] = df_total[categorical_cols].fillna('NA')

# Drop rows with NaN in non-categorical columns
df_total = df_total.dropna()

# Drop unnecessary columns:
df_total = df_total.drop(columns=['Id'])

# Process df1 as copy of train dataframe

In [4]:
# Work on a copy so the raw training frame df stays available unchanged.
df1 = df.copy()

# Change NaN in categorical columns to NA.
# Assigned back as a block instead of df1[i].fillna(..., inplace=True):
# the in-place call acts on a temporary object and is not guaranteed to
# update df1 (it does not under pandas Copy-on-Write).
df1[categorical_cols] = df1[categorical_cols].fillna('NA')

# Drop rows with NaN in non-categorical columns
df1 = df1.dropna()

# Drop unnecessary columns:
df1 = df1.drop(columns=['Id'])

# Encoding Categorical data

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Process non-categorical columns: everything in df1 that is not one-hot
# encoded, as a NumPy array truncated to integers.
data_other_cols = df1.drop(columns=categorical_cols).to_numpy().astype(int)

# One-hot encode the categorical columns; all other columns are dropped by
# the transformer and handled separately above.
# Dense output is requested through ColumnTransformer(sparse_threshold=0)
# rather than OneHotEncoder(sparse=False): the `sparse` keyword was renamed
# to `sparse_output` and later removed in newer scikit-learn releases, while
# sparse_threshold=0 ("always return dense") works across versions.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='if_binary'), categorical_cols)],
    remainder='drop',
    sparse_threshold=0,
)

# The encoder is fitted on the combined train + test frame and then applied
# to the cleaned training frame.
ct.fit(df_total)

df1_new = ct.transform(df1)

# concatenate two matrices: encoded categorical columns first, then the
# remaining (non-categorical) columns
df2 = np.concatenate((df1_new, data_other_cols), axis=1)

df1_new.shape

(1121, 259)

# Split X and y

In [6]:
# Features are every column of df2 except the last; the last column is the
# target. Both are views into df2, not copies.
# NOTE(review): assumes SalePrice is the last non-categorical column of the
# training CSV - confirm.
X_train, y_train = df2[:, :-1], df2[:, -1]

# Feature Scaling X_train and y_train

In [7]:
from sklearn.preprocessing import StandardScaler

# X_train scaler: standardise the trailing block of columns in place.
# NOTE(review): the one-hot block is 259 columns wide (see df1_new.shape),
# so starting at 260 leaves the first non-categorical column unscaled. The
# test-set scaling cell hard-codes the same 260, so the two offsets must be
# changed together or scaler_X will see a different number of features.
scaler_X = StandardScaler()
scaler_X.fit(X_train[:, 260:])
X_train[:, 260:] = scaler_X.transform(X_train[:, 260:])

# y_train scaler: StandardScaler needs a 2-D input, so the target is
# reshaped to a single column; y_train is a column vector afterwards.
scaler_y = StandardScaler()
target_column = y_train.reshape(-1, 1)
y_train = scaler_y.fit(target_column).transform(target_column)

# Random Forest

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; random_state is fixed so the fit is repeatable.
Regressor = RandomForestRegressor(n_estimators=100, random_state=1)

# After scaling, y_train is a column vector of shape (n, 1). ravel() hands
# fit() the 1-D target it expects, which avoids scikit-learn's
# DataConversionWarning ("A column-vector y was passed when a 1d array was
# expected"). It is a no-op if y_train is already 1-D.
Regressor.fit(X_train, y_train.ravel())

  """


RandomForestRegressor(random_state=1)

# Now the test set

In [9]:
# Get a copy of test dataframe so df_test stays available unchanged
df_test1 = df_test.copy()

# Change NaN in categorical columns to NA.
# Assigned back as a block instead of df_test1[i].fillna(..., inplace=True):
# the in-place call acts on a temporary object and is not guaranteed to
# update df_test1 (it does not under pandas Copy-on-Write).
df_test1[categorical_cols] = df_test1[categorical_cols].fillna('NA')

# Get the rows that still contain NaN (i.e. in non-categorical columns) and
# remember their Ids: no prediction is produced for them, and the same Ids
# are removed from the sample submission later on.
Dropped_rows = df_test1[df_test1.isna().any(axis=1)]
Dropped_rows_Id = Dropped_rows['Id'].values

# Drop rows with NaN in non-categorical columns
df_test1 = df_test1.dropna()

# Drop unnecessary columns:
df_test1 = df_test1.drop(columns=['Id'])

# Encoding Categorical data

In [10]:
# One-hot encode the categorical columns with the encoder fitted earlier.
df_test1_new = ct.transform(df_test1)

# Obtain non-categorical columns as a NumPy array truncated to integers.
data_other_cols_test = (
    df_test1
    .drop(columns=categorical_cols)
    .to_numpy()
    .astype(int)
)

# Place the encoded block first and the non-categorical block after it,
# mirroring the layout of the training matrix.
df_test2 = np.hstack((df_test1_new, data_other_cols_test))

df_test1_new.shape

(1146, 259)

# Feature Scaling

In [11]:
# Apply the scaler fitted on the training data to the same column range of
# the test matrix, in place.
# NOTE(review): 260 must stay identical to the offset used when scaler_X was
# fitted; the one-hot block is 259 columns wide (see df_test1_new.shape), so
# the first non-categorical column is left unscaled in both train and test.
numeric_part = df_test2[:, 260:]
df_test2[:, 260:] = scaler_X.transform(numeric_part)

# Do predictions

In [12]:
# Predict on the prepared test matrix; the model was trained on a
# standardised target, so its output is in scaled units.
scaled_predictions = Regressor.predict(df_test2)

# Undo the target scaling. scaler_y works on 2-D input, hence the reshape to
# a single column; predictions ends up with shape (n_rows, 1).
predictions = scaler_y.inverse_transform(scaled_predictions.reshape(-1, 1))

print(predictions)

[[123832.  ]
 [156158.9 ]
 [188430.29]
 ...
 [ 90011.  ]
 [146117.79]
 [237703.23]]


# Import Sample predictions for calculating error

In [13]:
# NOTE(review): sample_submission.csv is presumably Kaggle's example
# submission rather than ground-truth sale prices - confirm before reading
# any score computed against it as a real test error.
df_predictions_sample = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Index a new frame by Id and remove the Ids of the test rows that were
# dropped for containing NaN, so it lines up row-for-row with `predictions`.
# df_predictions_sample itself is not modified.
df_predictions_sample1 = (
    df_predictions_sample
    .set_index('Id')
    .drop(index=Dropped_rows_Id)
)

# First remaining column holds the sample predictions.
Predictions_sample = df_predictions_sample1.iloc[:, 0]

# Calculate rmsle error

In [14]:
from sklearn.metrics import mean_squared_log_error

# Mean squared log error between the sample predictions (passed as y_true)
# and the model's predictions (passed as y_pred).
msle = mean_squared_log_error(Predictions_sample, predictions)

# Calculate RMSLE: the square root of the MSLE.
error = np.sqrt(msle)

error

0.3532450963893092

# Save to a .csv file

In [15]:
# One SalePrice per remaining test row, indexed by the Ids that survived the
# NaN filtering (the index of df_predictions_sample1).
# NOTE(review): test rows dropped for NaN therefore have no line in the
# file - confirm the competition accepts a submission with missing Ids.
submission = pd.DataFrame(predictions, columns=['SalePrice'], index=df_predictions_sample1.index)

# The former `submission.reset_index()` call discarded its result and had no
# effect, so it is removed. The index is written as the first CSV column,
# exactly as before.
submission.to_csv('submission_reza.csv')