In [581]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from google.colab import files

# Load the dataset, then split off the "Expensive" column as the target.
# pop() removes that column from X in place, so X keeps only the features.
X = pd.read_csv('housing-classification-iter6.csv')
y = X.pop("Expensive")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)


### Creating the "numeric pipe" and the "categoric pipe"

In [582]:
# Select column names by dtype: columns stored as "object" are treated as
# categorical, every other dtype is treated as numeric.
numeric_columns = X_train.select_dtypes(exclude="object").columns
categoric_columns = X_train.select_dtypes(include="object").columns

# NOTE: the two legacy names below are swapped relative to their contents --
# X_cat_columns holds the *numeric* column names and X_num_columns holds the
# *categorical* ones. Their contents are left exactly as before because the
# ColumnTransformer cell pairs numeric_pipe with X_cat_columns and
# categoric_pipe with X_num_columns.
X_cat_columns = numeric_columns
X_num_columns = categoric_columns

# Numeric branch: replace missing values with the column mean.
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# Categorical branch: fill missing values with the constant "N_A", then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown='ignore'),
)


### Using ColumnTransformer to build a pipeline with 2 branches (the preprocessor)

In [583]:
# ColumnTransformer is already imported in the notebook's first cell, so it is
# not re-imported here.
#
# NOTE: the selector names are swapped relative to their contents --
# X_cat_columns holds the non-object (numeric) columns and X_num_columns holds
# the object (categorical) columns -- so the pairing below is correct as written.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_cat_columns),
        ("cat_pipe", categoric_pipe, X_num_columns),
    ]
)

### Creating the full_pipeline (preprocessor + Random Forest)

In [584]:
# Chain the preprocessor with a random forest. The forest is seeded (same seed
# as the train/test split) so that re-running the notebook reproduces the same
# fitted model and the same predictions.
full_pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=123),
)

In [585]:
# Fit the preprocessing steps and the classifier on the training split only.
full_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer())]),
                                                  Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1'...
       'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
       'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                ('randomforestclassifier',

In [586]:
# Accuracy on the same rows the pipeline was fitted on -- this is a training
# score, not a held-out estimate.
y_pred_train = full_pipeline.predict(X_train)
score = accuracy_score(y_train, y_pred_train)
score

1.0

### Predicting whether Houses are expensive or not

In [587]:
# Predict on the held-out split and report its accuracy. These rows were not
# used in fit(), so this is the figure to compare against the training score.
y_pred_test = full_pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test)

In [588]:
# Load the data to predict on, dropping every column whose name starts with "Unnamed".
test_data = pd.read_csv("test.csv").loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

# Predict with the fitted pipeline and pair each prediction with its Id.
prediction = full_pipeline.predict(test_data)
output_df = pd.DataFrame({'Id': test_data['Id'], 'Expensive': prediction})
output_df.head()

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0


In [589]:
# Write the predictions to a CSV file (without the index column), then hand
# that file to Colab's files.download.
submission_file = "Housing_Expensive.csv"
output_df.to_csv(submission_file, index=False)
files.download(submission_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>