In [357]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.stats import randint
import seaborn as sns
from sklearn.feature_selection import chi2, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder , LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
# reading
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv") 


In [358]:
data=data.set_index('Id') #for reseting index as Id
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [359]:
data['MSZoning'].apply(str)

Id
1       RL
2       RL
3       RL
4       RL
5       RL
        ..
1456    RL
1457    RL
1458    RL
1459    RL
1460    RL
Name: MSZoning, Length: 1460, dtype: object

In [360]:
data.isna().sum().sort_values(ascending = False) # to see nan values

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
Heating           0
HeatingQC         0
MSZoning          0
1stFlrSF          0
SalePrice         0
Length: 80, dtype: int64

In [361]:
P1 =data[['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']]
per =P1.isnull().sum()/len(P1)*100 # to check null values in percentage
per

Alley          93.767123
PoolQC         99.520548
Fence          80.753425
MiscFeature    96.301370
FireplaceQu    47.260274
dtype: float64

In [362]:
#Percentage of missing values is more then 80% 

data=data.drop(['Alley','PoolQC','Fence','MiscFeature'], axis = 1)
data.shape

(1460, 76)

In [363]:
#drop the traget column
X= data
y= X.pop('SalePrice')

In [364]:
print(data.shape)

(1460, 75)


spliting the data set

In [365]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state= 99900)

In [366]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").copy().columns 
X_num_columns = X.select_dtypes(include="number").copy().columns

# create numerical pipeline, only with the SimpleImputer(strategy="median") and data scaling
scaler = MinMaxScaler()
numeric_pipe = make_pipeline(scaler,
                             SimpleImputer(strategy="median"))
                             
 
# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder

categoric_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), # you can select this one also strategy="constant", fill_value="N_A"
    OneHotEncoder(handle_unknown = 'ignore', sparse=False) # covert all categorical data in the form of 0 and 1
)

Pipelines can contain many different steps inside. I would divide them into 2 groups: Preprocessing pipelines and Modelling pipelines. A Modelling pipeline has a model as their last step, whereas a preprocessing pipeline doesn't.


- Preprocessing pipelines: Those pipelines only transform the predictor features (the X) by filling NAs, encoding categorical features, scaling, etc. You always have to fit them with X_train. Then, you can call the .transform() method to transform both the X_train and the X_test. (Sometimes, you fit and transform X_train in a single step, by using the .fit_transform() method, but you're still performing these 2 separate steps). Any time that you call transform() you get as an output the transformed data, X_train or X_test.

In [367]:
from sklearn.compose import ColumnTransformer  #make_column_Transformer then dont need to mention names

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns)
      
    ]
)

In [368]:
# To display pipeline
from sklearn import set_config
set_config(display = 'diagram')

## Using Linear Regression
The variable(SalePrice) you want to predict is called the dependent variable.

In [369]:
performances = {}

In [370]:
preprocessor.fit_transform(X_train).shape

(1168, 272)

In [371]:
from sklearn.linear_model import LinearRegression
full_pipe_LR = make_pipeline(
    preprocessor,
    LinearRegression())

full_pipe_LR.fit(X_train, y_train)

LR_pred = full_pipe_LR.predict(X_test)

performances["baseline_LR"]= r2_score(y_test, LR_pred)
performances

{'baseline_LR': -7.958116153434094e+16}

In [372]:
# LR with PCA 

from sklearn.linear_model import LinearRegression
full_pipe_LR = make_pipeline(
    preprocessor,
    PCA(n_components=97),
    LinearRegression())

full_pipe_LR.fit(X_train, y_train)

LR_pred = full_pipe_LR.predict(X_test)

performances["PCA95_LR"]= r2_score(y_test, LR_pred)
performances

{'PCA95_LR': 0.8502907001618483, 'baseline_LR': -7.958116153434094e+16}

##XGBRegressor

In [373]:
from xgboost import XGBRegressor

#Using pipeline
full_pipe_XGB = make_pipeline(
    preprocessor,
    XGBRegressor())

full_pipe_XGB.fit(X_train, y_train)

XGB_pred = full_pipe_XGB.predict(X_test)

performances["XGB_pred"]= r2_score(y_test, XGB_pred)
performances



{'PCA95_LR': 0.8502907001618483,
 'XGB_pred': 0.8867040353685399,
 'baseline_LR': -7.958116153434094e+16}

## Using Random Forest Regressor

In [374]:
# MODEL BUILDING # 
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators=150, random_state=0)

In [375]:
full_pipeline = make_pipeline(preprocessor,
                              RF)

In [376]:
full_pipeline.fit(X_train, y_train)

In [377]:
preds = full_pipeline.predict(X_train)
preds[0:5]

array([300639.45333333, 242014.85333333, 229889.95333333, 110472.03333333,
       128784.8       ])

### Check on test file

In [378]:
test_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")

In [379]:
test_data.set_index("Id", inplace=True)  #reset index as Id

In [380]:
##Percentage of missing values

Pt =test_data[['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']]  
pert =P1.isnull().sum()/len(P1)*100
pert

Alley          93.767123
PoolQC         99.520548
Fence          80.753425
MiscFeature    96.301370
FireplaceQu    47.260274
dtype: float64

In [381]:
# Percentage of missing values is more then 80% and that's dropp it

test_data=test_data.drop(['Alley','PoolQC','Fence','MiscFeature'], axis = 1)

In [382]:
# To predict test data using XGB full pipeline
test_preds = full_pipe_XGB.predict(test_data)  


In [383]:
# Two cloumns are needed to make dataframe 
result = pd.DataFrame({'Id': test_data.index,             
                       'SalePrice': test_preds})


In [384]:
result

Unnamed: 0,Id,SalePrice
0,1461,122137.718750
1,1462,166652.609375
2,1463,174351.421875
3,1464,184163.312500
4,1465,200159.062500
...,...,...
1454,2915,82725.750000
1455,2916,80852.195312
1456,2917,171065.781250
1457,2918,123377.351562


In [385]:
# To covert dataframe to CSV
result.to_csv('Submission.csv', index=False)

In [386]:
# For download CSV file
from google.colab import files
files.download("Submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>