In [40]:
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
from  sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest,chi2,RFE
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from imblearn.over_sampling import SMOTE,ADASYN

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import  make_pipeline,Pipeline
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import f_regression
from sklearn.metrics import confusion_matrix,classification_report



In [41]:
url="https://raw.githubusercontent.com/digipodium/Datasets/main/regression/ames_housing_no_missing.csv"
df=pd.read_csv(url,index_col=False)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal,250000


In [42]:
X=df.drop(columns="SalePrice")
y=df["SalePrice"]

In [43]:
numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
        )

categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, X.select_dtypes(np.number).columns.tolist()),
                ("category", categorical_transformer,X.select_dtypes("object").columns.tolist()),
            ]
        )


In [44]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=45)

In [45]:

feature_selector = Pipeline(
        steps=[("preprocessor", preprocessor),
        ("feature", SelectKBest(f_regression,k=3))]
    )
feature_selector.fit(xtrain,ytrain)


In [46]:
print(feature_selector.get_feature_names_out())

['numeric__OverallQual' 'numeric__GrLivArea' 'numeric__GarageCars']


In [47]:

model_selector = Pipeline(
        steps=[("preprocessor", preprocessor),
        ("feature", SelectKBest(f_regression,k=3)),
        ("classifier", RandomForestRegressor())]
    )
model_selector.fit(xtrain,ytrain)

In [48]:
model_selector = Pipeline(
        steps=[("preprocessor", preprocessor),
        ("feature", SelectKBest(f_regression,k=3)),
        ("classifier", RandomForestRegressor())]
    )
model_selector.fit(xtrain,ytrain)
model_selector.fit(xtrain,ytrain)
ypred=model_selector.predict(xtest)
result=r2_score(ytest,ypred)
print(f"R2score is {result}")

R2score is 0.802896677623937


In [49]:
parms={
    "classifier__n_estimators":[50,100,150,200,250],
    'classifier__criterion':["squared_error", "absolute_error", "poisson"],
    'classifier__max_depth':[10,15,20,25,30],
    # 'classifier__min_samples_leaf':[1,2]
}
# grid=GridSearchCV(clf,parms,cv=4,n_jobs=-1,verbose=3)
# grid.fit(xtrain,ytrain)

In [50]:
grid=GridSearchCV(model_selector,parms,cv=4,n_jobs=-1,verbose=3)
grid.fit(xtrain,ytrain)

Fitting 4 folds for each of 75 candidates, totalling 300 fits


KeyboardInterrupt: 

In [None]:
grid.best_params_

{'classifier__criterion': 'absolute_error',
 'classifier__max_depth': 10,
 'classifier__n_estimators': 250}

In [51]:
Listing=[]
def extracting():
    for i in range(1,25):
# ****************Feature Seclortot**********************************************
        feature_selector = Pipeline(
            steps=[("preprocessor", preprocessor),
            ("feature", SelectKBest(f_regression,k=i))])
        feature_selector.fit(xtrain,ytrain)
 # ****************Model Seclortot**********************************************
        model_selector = Pipeline(
            steps=[("preprocessor", preprocessor),
            ("feature", SelectKBest(f_regression,k=i)),
            ("classifier", RandomForestRegressor())]
        )
        model_selector.fit(xtrain,ytrain)\
# *********************************Hyper Parametet***********************************
        grid=GridSearchCV(model_selector,parms,cv=4,n_jobs=-1,verbose=3)
        grid.fit(xtrain,ytrain)
        feature=grid.best_params_

        model=grid.best_estimator_
        ypred_model=model.predict(xtest)
        result_model=r2_score(ytest,ypred_model)
        print(f"R2score of Result Model is : {result_model}")
#****************************Result Generation ******************************
        ypred=model_selector.predict(xtest)
        result=r2_score(ytest,ypred)
        print(f"R2score is {result}")
#*********************************Working on features****************************
        xopt=feature_selector.get_feature_names_out()
        feature_selection=[]
        for x in xopt:
            feature_selection.append(x.split("__")[1])
        print(feature_selection)
        print(f"***********************--{i}******************")

# ********************Colecting Data--***********************************************
        Listing.append({
            "i":i,
            "Error":result,
            "Error_model":result_model,
            "columns":feature_selection,
            "parameter":feature
        })
    

In [None]:
# extracting()

Fitting 4 folds for each of 75 candidates, totalling 300 fits
R2score of Result Model is : 0.6808031663042895
R2score is 0.6828793417754206
['OverallQual']
***********************--1******************
Fitting 4 folds for each of 75 candidates, totalling 300 fits
R2score of Result Model is : 0.7691353180961717
R2score is 0.7479018879949564
['OverallQual', 'GrLivArea']
***********************--2******************
Fitting 4 folds for each of 75 candidates, totalling 300 fits
R2score of Result Model is : 0.8176587464167182
R2score is 0.7960901000515997
['OverallQual', 'GrLivArea', 'GarageCars']
***********************--3******************
Fitting 4 folds for each of 75 candidates, totalling 300 fits
R2score of Result Model is : 0.828524569167296
R2score is 0.8203683122899574
['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
***********************--4******************
Fitting 4 folds for each of 75 candidates, totalling 300 fits
R2score of Result Model is : 0.8455419306952175
R2scor

In [None]:
ls=pd.DataFrame(Listing)
print(ls)

     i     Error  Error_model  \
0    1  0.682879     0.680803   
1    2  0.747902     0.769135   
2    3  0.796090     0.817659   
3    4  0.820368     0.828525   
4    5  0.840456     0.845542   
5    6  0.837125     0.842984   
6    7  0.844792     0.847532   
7    8  0.849211     0.842378   
8    9  0.843966     0.847769   
9   10  0.846867     0.849797   
10  11  0.846641     0.845759   
11  12  0.859530     0.859747   
12  13  0.857686     0.860882   
13  14  0.856598     0.862971   
14  15  0.858977     0.861882   
15  16  0.858542     0.862083   
16  17  0.861727     0.867937   
17  18  0.862100     0.869202   
18  19  0.861773     0.865401   
19  20  0.866018     0.864912   
20  21  0.864047     0.869102   
21  22  0.869272     0.867329   
22  23  0.862798     0.874992   
23  24  0.865342     0.874989   

                                              columns  \
0                                       [OverallQual]   
1                            [OverallQual, GrLivArea]   
2  

In [None]:
# ls.to_csv('raw_data.csv', index=False)

In [52]:
model_selector = Pipeline(
        steps=[("preprocessor", preprocessor),
        ("feature", SelectKBest(f_regression,k=3)),
        ("classifier", RandomForestRegressor())]
    )
model_selector.fit(xtrain,ytrain)
model_selector.fit(xtrain,ytrain)
ypred=model_selector.predict(xtest)
result=r2_score(ytest,ypred)
print(f"R2score is {result}")

R2score is 0.8018320624839805


In [None]:
data=pd.read_csv("raw_data.csv")

In [None]:
data.head(23)

Unnamed: 0,i,Error,Error_model,columns,parameter
0,1,0.682879,0.680803,['OverallQual'],"{'classifier__criterion': 'absolute_error', 'c..."
1,2,0.747902,0.769135,"['OverallQual', 'GrLivArea']","{'classifier__criterion': 'absolute_error', 'c..."
2,3,0.79609,0.817659,"['OverallQual', 'GrLivArea', 'GarageCars']","{'classifier__criterion': 'absolute_error', 'c..."
3,4,0.820368,0.828525,"['OverallQual', 'GrLivArea', 'GarageCars', 'Ga...","{'classifier__criterion': 'absolute_error', 'c..."
4,5,0.840456,0.845542,"['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'G...","{'classifier__criterion': 'poisson', 'classifi..."
5,6,0.837125,0.842984,"['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'G...","{'classifier__criterion': 'poisson', 'classifi..."
6,7,0.844792,0.847532,"['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'Gr...","{'classifier__criterion': 'poisson', 'classifi..."
7,8,0.849211,0.842378,"['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'Gr...","{'classifier__criterion': 'squared_error', 'cl..."
8,9,0.843966,0.847769,"['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'Gr...","{'classifier__criterion': 'poisson', 'classifi..."
9,10,0.846867,0.849797,"['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'Gr...","{'classifier__criterion': 'poisson', 'classifi..."


In [59]:
print(df.columns.tolist())

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fen

In [88]:
cols1=['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath','TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'ExterQual', 'Foundation', 'BsmtQual',  'KitchenQual', 'GarageFinish']


In [89]:
X = df[cols1]
xtrain,xtest,ytrain,ytest=train_test_split(X, y ,test_size=0.2,random_state=45)

In [90]:
xtrain.head()

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageCars,GarageArea,ExterQual,Foundation,BsmtQual,KitchenQual,GarageFinish
1075,7,1940,1984,800,960,1740,1,6,1,240,TA,CBlock,TA,TA,Unf
372,6,1984,1984,744,752,752,1,4,1,264,TA,CBlock,Gd,TA,Unf
502,5,1965,1965,1214,1214,1214,1,6,2,461,TA,CBlock,TA,TA,Unf
924,6,1980,1980,1686,1686,1686,2,7,2,612,TA,CBlock,Gd,TA,Unf
662,6,1968,1968,1392,1392,1392,1,5,2,576,TA,CBlock,Fa,TA,RFn


In [91]:
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, X.select_dtypes(np.number).columns.tolist()),
                ("category", categorical_transformer,X.select_dtypes("object").columns.tolist())])
model_selector_random = Pipeline(
        steps=[("preprocessor", preprocessor),
        ("classifier", RandomForestRegressor())])

In [96]:

model_selector_random.fit(xtrain,ytrain)
ypred=model_selector_random.predict(xtest)
result=r2_score(ytest,ypred)
print(f"R2score is {result}")

R2score is 0.855963468772047


In [92]:
model_selector_random

In [93]:
xtrain.columns.tolist()

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageCars',
 'GarageArea',
 'ExterQual',
 'Foundation',
 'BsmtQual',
 'KitchenQual',
 'GarageFinish']

In [94]:
model_selector_random.fit(xtrain,ytrain)

In [95]:
model_selector_random.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', StandardScaler())]),
                                    ['OverallQual', 'YearBuilt', 'YearRemodAdd',
                                     'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
                                     'FullBath', 'TotRmsAbvGrd', 'GarageCars',
                                     'GarageArea']),
                                   ('category',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    ['ExterQual', 'Foundation', 'BsmtQual',
                                     'KitchenQual', 'GarageFinish'])])),
  ('classifier', RandomForestRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformer