##1.&nbsp;Import Data from Google Drive

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

url = "https://drive.google.com/file/d/1-PrhRxZgo-UOFKTWV7AAE7poam6pJ2wv/view?usp=drive_link" # regression_model
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

#data.info()

In [None]:
y = data.pop("SalePrice") # SalePrice is what we want to predict

In [None]:
data = data.drop("Id", axis=1)

In [None]:
#y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

In [None]:
#X_train.info()

In [None]:
X_train.isnull().sum()

MSSubClass         0
MSZoning           0
LotFrontage      217
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 79, dtype: int64

## the  dataframe has missing  values, so we have to Impute them

In [None]:
X_train.head(3)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1066,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,5,2009,WD,Normal
638,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2008,WD,Normal


##2.&nbsp; Create a Dummy Model

In [None]:
from sklearn.metrics import r2_score

# Create a dummy model that always predicts SalePrice > $150,000
def dummy_model(X):
    return [1 if x >= 150000 else 0 for x in X]

# Make predictions using the dummy model on the training data
dummy_y_train_pred = dummy_model(y_train)

# Calculate the R-squared score for the dummy model on the training data
dummy_r2 = r2_score(y_true=y_train, y_pred=dummy_y_train_pred)

# Print or use 'dummy_r2_train' to evaluate the dummy model's performance on the training data
print("Dummy R-squared (R2) Score on Training Data:", dummy_r2)




Dummy R-squared (R2) Score on Training Data: -5.519404800266533


In [None]:
#import seaborn as sns
#import matplotlib.pyplot as plt

# Create a DataFrame to store the errors on the training data
#dummy_errors_train_df = X_train.copy()
#dummy_errors_train_df["price"] = y_train
#dummy_errors_train_df["prediction"] = dummy_y_train_pred
#dummy_errors_train_df["error"] = dummy_errors_train_df["prediction"] - dummy_errors_train_df["price"]

# Create a figure with a specified aspect ratio
#plt.figure(figsize=(10, 6))

# Create a histogram of the errors
#sns.histplot(dummy_errors_train_df['error'], bins=30, kde=True, color='skyblue')

# Add a vertical line at zero to indicate perfect predictions
#plt.vlines(x=0, ymin=0, ymax=600, color='red')
#plt.xlim(-5000, 5000)

# Set labels and title
#plt.xlabel("Error (Prediction - Actual Price) on Training Data")
#plt.ylabel("Frequency")
#plt.title("Histogram of Dummy Model Errors on Training Data")

# Show the plot
#plt.show()


##  3.&nbsp;Implement the  Pipeline building the preprocessor, the numerical and  categorical data

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, SGDRegressor

In [None]:
# building the pipeline
X_cat = X_train.select_dtypes(exclude="number").copy() # will have all the string-columns
X_num = X_train.select_dtypes(include="number").copy() # will have all number columns


In [None]:
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))


In [None]:
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

##4.&nbsp; Build the  whole model

In [None]:
# make the Pipeline for DecisionTreeRegressor
# Define the preprocessors for numerical and categorical features

# Define the parameter grid for hyperparameter tuning
#param_grid = {
 #   "decisiontreeregressor__max_depth": range(2, 14, 2),
 #   "decisiontreeregressor__min_samples_leaf": range(3, 12, 2)
#}
#In the param_grid dictionary, you can modify the range values for max_depth and min_samples_leaf to specify the ranges and step sizes for these hyperparameters that you want to search over during the grid search. The range function generates a sequence of values within the specified range with the specified step size.

#For example, with range(2, 14, 2),
#it starts at 2 and goes up to (but not including) 14 in steps of 2, so it will search for max_depth values of 2, 4, 6, 8, 10, and 12. Similarly, with range(1, 12, 2), it starts at 1 and goes up to (but not including) 12 in steps of 2,
#so it will search for min_samples_leaf values of 1, 3, 5, 7, 9, and 11.
#You can modify these ranges and step sizes according to your specific requirements for hyperparameter tuning.

#min_samples_leaf: This hyperparameter specifies the minimum number
#of samples required to create a leaf node in the decision tree.
#This hyperparameter specifies the minimum number of samples required
#to create a leaf node in the decision tree.

# Create the pipeline with preprocessing (without StandardScaler) and decision tree regression
pipeline = make_pipeline(
    preprocessor,
    DecisionTreeRegressor()
)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    "decisiontreeregressor__max_depth": range(2, 14, 2),
    "decisiontreeregressor__min_samples_leaf": range(3, 12, 2)  # Modify min_samples_leaf range
}

# Initialize the GridSearchCV
search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1)

# Fit the GridSearchCV to your training data
search.fit(X_train, y_train)

# Get the best hyperparameters from the search
best_params = search.best_params_
print("Best Hyperparameters:", best_params)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'decisiontreeregressor__max_depth': 12, 'decisiontreeregressor__min_samples_leaf': 9}


In [None]:
# fit the search
search.fit(X_train, y_train)



Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
# predict with search
y_train_pred = search.predict(X_train)

In [None]:
from sklearn.metrics import r2_score

# Calculate R-squared (R2) score
dt_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

dt_r2

0.8801321892527421

##5.&nbsp;Create the pipelines for decision tree, SGDRegressor, and LinearRegression


In [None]:
from sklearn.linear_model import SGDRegressor

# Make the Pipeline for SGDRegressor
sgd_pipeline = make_pipeline(
    preprocessor,
    SGDRegressor()
)

# Define the parameter grid for hyperparameter tuning
sgd_param_grid = {
    "sgdregressor__alpha": [0.0001, 0.001, 0.01, 0.1, 1.0],  # Regularization strength
    "sgdregressor__penalty": ["l1", "l2", "elasticnet"],  # Regularization type
    "sgdregressor__max_iter": [1000, 2000, 3000],  # Maximum number of iterations
    "sgdregressor__random_state": [42]  # Random seed for reproducibility
}

# Initialize the GridSearchCV for SGDRegressor
sgd_search = GridSearchCV(sgd_pipeline, sgd_param_grid, cv=5, verbose=1)


# fit the search
sgd_search.fit(X_train, y_train)

# 3. predict with search
y_train_pred = sgd_search.predict(X_train)

from sklearn.metrics import r2_score

# Calculate R-squared (R2) score
sgd_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

sgd_r2

Fitting 5 folds for each of 45 candidates, totalling 225 fits


-7.92989411037327e+23

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Make the Pipeline for LinearRegression
lr_pipeline = make_pipeline(
    preprocessor,
    LinearRegression()
)

# Initialize the GridSearchCV for LinearRegression without a parameter grid
lr_search = GridSearchCV(lr_pipeline, param_grid={}, cv=5, verbose=1)

# Fit the search
lr_search.fit(X_train, y_train)

# Predict with the best model from the search
y_train_pred = lr_search.predict(X_train)

# Calculate R-squared (R2) score
lr_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

lr_r2


Fitting 5 folds for each of 1 candidates, totalling 5 fits


0.9071199100323454

In [None]:
# Create a DataFrame with R-squared scores
r2_scores_df = pd.DataFrame({"R2": [dummy_r2, dt_r2, sgd_r2, lr_r2]},
              #              index=["dumb_model", "decision_tree", "sgd", "linear_regression"])

 Print or use 'r2_scores_df' as needed
print(r2_scores_df)

## 6. Using the models to predict values for unseen data

In [None]:
# we will now use the search-object which learned everything before - to apply it to our test-data
lr_search

In [None]:
# get the test-data
test_url = "https://drive.google.com/file/d/1MZnPvWoGQtBHij32Rti26C2T0KT1xGBc/view?usp=drive_link"
test_path = 'https://drive.google.com/uc?export=download&id='+test_url.split('/')[-2]
test = pd.read_csv(test_path)

In [None]:
# check out the data (data types, missing values)
#test.info()

In [None]:
# set the id as index. have a look at the  Kaggle  Platform
test = test.set_index("Id")

In [None]:
# the Id info is now in the index of the dataframe
test

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1462,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
1463,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
1464,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
1465,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,1936,21.0,546.0,3,0,0,0.0,0,0,RM,...,,,,,Y,,,,WD,Normal
2916,1894,21.0,546.0,3,0,0,1.0,0,0,RM,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
2917,20000,160.0,1224.0,4,1,0,2.0,474,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
2918,10441,62.0,912.0,3,0,0,0.0,80,0,RL,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [None]:
# we have to use .predict
# here we stored the result in a new column of the dataframe
test["SalePrice"] = lr_search.predict(test)

In [None]:
#test.info()

In [None]:
# we need two columns "Id" and "SalePrice" and store it as csv
test = test.reset_index()

In [None]:
test[["Id", "SalePrice"]].to_csv("./submission.csv", index=False)