# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the training table from disk and preview its first rows
train_csv = '../data/train_v6.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,PC1,PC2,price
0,-0.366039,0.476383,9279.0
1,4.970104,0.26879,22563.0
2,-0.78673,0.023457,9995.0
3,-0.441181,0.414536,11259.0
4,1.863872,0.750249,15750.0


# Splitting Data

In [3]:
# Separate the predictors from the target column.
# 'Unnamed: 0' holds 0, 1, 2, ... in the preview above, i.e. it looks like a
# row index that was saved into the CSV. It carries no information about the
# price, so it is excluded from the predictors together with the target.
# errors='ignore' keeps the cell working if that index column is absent.
features = df.drop(columns=['price', 'Unnamed: 0'], errors='ignore')
target = df['price']

# Models

Regression is a supervised learning task used for predicting a continuous numeric value. There are several regression models available in Python. The choice of the best regression model depends on your specific dataset and problem. Here are some commonly used regression models with Python examples:
1. <b>Linear Regression:</b><br>
    Linear regression is a simple and widely used model that assumes a linear relationship between the features and the target variable.

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
# Ordinary least-squares baseline, trained on every available row
model = LinearRegression()
model.fit(X=features, y=target)

In [6]:
# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

In [7]:
from sklearn.metrics import mean_squared_error

In [8]:
# RMSE = square root of the mean squared residual (computed on the fitted rows)
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 2647.0125175088956


2. <b>Ridge Regression:</b><br>
    Ridge regression is a linear regression variant that adds L2 regularization to the cost function. It's useful for handling multicollinearity and preventing overfitting.

In [38]:
from sklearn.linear_model import Ridge

# L2-regularised linear model; alpha controls the penalty strength
model = Ridge(alpha=1.0).fit(features, target)

# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

# RMSE = square root of the mean squared residual
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 2647.094775828543


3. <b>Lasso Regression:</b><br>
    Lasso regression is another linear regression variant that adds L1 regularization to the cost function. It helps with feature selection and can set some feature coefficients to zero.

In [37]:
from sklearn.linear_model import Lasso

# L1-regularised linear model; alpha controls the penalty strength
model = Lasso(alpha=0.01).fit(features, target)

# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

# RMSE = square root of the mean squared residual
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 2647.0125175406306


4. <b>Decision Tree Regression:</b><br>
    Decision tree regression can capture complex non-linear relationships between features and the target variable. It partitions the feature space into regions and predicts the average of the target values in each region.

In [39]:
from sklearn.tree import DecisionTreeRegressor

# Fit a Decision Tree regression model.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so tied splits can otherwise resolve differently between
# runs and the reported RMSE would not be reproducible.
model = DecisionTreeRegressor(max_depth=75, random_state=42)  # You can adjust the max depth
model.fit(features, target)

# Make predictions on the same rows used for fitting
y_pred = model.predict(features)

# Calculate and print the RMSE.
# NOTE: this is an in-sample (training) error; a tree this deep can fit the
# training rows very closely, so the number does not measure generalisation.
rmse = np.sqrt(mean_squared_error(target, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 318.60875452745876


5. <b>Random Forest Regression:</b><br>
    Random Forest is an ensemble method that combines multiple decision trees to improve predictive accuracy and reduce overfitting.

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Fit a Random Forest regression model.
# Each tree is grown on a bootstrap sample of the rows, so the fit is
# stochastic; random_state is pinned to make the printed RMSE reproducible
# under Restart & Run All.
model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust the number of trees
model.fit(features, target)

# Make predictions on the same rows used for fitting
y_pred = model.predict(features)

# Calculate and print the RMSE.
# NOTE: this is an in-sample (training) error, not a hold-out estimate.
rmse = np.sqrt(mean_squared_error(target, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 979.0146634212703


6. <b>Support Vector Regression (SVR):</b><br>
    Support Vector Regression is based on support vector machines and is suitable for both linear and non-linear regression tasks.

In [41]:
from sklearn.svm import SVR

# Linear-kernel support vector regressor; C is the regularisation parameter
model = SVR(kernel='linear', C=1.0).fit(features, target)

# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

# RMSE = square root of the mean squared residual
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 6390.212685503681


Here are some more regression models that you can consider for your machine learning tasks:

1. <b style="color: #B15EFF;">Gradient Boosting Regressor:</b><br>
    Gradient Boosting is an ensemble method that builds an additive model in a forward stage-wise manner. It can handle complex non-linear relationships and is often a top performer in regression tasks.

In [43]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a Gradient Boosting regression model.
# random_state is forwarded to every underlying tree (which permutes features
# at each split); pinning it keeps the result identical between runs.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)  # Adjust hyperparameters
model.fit(features, target)

# Make predictions on the same rows used for fitting
y_pred = model.predict(features)

# Calculate and print the RMSE.
# NOTE: this is an in-sample (training) error, not a hold-out estimate.
rmse = np.sqrt(mean_squared_error(target, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 370.45547330504996


2. <b style="color: #0C356A;">AdaBoost Regressor:</b><br>
    AdaBoost is another ensemble method that combines multiple weak learners to create a strong learner. It's particularly effective for regression tasks.

In [44]:
from sklearn.ensemble import AdaBoostRegressor

# Fit an AdaBoost regression model.
# AdaBoost for regression resamples the training rows according to the
# current sample weights at every boosting round, so the fit is stochastic;
# random_state is pinned to make the printed RMSE reproducible.
model = AdaBoostRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)  # Adjust hyperparameters
model.fit(features, target)

# Make predictions on the same rows used for fitting
y_pred = model.predict(features)

# Calculate and print the RMSE.
# NOTE: this is an in-sample (training) error, not a hold-out estimate.
rmse = np.sqrt(mean_squared_error(target, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1056.0473967940486


3. <b style="color: #DA0C81;">XGBoost Regressor:</b><br>
    XGBoost is a highly optimized gradient boosting library that often outperforms other models in regression tasks. It offers excellent predictive accuracy.

In [None]:
import xgboost as xgb

# Gradient-boosted trees via XGBoost's scikit-learn wrapper
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
model.fit(X=features, y=target)

# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

# RMSE = square root of the mean squared residual
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")

4. <b style="color: #CD5C08;">K-Nearest Neighbors (KNN) Regressor:</b><br>
    KNN is a non-parametric method that makes predictions based on the average of the K nearest data points. It's simple and flexible.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor; n_neighbors is K
model = KNeighborsRegressor(n_neighbors=5).fit(features, target)

# Predict on the same rows the model was fitted on
y_pred = model.predict(X=features)

# RMSE = square root of the mean squared residual
rmse = np.sqrt(
    mean_squared_error(y_true=target, y_pred=y_pred)
)
print(f"Root Mean Squared Error: {rmse}")