In [1]:
%pip install sklearn scikit-learn

Collecting sklearn
  Using cached sklearn-0.0.post10-py3-none-any.whl
Collecting scikit-learn
  Downloading scikit_learn-1.3.1-cp311-cp311-win_amd64.whl (9.2 MB)
                                              0.0/9.2 MB ? eta -:--:--
                                              0.0/9.2 MB 1.3 MB/s eta 0:00:08
                                              0.0/9.2 MB 653.6 kB/s eta 0:00:15
                                              0.1/9.2 MB 930.9 kB/s eta 0:00:10
                                              0.2/9.2 MB 1.0 MB/s eta 0:00:09
     -                                        0.2/9.2 MB 958.6 kB/s eta 0:00:10
     -                                        0.3/9.2 MB 1.2 MB/s eta 0:00:08
     --                                       0.5/9.2 MB 1.4 MB/s eta 0:00:07
     --                                       0.7/9.2 MB 1.7 MB/s eta 0:00:05
     ---                                      0.8/9.2 MB 1.8 MB/s eta 0:00:05
     ----                                     1.0/9.2 MB 2.


[notice] A new release of pip is available: 23.1.2 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt


# Linear Regression

In [2]:
#load training data
train_data = pd.read_csv('Train.csv')

train_data = pd.get_dummies(train_data, columns=['name' ,'fuel','seller_type','transmission', 'owner']) #one-hot encoding for categorical features


X_train = train_data.drop('selling_price', axis=1)
y_train = train_data['selling_price']

In [3]:
#load testing data
test_data = pd.read_csv('Test.csv')


test_data = pd.get_dummies(test_data, columns=[ 'name', 'fuel', 'seller_type', 'transmission', 'owner']) #one-hot encoding for categorical features



In [4]:
# Align columns between training and testing data
X_train, X_test = X_train.align(test_data, join='outer', axis=1, fill_value=0)
y_test = test_data['selling_price']





## Multi-feature Equation


In [5]:

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and fit a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
r2 = r2_score(y_test, y_pred)

# Print the model coefficients and evaluation metrics
print("Intercept (b0):", model.intercept_)
print("Coefficients (b1, b2, ...):", model.coef_)
print("Mean Squared Error (MSE):", MSE)
print("MAE:", MAE)
print("RMSE:", RMSE)
print("R^2:", r2)


Intercept (b0): 638297.9355394155
Coefficients (b1, b2, ...): [ 4.43511613e+16 -1.18980328e+17  7.83378174e+16 ... -2.55805178e+18
 -1.03879434e+18  1.31968000e+05]
Mean Squared Error (MSE): 3.26545488565392e+36
MAE: 3.7272910256649485e+17
RMSE: 1.8070569680156516e+18
R^2: -9.758068648827145e+24


## Gradient Descent


In [6]:
# Gradient Descent Approach
def gradient_descent(X, y, learning_rate, n_iterations):
    m = y.size
    theta = np.zeros(X.shape[1])
    for _ in range(n_iterations):
        gradient = 2/m * X.T.dot(X.dot(theta) - y)
        theta = theta - learning_rate * gradient
    return theta

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

theta_train = gradient_descent( X_train_scaled, y_train, learning_rate=0.001, n_iterations=500)

# Create a Ridge regression object
ridge = Ridge(alpha=2.0)

# Fit the model to the training data
ridge.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = ridge.predict(X_test_scaled)

# Calculate error metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2grad = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)



# Print the model coefficients and evaluation metrics
print("Intercept (b0):", ridge.intercept_)
print(f"Coefficients (β):: {theta_train}")
print("Mean Squared Error (MSE):", mse)
print("MAE:", mae)
print("RMSE:", rmse)
print("R^2:", r2grad)

Intercept (b0): 638271.8077017716
Coefficients (β):: [  -7474.0432867    55886.78427344       0.         ...  127169.59244343
 -127169.59244343   92216.15817894]
Mean Squared Error (MSE): 129289850721.75423
MAE: 145719.30951818792
RMSE: 359568.9790871207
R^2: 0.6136467098405962


# Polynomial Regression

In [9]:
## Optimize data to avoid memory error
# Select top k features
selector = SelectKBest(f_regression, k=10)
X_train_scaled_new = selector.fit_transform(X_train_scaled, y_train)
X_test_scaled_new = selector.transform(X_test_scaled)

In [10]:

# Initialize lists to store degrees and corresponding RMSE values
degrees = []*10
rmse_values = []*10

# Create PolynomialFeatures object with current degree
poly_features = PolynomialFeatures()


# Loop over degrees from 1 to 10
for degree in range(1, 11):
    poly_features.degree = degree
    
    # Transform the features to polynomial features
    X_train_poly = poly_features.fit_transform(X_train_scaled_new)
    X_test_poly = poly_features.transform(X_test_scaled_new)
    
    # Create a Linear Regression object
    poly_reg = LinearRegression()
    
    # Fit the model to the training data
    poly_reg.fit(X_train_poly, y_train)
    
    # Make predictions on the test set
    y_pred = poly_reg.predict(X_test_poly)
    
    # Calculate RMSE and append to list
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    degrees.append(degree)
    rmse_values.append(rmse)

# Plot degree vs RMSE
plt.figure(figsize=(10, 6))
plt.plot(degrees, rmse_values, marker='o')
plt.xlabel('Degree of Polynomial')
plt.ylabel('Root Mean Squared Error (RMSE)')
plt.title('Polynomial Regression: Degree vs RMSE')
plt.grid(True)
plt.show()