In [None]:
import pandas as pd
import itertools
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Suppress warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset from the working directory, then preview its first rows
# as the cell output.
csv_path = 'final_converted_out.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# One-hot encode every object-dtype column of df and swap the resulting
# indicator columns in for the originals, producing df_encoded.
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_columns)

# sparse_output=False makes the encoder return a dense array that can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Reuse df's index so the column-wise concat below lines up row-for-row even
# when df does not carry a default 0..n-1 index.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)
df_encoded = pd.concat([df, one_hot_df], axis=1)
df_encoded = df_encoded.drop(categorical_columns, axis=1)

# Preview the encoded frame built in this cell (df itself is left untouched
# and would only repeat the preview from the loading cell).
df_encoded.head()

In [None]:
# Separate the regression target from the predictor columns and report the
# shape of each piece.
target_column = 'average_dollar_price'
y_orig = df_encoded[target_column]
X_orig = df_encoded.drop(columns=[target_column])
print(X_orig.shape, y_orig.shape, sep='\n')

In [None]:
# Replace missing feature values (NaN) with the mean of their column.
#
# keep_empty_features=True keeps a column that is entirely NaN in the output
# (filled with 0) instead of dropping it, so the transformed array always has
# one column per entry of X_orig.columns and the DataFrame rebuild below
# cannot fail with a shape mismatch.
#
# NOTE(review): the imputer is fitted on every row of X_orig, i.e. before any
# train/test split, so rows that later end up in the test set contribute to
# the column means used to fill training rows.
imp = SimpleImputer(missing_values=np.nan, strategy='mean', keep_empty_features=True)
imp.fit(X_orig)

# Carry X_orig's index over so X stays label-aligned with y_orig.
X = pd.DataFrame(imp.transform(X_orig), columns=X_orig.columns, index=X_orig.index)

In [None]:
# Hold out 25% of the rows for testing.
#
# The split is made on the raw, un-imputed features (X_orig) and the mean
# imputer is then fitted on the training rows only. Splitting the frame X,
# whose missing values were filled using column means computed over *all*
# rows, would let test-set rows influence the training features (data
# leakage). Row membership of the split is unchanged: it depends only on the
# number of rows and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_orig, y_orig, random_state=104, test_size=0.25, shuffle=True
)

# keep_empty_features=True guarantees one output column per input column even
# if a feature happens to be entirely NaN within the training rows.
split_imputer = SimpleImputer(missing_values=np.nan, strategy='mean', keep_empty_features=True)
split_imputer.fit(X_train_raw)

# Apply the training-set means to both partitions, keeping labels and index.
X_train = pd.DataFrame(
    split_imputer.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_imputer.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Display training inputs
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Simple linear regression of the target on one predictor, display_area_cm2,
# followed by a scatter plot of the training data with the fitted line.

# Reshape the single column into an (n, 1) array for the estimator.
X_train_display_area = X_train["display_area_cm2"].to_numpy().reshape(-1, 1)
model = LinearRegression().fit(X_train_display_area, y_train)

# R^2 here is measured on the same training rows the model was fitted on.
r2 = r2_score(y_train, model.predict(X_train_display_area))
print(r2)

# Evaluate the fitted line on a grid spanning the observed range in 0.1 steps.
x_grid = np.arange(
    X_train_display_area.min(), X_train_display_area.max(), 0.1
).reshape(-1, 1)
y_grid = model.predict(x_grid)

# Draw the observations and the regression line on one set of axes.
fig, ax = plt.subplots()
ax.scatter(X_train_display_area, y_train)
ax.plot(x_grid, y_grid)

# Label the plot and show the R-squared score in the title.
ax.set_xlabel('display_area')
ax.set_ylabel('price')
ax.set_title(f'Linear Regression - display_area against price with R^2 = {round(r2,2)}')
ax.grid(True)
plt.show()

In [None]:
def findsubsets(S,k):
    """Return the set of all k-element combinations drawn from S.

    S is any iterable; each combination is a tuple as produced by
    itertools.combinations. The result is a set, so it has no defined order.
    """
    combos = itertools.combinations(S, k)
    return {combo for combo in combos}

In [None]:
# Exhaustive search over single-feature models: score every candidate column
# with 5-fold cross-validated R^2 on the training split and keep the best.

# Iterating the DataFrame X yields its column labels, so this is the set of
# 1-tuples of column names.
subsets3 = findsubsets(X, 1)

# np.Inf was removed in NumPy 2.0; np.inf is valid on every NumPy version.
best_score3 = -np.inf
best_features3 = []

# subsets3 is a set, so its iteration order is arbitrary and can differ
# between kernel sessions. Visiting candidates in sorted order makes the
# winner reproducible when two features tie on score.
for feats in sorted(subsets3):
    this_X = X_train[list(feats)]
    cvs = cross_val_score(linear_model.LinearRegression(),
                          this_X, y_train, cv=5, scoring='r2')
    this_score = cvs.mean()
    # Strict '>' keeps the first candidate seen among equal scores.
    if this_score > best_score3:
        best_score3 = this_score
        best_features3 = feats

# report
print('Best R2: ', best_score3)
print('Best Features: ', best_features3)