In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [16]:
df = pd.read_csv("kc_house_data.csv")



In [17]:
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above       float64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

In [18]:
df
df = df.drop(['id','date'], axis = 1)
# Drop rows with missing values
df.dropna(inplace=True)



#shuffling
df = df.sample(frac=1, random_state=42)  # frac=1 shuffles the entire dataset


#categoral values
df = pd.get_dummies(df, columns=['waterfront', 'view', 'condition'])
from sklearn.preprocessing import LabelEncoder


df['view_0'] = df['view_0'].astype(int)
df['view_1'] = df['view_1'].astype(int)
df['view_2'] = df['view_2'].astype(int)
df['view_3'] = df['view_3'].astype(int)
df['view_4'] = df['view_4'].astype(int)
df['condition_1'] = df['condition_1'].astype(int)
df['condition_2'] = df['condition_2'].astype(int)
df['condition_3'] = df['condition_3'].astype(int)
df['condition_4'] = df['condition_4'].astype(int)
df['condition_5'] = df['condition_5'].astype(int)

label_encoder = LabelEncoder()
df['grade'] = label_encoder.fit_transform(df['grade'])
df.dtypes




price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
grade              int64
sqft_above       float64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
waterfront_0        bool
waterfront_1        bool
view_0             int32
view_1             int32
view_2             int32
view_3             int32
view_4             int32
condition_1        int32
condition_2        int32
condition_3        int32
condition_4        int32
condition_5        int32
dtype: object

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop('price', axis=1))
y_scaled = scaler.fit_transform(np.array(df['price']).reshape(-1, 1))


In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# Define your feature matrix (X) and target variable (y)
X = df.drop('price', axis=1)
y = df['price']

# Create a linear regression model
model = LinearRegression()

# Create the RFECV selector
selector = RFECV(model, cv=5)

# Fit the selector to your data
selector.fit(X, y)

# Get the best combination of features
best_feature_combination = list(X.columns[selector.support_])
print( best_feature_combination)


X = df[best_feature_combination]
X_scaled = scaler.fit_transform(X)
y = df['price']


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=97107)

In [21]:
model = LinearRegression()

model.fit(X_train, y_train)


In [22]:

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2) Score: {r2}")



Mean Squared Error (MSE): 0.3217293086222768
R-squared (R2) Score: 0.6919210232380997


In [23]:
df.corr()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,grade,sqft_above,sqft_basement,yr_built,...,view_0,view_1,view_2,view_3,view_4,condition_1,condition_2,condition_3,condition_4,condition_5
price,1.0,0.308366,0.52515,0.702055,0.089661,0.256814,0.667529,0.605567,0.323842,0.054006,...,-0.359125,0.092608,0.14842,0.182882,0.307934,-0.020885,-0.051916,0.007111,-0.030696,0.057587
bedrooms,0.308366,1.0,0.515974,0.576763,0.03171,0.175418,0.356929,0.477616,0.303251,0.154248,...,-0.080111,0.022202,0.045063,0.050434,0.034533,-0.036246,-0.05195,0.004927,-0.008995,0.028123
bathrooms,0.52515,0.515974,1.0,0.754684,0.08773,0.500712,0.66501,0.685363,0.283737,0.505968,...,-0.177141,0.038051,0.087199,0.112296,0.107932,-0.045497,-0.077419,0.190459,-0.166042,-0.034307
sqft_living,0.702055,0.576763,0.754684,1.0,0.172841,0.354048,0.762807,0.876644,0.434925,0.318066,...,-0.270943,0.066522,0.135305,0.158903,0.169477,-0.03507,-0.065324,0.102338,-0.083721,-0.018123
sqft_lot,0.089661,0.03171,0.08773,0.172841,1.0,-0.005206,0.113646,0.183511,0.015301,0.053061,...,-0.067842,-0.008289,0.037274,0.073869,0.01917,0.006322,0.037615,-0.011465,0.013175,-0.014502
floors,0.256814,0.175418,0.500712,0.354048,-0.005206,1.0,0.458267,0.523899,-0.245634,0.489361,...,-0.017858,-0.022726,0.009742,0.020261,0.025089,-0.023775,-0.055958,0.318104,-0.257845,-0.12056
grade,0.667529,0.356929,0.66501,0.762807,0.113646,0.458267,1.0,0.756009,0.168383,0.44704,...,-0.237362,0.048948,0.122005,0.142391,0.148044,-0.057854,-0.087732,0.196546,-0.139385,-0.083553
sqft_above,0.605567,0.477616,0.685363,0.876644,0.183511,0.523899,0.756009,1.0,-0.051976,0.423915,...,-0.153324,0.021838,0.077859,0.091662,0.107624,-0.028998,-0.058926,0.194549,-0.142477,-0.088455
sqft_basement,0.323842,0.303251,0.283737,0.434925,0.015301,-0.245634,0.168383,-0.051976,1.0,-0.133195,...,-0.275417,0.097202,0.135122,0.15827,0.150344,-0.01852,-0.025304,-0.151693,0.09288,0.127931
yr_built,0.054006,0.154248,0.505968,0.318066,0.053061,0.489361,0.44704,0.423915,-0.133195,1.0,...,0.062848,-0.034068,-0.04464,-0.018889,-0.020243,-0.050109,-0.067291,0.391662,-0.257314,-0.244402


In [24]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# Define your feature matrix (X) and target variable (y)
X = df.drop('price', axis=1)
y = df['price']

# Create a linear regression model
model = LinearRegression()

# Create the RFECV selector
selector = RFECV(model, cv=5)

# Fit the selector to your data
selector.fit(X, y)

# Get the best combination of features
best_feature_combination = list(X.columns[selector.support_])
print( best_feature_combination)


['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'waterfront_0', 'waterfront_1', 'view_0', 'view_1', 'view_2', 'view_3', 'view_4', 'condition_1', 'condition_2', 'condition_3', 'condition_4', 'condition_5']


In [25]:
# from concurrent.futures import ThreadPoolExecutor
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
# from sklearn.preprocessing import StandardScaler

# # Define the features and target variable
# X = df[best_feature_combination]
# X_scaled = scaler.fit_transform(X)
# y = df['price']

# # Initialize variables to keep track of the best state and performance
# best_state = None
# best_r2 = -1  # Initialize with a negative value to ensure any R-squared score is better

# # Number of iterations or random states to try
# num_iterations = 10**5

# # Define a function to evaluate the state and write results to a file
# def evaluate_state_and_write_to_file(state):
#     X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=state)
#     model = LinearRegression()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     r2 = r2_score(y_test, y_pred)

#     # Write the result to the same file in append mode
#     with open(f"result.txt", "a") as file:
#         file.write(f"{state},{r2}\n")

#     return state, r2

# with ThreadPoolExecutor() as executor:
#     executor.map(evaluate_state_and_write_to_file, range(num_iterations))



# print(f"Best Random State: {best_state}")
# print(f"Best R-squared (R2) Score: {best_r2}")


In [None]:
# def find_best_iteration(filename):
#     best_state = -1
#     best_r2 = -1  # Initialize with a negative value to ensure any R-squared score is better

#     try:
#         with open(filename, "r") as file:
#             lines = file.readlines()
#             print(len(lines))

#             for i in range(len(lines)):
#                 if lines[i].strip() != "":
#                     r2 = float(lines[i].split(",")[1].strip())
                    
#                 else:
#                     continue
#                 if best_r2<r2:
#                     best_r2=r2
                    

#     except Exception:
#         print("error")
#     finally:
#         print(best_r2)
        
                
            
                
            
        

# find_best_iteration("result.txt")


97107
0.7402509763772145
