# In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Step 1: Read the raw dataset from disk
data = pd.read_csv('train (1).csv')  # Swap in your own filename when running locally

# Step 2: Data Cleaning - discard sparsely populated columns
# Per-column share of missing entries, expressed as a percentage (0-100)
missing_percent = data.isna().sum().div(len(data)).mul(100)
# Any column with more than 30% of its values missing is removed entirely
high_missing_cols = missing_percent.loc[missing_percent > 30].index
data = data.drop(high_missing_cols, axis=1)

# Fill the remaining gaps so that no NaN reaches the encoder or the model.
# NOTE(review): the medians and modes below are computed over the whole
# dataset, i.e. before the train/test split performed later in this script,
# so rows that end up in the test set influence the fill values. Consider
# splitting first and fitting the fill values on the training rows only.

# Fill numeric missing values with the per-column median
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Fill categorical missing values with the per-column mode
categorical_cols = data.select_dtypes(include=['object']).columns
# mode() returns an empty frame when there are no object columns (or no rows);
# indexing .iloc[0] on it would raise IndexError, so only fill when a mode exists.
categorical_modes = data[categorical_cols].mode()
if not categorical_modes.empty:
    # Row 0 holds the first mode of every column (ties are returned as extra rows).
    data[categorical_cols] = data[categorical_cols].fillna(categorical_modes.iloc[0])

# Step 3: Encode categorical features using LabelEncoder
# Every column gets its own fitted encoder, stored in `label_encoders` keyed by
# column name, so the string <-> integer mapping stays available afterwards
# (e.g. `label_encoders[col].classes_` or `.inverse_transform(...)`). A single
# shared encoder is refit on each column and only remembers the last mapping.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there are no categorical columns
for col in categorical_cols:
    # categorical_cols was derived from data's own columns, so every entry is
    # guaranteed to be present; no membership check is needed here.
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Step 4: Separate the target column from the predictor columns
y = data['SalePrice']
X = data.drop(['SalePrice', 'Id'], axis=1)  # 'Id' is left out of the features

# Step 5: Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Fit the Linear Regression model on the training portion
model = LinearRegression().fit(X_train, y_train)

# Step 7: Predict on the held-out rows and score the predictions with R^2
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("R^2 Score of the Regression Model:", r2)


# Output: R^2 Score of the Regression Model: 0.8454492563418252
