In [3]:

# Import necessary libraries
import pandas as pd #handles data in table format
from sklearn.model_selection import train_test_split #splits data into training and testing sets
from sklearn.linear_model import LinearRegression #a simple mathematical model that finds patterns in data.
from sklearn.metrics import r2_score #Measures how well the model predicts values
from sklearn.preprocessing import LabelEncoder #convert text data into numbers

# Step 1: Load the dataset
# The path is relative to the current working directory; replace it with your
# own filename if running locally.
data = pd.read_csv('train (1).csv')

# Step 2: Data cleaning - handle missing values

# Drop columns in which more than MISSING_THRESHOLD_PERCENT of rows are missing.
MISSING_THRESHOLD_PERCENT = 30
missing_percent = (data.isnull().sum() / len(data)) * 100  # % missing per column
high_missing_cols = missing_percent[missing_percent > MISSING_THRESHOLD_PERCENT].index
data = data.drop(columns=high_missing_cols)

# Fill numeric missing values with the column median.
# 'number' selects every numeric dtype, not only int64/float64, so columns of
# other numeric widths are not left with NaNs that would break model fitting.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Fill categorical missing values with the column mode (most frequent value).
categorical_cols = data.select_dtypes(include=['object']).columns
category_modes = data[categorical_cols].mode()
# mode() is empty when there are no categorical columns (or no rows); indexing
# its first row would then raise IndexError, so skip the fill in that case.
if not category_modes.empty:
    # Row 0 of mode() holds one most-frequent value per column.
    data[categorical_cols] = data[categorical_cols].fillna(category_modes.iloc[0])

# Step 3: Encode categorical features using LabelEncoder
# Each text column is replaced by integer codes. A separate fitted encoder is
# kept per column so the text <-> code mapping can be inspected, inverted, or
# reapplied later; refitting one shared encoder would keep only the mapping of
# the last column processed.
label_encoders = {}  # column name -> LabelEncoder fitted on that column
le = LabelEncoder()  # keeps the name `le` bound even when there is nothing to encode
for col in categorical_cols:
    # categorical_cols was derived from `data`, so every col is present.
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])  # convert text to integer codes
    label_encoders[col] = le

# Step 4: Separate the target from the predictors
y = data['SalePrice']  # value the model learns to predict
# Everything except the target and 'Id' is used as a feature
# ('Id' is presumably just a row identifier - confirm against the dataset).
X = data.drop(['SalePrice', 'Id'], axis=1)

# Step 5: Hold out 20% of the rows for testing; the fixed random_state makes
# the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Step 6: Fit a linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Step 7: Predict on the held-out rows and score the predictions
y_pred = model.predict(X_test)  # predicted SalePrice for the test set
r2 = r2_score(y_true=y_test, y_pred=y_pred)  # R^2 of predictions vs. actual values

print("R^2 Score of the Regression Model:", r2)


R^2 Score of the Regression Model: 0.8454492563418252
