Importing the  Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Importing and preparing the Dataset

In [2]:
# Load the training and testing data from your Colab environment
try:
    train_df = pd.read_csv('task1_train.csv')
    test_df = pd.read_csv('Task1_test.csv')

    # --- Feature Engineering (Applied to both datasets) ---
    # Create a 'TotalBath' feature by combining all bathroom columns
    train_df['TotalBath'] = train_df['FullBath'] + 0.5 * train_df['HalfBath'] + train_df['BsmtFullBath'] + 0.5 * train_df['BsmtHalfBath']
    test_df['TotalBath'] = test_df['FullBath'] + 0.5 * test_df['HalfBath'] + test_df['BsmtFullBath'] + 0.5 * test_df['BsmtHalfBath']

    # Define the features to be used
    features = ['GrLivArea', 'TotalBath', 'BedroomAbvGr']

    # --- Handle Missing Values ---
    # Fill any missing values in the test set using the median from the training set
    for col in features:
        median_val = train_df[col].median()
        train_df[col].fillna(median_val, inplace=True)
        test_df[col].fillna(median_val, inplace=True)

    print("Datasets loaded and prepared successfully!")

except FileNotFoundError:
    print("ERROR: Please make sure 'task1_train.csv' and 'Task1_test.csv' are uploaded to your Colab environment.")

Datasets loaded and prepared successfully!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(median_val, inplace=True)


Splitting Dataset to Training and Validation

In [3]:
# Define the features (X) and the target (y) from the main training dataframe
X = train_df[features]
y = train_df['SalePrice']

# Split the data: 80% for training, 20% for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Dataset successfully split into training and validation sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

Dataset successfully split into training and validation sets.
Training set shape: (1168, 3)
Validation set shape: (292, 3)


Training the Linear Regression Model

In [4]:
# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the new, smaller training set
model.fit(X_train, y_train)

print("Linear Regression model trained successfully on the training set!")

Linear Regression model trained successfully on the training set!


 Predicting the Test Set Results

In [5]:
# Prepare the final test data with the same features
X_test_final = test_df[features]

# Predict the house prices for the final test set
final_predictions = model.predict(X_test_final)

# Create a new DataFrame for the submission
prediction_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': final_predictions})

# Save the prediction file with the name 'prediction test.csv'
prediction_df.to_csv('prediction test.csv', index=False)

print("Predictions for the test set have been generated.")
print("The prediction file 'prediction test.csv' was created successfully!")

Predictions for the test set have been generated.
The prediction file 'prediction test.csv' was created successfully!
