In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate target variable from training data
y_train = train_data['SalePrice']
X_train = train_data.drop(['SalePrice'], axis=1)

# Combine train and test data for preprocessing
combined_data = pd.concat([X_train, test_data], keys=['train', 'test'])

# Handle missing values
# We will fill numerical features with their median and categorical features with their mode
numerical_features = combined_data.select_dtypes(include=['int64', 'float64']).columns
categorical_features = combined_data.select_dtypes(include=['object']).columns

imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

combined_data[numerical_features] = imputer_num.fit_transform(combined_data[numerical_features])
combined_data[categorical_features] = imputer_cat.fit_transform(combined_data[categorical_features])

# Convert categorical features to numerical using Label Encoding
label_encoder = LabelEncoder()
for column in categorical_features:
    combined_data[column] = label_encoder.fit_transform(combined_data[column])

# Separate the combined data back into train and test sets
X_train = combined_data.loc['train']
X_test = combined_data.loc['test']

# Align the columns of X_train and X_test
X_train = X_train[X_test.columns]

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the prices for the test data
y_pred = model.predict(X_test)

# Save the predictions to a CSV file
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})
output.to_csv('test_with_predictions.csv', index=False)

print("Predictions saved to test_with_predictions.csv")


Predictions saved to test_with_predictions.csv


In [5]:
prediction = pd.read_csv("test_with_predictions.csv")
prediction


Unnamed: 0,Id,SalePrice
0,1461,111136.210224
1,1462,164194.088818
2,1463,166128.273385
3,1464,188235.651068
4,1465,187575.019185
...,...,...
1454,2915,70901.643402
1455,2916,57514.760217
1456,2917,141109.252695
1457,2918,118612.626573
