<a href="https://colab.research.google.com/github/TheAECode/BirthDayInvite/blob/main/PRODIGY_ML_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pandas numpy matplotlib scikit-learn




In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Read the training CSV into a DataFrame.
train_data = pd.read_csv('/content/train.csv')

# Impute and engineer columns in a single chained step:
#   - LotFrontage: missing entries are replaced by the column median.
#   - TotalHouseArea: sum of the 1stFlrSF, 2ndFlrSF and TotalBsmtSF columns.
#   - HouseAge: YrSold minus YearBuilt.
train_data = train_data.assign(
    LotFrontage=lambda df: df['LotFrontage'].fillna(df['LotFrontage'].median()),
    TotalHouseArea=lambda df: df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF'],
    HouseAge=lambda df: df['YrSold'] - df['YearBuilt'],
)

# Predictor columns (raw + engineered) and the regression target.
feature_columns = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalHouseArea',
    'FullBath',
    'TotRmsAbvGrd',
    'HouseAge',
]
features = train_data[feature_columns]
target = train_data['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def _holdout_rmse(model, X_fit, X_eval):
    """Fit `model` on the training split and score it on the held-out split.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`; it is fitted in place.
    X_fit : feature matrix used for fitting (paired with `y_train`).
    X_eval : feature matrix used for prediction (scored against `y_test`).

    Returns
    -------
    (predictions, rmse) : the predictions for `X_eval` and the square root of
        their mean squared error against `y_test`.
    """
    model.fit(X_fit, y_train)
    predictions = model.predict(X_eval)
    return predictions, np.sqrt(mean_squared_error(y_test, predictions))


# Ordinary least squares on the standardized features.
linear_model = LinearRegression()
linear_predictions, linear_rmse = _holdout_rmse(linear_model, X_train_scaled, X_test_scaled)
print(f'Linear Regression RMSE: {linear_rmse}')

# Ridge regression (alpha=1.0) on the standardized features.
ridge_model = Ridge(alpha=1.0)
ridge_predictions, ridge_rmse = _holdout_rmse(ridge_model, X_train_scaled, X_test_scaled)
print(f'Ridge Regression RMSE: {ridge_rmse}')

# Lasso regression (alpha=0.01) on the standardized features.
lasso_model = Lasso(alpha=0.01)
lasso_predictions, lasso_rmse = _holdout_rmse(lasso_model, X_train_scaled, X_test_scaled)
print(f'Lasso Regression RMSE: {lasso_rmse}')

# Random forest with 100 trees and a fixed seed; trained on the unscaled
# features, unlike the linear models above.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_predictions, rf_rmse = _holdout_rmse(rf_model, X_train, X_test)
print(f'Random Forest RMSE: {rf_rmse}')

# Cross-Validation for Linear Regression
# The scaler is placed inside a pipeline so that, for every fold, it is fitted
# on that fold's training rows only. Scaling the full feature matrix up front
# would leak validation-fold statistics into preprocessing, and re-fitting the
# shared `scaler` here would silently replace its train-only fit.
cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())

# The scorer returns negated RMSE (higher is better), so flip the sign to get
# one positive RMSE per fold.
cv_rmse = -cross_val_score(
    cv_pipeline,
    features,
    target,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
print(f'Cross-validated Linear Regression RMSE: {np.mean(cv_rmse)}')


Linear Regression RMSE: 39575.26836527863
Ridge Regression RMSE: 39580.766394110855
Lasso Regression RMSE: 39575.269865437884
Random Forest RMSE: 29940.26283968774
Cross-validated Linear Regression RMSE: 38750.19059516913
