In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file
# found, one per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
import pandas as pd

# Read the training split of the competition data into a DataFrame
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
train_data = pd.read_csv(train_csv_path)

# Preview the first five rows
train_data.head(5)

In [None]:
# Read the test split of the competition data into a DataFrame
test_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test_data = pd.read_csv(test_csv_path)

# Preview the first five rows
test_data.head(5)

In [None]:
# Report the dimensions (rows, columns) of both frames
print('Training data shape:', train_data.shape)
print('Test data shape:', test_data.shape)

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print('\nTraining data info:')
train_data.info()
print('\nTest data info:')
test_data.info()

In [None]:
# Impute missing values. The fill values are learned from the training data
# only and then applied to both frames, so train and test are transformed
# identically and no test-set statistics enter the preprocessing.
numeric_columns = train_data.select_dtypes(include='number').columns
categorical_columns = [c for c in train_data.columns if c not in numeric_columns]

# Numerical columns -> training median (a column that is entirely missing has
# no median and is therefore left untouched)
fill_values = train_data[numeric_columns].median().dropna().to_dict()

# Categorical columns -> training mode (most frequent value)
for column in categorical_columns:
    modes = train_data[column].mode()
    if not modes.empty:  # mode() is empty when the column is entirely missing
        fill_values[column] = modes.iloc[0]

# Assign the filled frames back instead of calling fillna(inplace=True) on a
# column selection: that chained form is deprecated and does not modify the
# parent DataFrame under pandas Copy-on-Write.
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(
    {column: value for column, value in fill_values.items() if column in test_data.columns}
)

# Check if there are any missing values left in the training data
print('Missing values in training data:', train_data.isnull().sum().sum())

# Check if there are any missing values left in the test data
print('Missing values in test data:', test_data.isnull().sum().sum())

In [None]:
# Expand the categorical columns of each frame into indicator (dummy) columns,
# then keep only the columns that the two encoded frames have in common.
train_data, test_data = pd.get_dummies(train_data).align(
    pd.get_dummies(test_data), join='inner', axis='columns'
)

# Report the shapes after encoding and alignment
print(f'Training data shape: {train_data.shape}')
print(f'Test data shape: {test_data.shape}')

In [None]:
 # Load the data again
train_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Learn the imputation values from the training data only, then apply the same
# values to both frames so train and test are transformed identically.
numeric_columns = train_data.select_dtypes(include='number').columns
categorical_columns = [c for c in train_data.columns if c not in numeric_columns]

# Numerical columns -> training median (entirely-missing columns are skipped)
fill_values = train_data[numeric_columns].median().dropna().to_dict()

# Categorical columns -> training mode (most frequent value)
for column in categorical_columns:
    modes = train_data[column].mode()
    if not modes.empty:  # mode() is empty when the column is entirely missing
        fill_values[column] = modes.iloc[0]

# Assign the result back rather than using fillna(inplace=True) on a column
# selection: that chained form is deprecated and does not modify the parent
# DataFrame under pandas Copy-on-Write.
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(
    {column: value for column, value in fill_values.items() if column in test_data.columns}
)

# Separate the target variable from the features
y = train_data['SalePrice']
train_data = train_data.drop(columns='SalePrice')

# One-hot encode the categorical variables
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

# Align the training and test data, keep only columns present in both dataframes.
# NOTE(review): 'Id' stays in both frames because the submission cell reads
# test_data['Id']; as a consequence it is also passed to the models as a feature.
train_data, test_data = train_data.align(test_data, join='inner', axis=1)

# Check the shapes of the data
print('Training data shape:', train_data.shape)
print('Test data shape:', test_data.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    y,
    test_size=0.2,
    random_state=0,
)

# Build a 100-tree random forest and train it on the training portion
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Score the hold-out predictions with mean absolute error
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=predictions)
print('Mean Absolute Error:', mae)

In [None]:
# Same random forest as before, but with 500 trees instead of 100
forest_settings = {'n_estimators': 500, 'random_state': 0}
model = RandomForestRegressor(**forest_settings)

# Train on the training portion
model.fit(X_train, y_train)

# Evaluate on the validation portion with mean absolute error
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print('Mean Absolute Error:', mae)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with library-default hyper-parameters and a
# fixed seed, trained on the training portion
model = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Validation-set predictions and their mean absolute error
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=predictions)
print('Mean Absolute Error:', mae)

# Predictions for the test frame
test_predictions = model.predict(test_data)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Importance score the fitted model assigns to each input feature
importances = model.feature_importances_

# Rank the features from most to least important and keep the ten strongest
indices = np.argsort(importances)[::-1]
top_ten = indices[:10]
columns = X_train.columns.values[top_ten]
values = importances[top_ten]

# Draw the ten scores as a bar chart with vertical tick labels
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=columns, y=values, palette='Blues_r', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-feature importance scores of the current model
importances = model.feature_importances_

# Order the feature positions by descending importance, then slice off the top ten
top_n = 10
indices = np.argsort(importances)[::-1]
ranked_importances = importances[indices]
columns = X_train.columns.values[indices[:top_n]]
values = ranked_importances[:top_n]

# Bar chart of the top features
plt.figure(figsize=(10, 5))
sns.barplot(x=columns, y=values, palette='Blues_r')
plt.xticks(rotation=90)
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.title('Feature Importance')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try; every combination is evaluated (3 x 3 x 3 = 27)
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Base estimator that the search clones for each candidate
model = GradientBoostingRegressor(random_state=0)

# 3-fold cross-validated grid search scored by (negated) mean absolute error,
# run on all available cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)

# Run the search on the training portion
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print('Best parameters:', best_params)

# With the default refit=True, GridSearchCV has already retrained a clone of
# the base estimator (same random_state) with the best parameters on all of
# X_train. Reuse it instead of paying for an identical second fit.
model = grid_search.best_estimator_

# Make predictions on the validation set and evaluate the model
predictions = model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print('Mean Absolute Error:', mae)

In [None]:
# Predict with the current model on the test frame
test_predictions = model.predict(test_data)

# Two-column submission table: the test 'Id' values next to their predictions
submission = test_data[['Id']].assign(SalePrice=test_predictions)

# Save without the index column
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, index=False)