In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the housing dataset (expects housing.csv in the working directory).
df = pd.read_csv('housing.csv')

# Preview the first rows.
print("First 5 rows of the dataset:")
print(df.head())

# Column dtypes and non-null counts.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("\nDataset Info:")
df.info()

# Summary statistics of the numeric columns.
print("\nStatistical Summary:")
print(df.describe())

# Number of missing values per column.
print("\nMissing Values:")
print(df.isnull().sum())

# Column names as a plain list.
print("\nColumn Names:")
print(df.columns.tolist())

# (rows, columns)
print("\nDataset Shape:")
print(df.shape)

# --- Basic data operations ---

# Column subset.
selected_columns = df[['price', 'bedrooms', 'bathrooms']]
print("\nSelected Columns:")
print(selected_columns.head())

# Rows whose price is strictly above the mean price.
mean_price = df['price'].mean()
filtered_data = df[df['price'] > mean_price]
print("\nHouses with price above mean:")
print(filtered_data.head())

# Mean price for each bedroom count.
grouped_data = df.groupby('bedrooms')['price'].mean()
print("\nAverage price by number of bedrooms:")
print(grouped_data)


In [None]:
# Display df as this cell's output.
df

In [None]:
# Summary statistics of df as this cell's output.
df.describe()

In [None]:
# Display the column index of df.
df.columns

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Price against area, drawn as one scatter series per air-conditioning status.
plt.figure(figsize=(10, 6))

# (column value, marker colour, legend label) -- plotted in this order.
ac_series = [('yes', 'red', 'With AC'), ('no', 'blue', 'Without AC')]
for ac_value, marker_color, legend_label in ac_series:
    homes = df[df['airconditioning'] == ac_value]
    plt.scatter(homes['area'], homes['price'],
                color=marker_color, label=legend_label)

plt.xlabel('Area')
plt.ylabel('Price')
plt.title('House Prices by Area and Air Conditioning Status')
plt.legend()
plt.show()


In [None]:
# One price-vs-area scatter panel per categorical feature, with one series per
# category value.
categorical_vars = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

# Size the grid from the number of features rather than hard-coding 3x3 and
# removing exactly two axes: editing the list above can then neither delete a
# plotted panel nor leave an empty one behind. For 7 features this is still a
# 3x3 grid at figsize (15, 15).
n_cols = 3
n_rows = max(1, -(-len(categorical_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for idx, var in enumerate(categorical_vars):
    # One scatter series per distinct value of the feature.
    for category in df[var].unique():
        subset = df.loc[df[var] == category]
        axes[idx].scatter(subset['area'],
                          subset['price'],
                          label=f'{var}={category}')

    axes[idx].set_xlabel('Area')
    axes[idx].set_ylabel('Price')
    axes[idx].set_title(f'House Prices by Area and {var}')
    axes[idx].legend()

# Remove whichever trailing axes received no plot.
for unused_ax in axes[len(categorical_vars):]:
    unused_ax.remove()

plt.tight_layout()
plt.show()


In [None]:
# Violin plots of the numeric columns: first overall, then split by
# air-conditioning status.
numerical_vars = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Overall distribution, one panel per column on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
for panel, var in zip(axes, numerical_vars):
    sns.violinplot(data=df, y=var, ax=panel)
    panel.set(title=f'Distribution of {var}', ylabel=var)

plt.tight_layout()
plt.show()

# Same columns, one violin per air-conditioning value.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
for panel, var in zip(axes, numerical_vars):
    sns.violinplot(data=df, x='airconditioning', y=var, ax=panel)
    panel.set(title=f'{var} by Air Conditioning Status',
              xlabel='Air Conditioning',
              ylabel=var)

plt.tight_layout()
plt.show()


In [None]:
# Price against area for all rows; semi-transparent markers so overlapping
# points remain visible.
plt.figure(figsize=(10, 6))
plt.scatter(x=df['area'], y=df['price'], alpha=0.5)

scatter_ax = plt.gca()
scatter_ax.set(xlabel='Area', ylabel='Price', title='House Price vs Area')
scatter_ax.grid(True)
plt.show()


In [None]:
# Histograms (with KDE overlay) of the numeric columns, followed by the same
# view of price and area after a log and after a square-root transform.
numerical_vars = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Raw distributions on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()
for panel, var in zip(axes, numerical_vars):
    sns.histplot(data=df, x=var, ax=panel, kde=True)
    panel.set(title=f'Distribution of {var}', xlabel=var)

plt.tight_layout()
plt.show()

# One two-panel figure (price, area) per transform: log first, then sqrt.
for transform_label, transform in (('Log', np.log), ('Square Root', np.sqrt)):
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    for panel, column in zip(axes, ('price', 'area')):
        # e.g. 'Log(Price)' / 'Square Root(Area)'
        axis_label = f'{transform_label}({column.capitalize()})'
        sns.histplot(data=df, x=transform(df[column]), ax=panel, kde=True)
        panel.set(title=f'Distribution of {axis_label}', xlabel=axis_label)

    plt.tight_layout()
    plt.show()


In [None]:
# Summary statistics of df as this cell's output.
df.describe()

In [None]:
# Stand-alone histogram of price with a KDE overlay.
plt.figure(figsize=(10, 6))
price_ax = sns.histplot(data=df, x='price', kde=True)
price_ax.set(title='Distribution of House Prices', xlabel='Price', ylabel='Count')
plt.show()


In [None]:
# Histogram (with KDE overlay) for every column currently in df.
#
# The grid is sized from the column count. The notebook references 13 distinct
# columns, so a fixed 3x4 grid (12 axes) silently leaves the last column
# unplotted. Row height matches the original layout (10 inches per 3 rows).
n_cols = 4
n_rows = max(1, -(-len(df.columns) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10 * n_rows / 3), squeeze=False)
axes = axes.flatten()

# One panel per column; zip stops at the last column, so no bounds check is needed.
for ax, column in zip(axes, df.columns):
    sns.histplot(data=df, x=column, ax=ax, kde=True)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.tick_params(axis='x', rotation=45)

# Delete the trailing axes that received no plot.
for unused_ax in axes[len(df.columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Price distribution twice, side by side: histogram with KDE overlay on the
# left, the KDE curve alone on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, kde_ax = axes

sns.histplot(data=df, x='price', kde=True, ax=hist_ax)
hist_ax.set(title='Histogram with KDE of House Prices',
            xlabel='Price',
            ylabel='Count')

sns.kdeplot(data=df, x='price', ax=kde_ax)
kde_ax.set(title='KDE Plot of House Prices',
           xlabel='Price',
           ylabel='Density')

plt.tight_layout()
plt.show()


In [None]:
# Target variable: price, plus its log-scale version used later as the
# regression target. Note the transform is log(price + 1), although the plot
# labels below read 'Log(Price)'.
y = df['price']
y_log = np.log(y + 1)

# Left: original price distribution. Right: the log-transformed one.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
raw_ax, log_ax = axes

sns.histplot(data=df, x='price', kde=True, ax=raw_ax)
raw_ax.set(title='Original Price Distribution', xlabel='Price', ylabel='Count')

sns.histplot(x=y_log, kde=True, ax=log_ax)
log_ax.set(title='Log-transformed Price Distribution',
           xlabel='Log(Price)',
           ylabel='Count')

plt.tight_layout()
plt.show()

# Summary statistics of the transformed target.
print("\nLog-transformed Price Statistics:")
print(y_log.describe())


In [None]:
# Every feature against the log-transformed price (y_log) on a 4x3 grid.
features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking',
            'mainroad', 'guestroom', 'basement', 'hotwaterheating',
            'airconditioning', 'prefarea', 'furnishingstatus']

fig, axes = plt.subplots(4, 3, figsize=(15, 20))
axes = axes.flatten()

for panel, feature in zip(axes, features):
    # int64/float64 columns get a scatter plot; anything else a box plot.
    is_numeric = df[feature].dtype in ['int64', 'float64']
    draw = sns.scatterplot if is_numeric else sns.boxplot
    draw(data=df, x=feature, y=y_log, ax=panel)

    panel.set_title(f'{feature} vs Log(Price)')
    panel.set_ylabel('Log(Price)')

    # Category tick labels are rotated for readability.
    if not is_numeric:
        panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Correlation of each numeric feature with y_log, strongest positive first.
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
correlations = df[numerical_features].corrwith(y_log)
print("\nCorrelation with Log(Price):")
print(correlations.sort_values(ascending=False))


In [None]:
# Add a log-scale version of area as a new column of df.
df['area_log'] = np.log(df['area'])

# Every feature against the log-transformed price (y_log) on a 4x3 grid, this
# time with area_log in place of area.
features = ['area_log', 'bedrooms', 'bathrooms', 'stories', 'parking',
            'mainroad', 'guestroom', 'basement', 'hotwaterheating',
            'airconditioning', 'prefarea', 'furnishingstatus']

fig, axes = plt.subplots(4, 3, figsize=(15, 20))
axes = axes.flatten()

for panel, feature in zip(axes, features):
    # int64/float64 columns get a scatter plot; anything else a box plot.
    is_numeric = df[feature].dtype in ['int64', 'float64']
    if is_numeric:
        sns.scatterplot(data=df, x=feature, y=y_log, ax=panel)
    else:
        sns.boxplot(data=df, x=feature, y=y_log, ax=panel)
        # Category tick labels are rotated for readability.
        panel.tick_params(axis='x', rotation=45)

    panel.set_title(f'{feature} vs Log(Price)')
    panel.set_ylabel('Log(Price)')

plt.tight_layout()
plt.show()

# Correlation of each numeric feature (area on the log scale) with y_log,
# strongest positive first.
numerical_features = ['area_log', 'bedrooms', 'bathrooms', 'stories', 'parking']
correlations = df[numerical_features].corrwith(y_log)
print("\nCorrelation with Log(Price):")
print(correlations.sort_values(ascending=False))


In [None]:
# Display df as this cell's output.
df

In [None]:
# Area against bedrooms, bathrooms and guestroom, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# The two count columns are drawn as scatter plots: (column, title, y label).
count_panels = [
    ('bedrooms', 'Area vs Bedrooms', 'Number of Bedrooms'),
    ('bathrooms', 'Area vs Bathrooms', 'Number of Bathrooms'),
]
for panel, (column, panel_title, y_label) in zip(axes, count_panels):
    sns.scatterplot(data=df, x='area', y=column, ax=panel)
    panel.set(title=panel_title, xlabel='Area', ylabel=y_label)

# guestroom is categorical, so area is shown as a box plot per category.
guest_ax = axes[2]
sns.boxplot(data=df, x='guestroom', y='area', ax=guest_ax)
guest_ax.set(title='Area vs Guestroom', xlabel='Guestroom', ylabel='Area')

plt.tight_layout()
plt.show()


In [None]:
# Encode the yes/no columns as 1/0 and one-hot encode furnishingstatus.
#
# The cell is written so it can be re-run safely. Mapping an already-encoded
# 0/1 column through the yes/no dictionary would turn every value into NaN,
# and furnishingstatus no longer exists after the first run, so both steps are
# guarded.
yes_no_columns = ['guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}

for column in yes_no_columns:
    # Skip columns that are already numeric (i.e. encoded by a previous run).
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(yes_no_codes)

if 'furnishingstatus' in df.columns:
    # dtype=int yields 0/1 indicator columns directly instead of booleans.
    furnishing_dummies = pd.get_dummies(df['furnishingstatus'], prefix='furnishing', dtype=int)
    # Append the indicator columns and drop the original text column.
    df = pd.concat([df, furnishing_dummies], axis=1).drop('furnishingstatus', axis=1)

# Display the first few rows to verify the encoding
print("\nEncoded DataFrame:")
print(df.head())


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# One-hot encode parking, encode mainroad as 1/0 and make the furnishing
# indicator columns integer.
#
# The cell is written so it can be re-run safely: parking is dropped at the end
# of the first run, and mapping an already-encoded mainroad column through the
# yes/no dictionary would turn every value into NaN, so both steps are guarded.
if 'parking' in df.columns:
    # dtype=int yields 0/1 indicator columns directly instead of booleans.
    parking_dummies = pd.get_dummies(df['parking'], prefix='parking', dtype=int)
    df = pd.concat([df, parking_dummies], axis=1)

# Skip mainroad if it is already numeric (i.e. encoded by a previous run).
if not pd.api.types.is_numeric_dtype(df['mainroad']):
    df['mainroad'] = df['mainroad'].map({'yes': 1, 'no': 0})

# Convert furnishing columns to 0/1
furnishing_columns = ['furnishing_furnished', 'furnishing_semi-furnished', 'furnishing_unfurnished']
for column in furnishing_columns:
    df[column] = df[column].astype(int)

# Drop the original columns; errors='ignore' tolerates ones already removed.
df = df.drop(['parking', 'furnishingstatus'], axis=1, errors='ignore')

# Display the first few rows to verify the encoding
print("\nEncoded DataFrame:")
print(df.head())


In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Reduce df to the remaining feature columns:
#   * drop price and area,
#   * make the parking_1..parking_3 indicator columns integer,
#   * drop the parking_0 indicator.
# NOTE(review): parking_0 is presumably dropped as the reference level for the
# parking dummies -- confirm.
df = (
    df.drop(columns=['price', 'area'])
      .astype({f'parking_{spaces}': int for spaces in (1, 2, 3)})
      .drop(columns='parking_0')
)

# Show the first rows to check the result.
print("\nUpdated DataFrame:")
print(df.head())


In [None]:
# Preview the first rows of df.
df.head()

In [37]:
# Remove the furnishing_furnished indicator column.
# NOTE(review): presumably dropped as the reference level for the furnishing
# dummies -- confirm.
df = df.drop(columns='furnishing_furnished')

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. df supplies the features and y_log the
# target; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df, y_log, test_size=0.2, random_state=42
)

# Report the shape of each feature split.
for split_name, split_features in (('Training', X_train), ('Testing', X_test)):
    print(f"{split_name} set shape:", split_features.shape)


In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training split (fit() returns the
# estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Fitted parameters.
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)


In [None]:
# Coefficient table: one row per feature, ordered by coefficient magnitude
# (largest first). Abs_Coefficient is kept as a helper column for the sort.
coef_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
      .assign(Abs_Coefficient=lambda table: table['Coefficient'].abs())
      .sort_values('Abs_Coefficient', ascending=False)
)

# Show only the feature names and their signed coefficients.
print("\nFeature Coefficients:")
print(coef_df[['Feature', 'Coefficient']])


In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Error metrics on the test split (computed on the log-price scale, since
# y_test and y_pred come from the y_log target).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric to four decimals.
metric_report = [
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R-squared (R²)", r2),
]
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")

# Actual vs predicted values; the dashed red diagonal marks perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'r--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values')
plt.tight_layout()
plt.show()


In [None]:
# Metrics on the training split, for comparison with the test-set metrics.
y_train_predict = model.predict(X_train)
mse_on_train = mean_squared_error(y_train, y_train_predict)
r2_train = r2_score(y_train, y_train_predict)
r2_trian = r2_train  # original (misspelled) name kept in case other cells use it

# Only a cell's last expression is displayed, so a bare `mse_on_train` in the
# middle of the cell showed nothing; both values are printed explicitly.
print(f"Training Mean Squared Error (MSE): {mse_on_train:.4f}")
print(f"Training R-squared (R²): {r2_train:.4f}")