In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression


In [2]:
df = pd.read_csv("C:/Users/pc/Documents/workspace/house_sales.csv")
missing_city = df['city'].isnull().sum()
print(missing_city)
print(df['city'].unique())

0
['Silvertown' 'Riverford' 'Teasdale' 'Poppleton' '--']


In [3]:
import pandas as pd
df = pd.read_csv("C:/Users/pc/Documents/workspace/house_sales.csv")
# Create a cleaned version of the dataframe
clean_data = df.copy()

# Replace any missing values in the 'city' column with "Unknown"
clean_data['city'] = clean_data['city'].fillna('Unknown')
print(clean_data.head())

   house_id        city  sale_price   sale_date  months_listed  bedrooms  \
0   1217792  Silvertown       55943  2021-09-12            5.4         2   
1   1900913  Silvertown      384677  2021-01-17            6.3         5   
2   1174927   Riverford      281707  2021-11-10            6.9         6   
3   1773666  Silvertown      373251  2020-04-13            6.1         6   
4   1258487  Silvertown      328885  2020-09-24            8.7         5   

      house_type         area  
0  Semi-detached  107.8 sq.m.  
1       Detached  498.8 sq.m.  
2       Detached  542.5 sq.m.  
3           Det.  528.4 sq.m.  
4       Detached  477.1 sq.m.  


In [4]:
# Remove rows with missing 'sale_price' values
clean_data = df.dropna(subset=['sale_price'])
# Ensure that all sale prices are non-negative
clean_data = clean_data[clean_data['sale_price'] >= 0]
# Display the cleaned data
print(clean_data.head())

   house_id        city  sale_price   sale_date  months_listed  bedrooms  \
0   1217792  Silvertown       55943  2021-09-12            5.4         2   
1   1900913  Silvertown      384677  2021-01-17            6.3         5   
2   1174927   Riverford      281707  2021-11-10            6.9         6   
3   1773666  Silvertown      373251  2020-04-13            6.1         6   
4   1258487  Silvertown      328885  2020-09-24            8.7         5   

      house_type         area  
0  Semi-detached  107.8 sq.m.  
1       Detached  498.8 sq.m.  
2       Detached  542.5 sq.m.  
3           Det.  528.4 sq.m.  
4       Detached  477.1 sq.m.  


In [5]:
# Replace missing values in the 'sale_date' column with '2023-01-01'
df['sale_date'] = df['sale_date'].fillna('2023-01-01')
# Display the cleaned data
print(df.head())

   house_id        city  sale_price   sale_date  months_listed  bedrooms  \
0   1217792  Silvertown       55943  2021-09-12            5.4         2   
1   1900913  Silvertown      384677  2021-01-17            6.3         5   
2   1174927   Riverford      281707  2021-11-10            6.9         6   
3   1773666  Silvertown      373251  2020-04-13            6.1         6   
4   1258487  Silvertown      328885  2020-09-24            8.7         5   

      house_type         area  
0  Semi-detached  107.8 sq.m.  
1       Detached  498.8 sq.m.  
2       Detached  542.5 sq.m.  
3           Det.  528.4 sq.m.  
4       Detached  477.1 sq.m.  


In [6]:
# Calculate the mean number of months listed, rounded to one decimal place
mean_months_listed = round(df['months_listed'].mean(), 1)
# Replace missing values in the 'months_listed' column with the calculated mean
df['months_listed'] = df['months_listed'].fillna(mean_months_listed)


In [7]:
import pandas as pd
import math


# Calculate the mean number of bedrooms, rounded to the nearest integer
mean_bedrooms = round(df['bedrooms'].mean())

# Replace missing values in the 'bedrooms' column with the calculated mean
df['bedrooms'] = df['bedrooms'].fillna(mean_bedrooms)

# Ensure that all bedroom values are non-negative integers
df['bedrooms'] = df['bedrooms'].apply(lambda x: max(0, math.ceil(x)))



In [8]:
# Determine the most common house type
most_common_house_type = df['house_type'].mode()[0]

# Replace missing values in the 'house_type' column with the most common house type
df['house_type'] = df['house_type'].fillna(most_common_house_type)

In [9]:
# Convert the 'area' column to a numeric type (assuming it may have units like "sq.m.")
df['area'] = df['area'].str.replace(' sq.m.', '').astype(float)

# Calculate the mean area, rounded to one decimal place
mean_area = round(df['area'].mean(), 1)

# Replace missing values in the 'area' column with the calculated mean
df['area'] = df['area'].fillna(mean_area).round(1)


In [24]:
# Group by the number of bedrooms and calculate the average price and variance
price_by_rooms = df.groupby('bedrooms')['sale_price'].agg(
    avg_price=lambda x: round(x.mean(), 1),
    var_price=lambda x: round(x.var(), 1)
).reset_index()

# Display the resulting dataframe
print(price_by_rooms)

   bedrooms  avg_price     var_price
0         2    67076.4  5.652896e+08
1         3   154665.1  2.378289e+09
2         4   234704.6  1.725211e+09
3         5   301515.9  2.484328e+09
4         6   375741.3  3.924432e+09


In [34]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the training data
train_data = pd.read_csv("C:/Users/pc/Documents/workspace/train.csv")

# Load the validation data
validation_data = pd.read_csv("C:/Users/pc/Documents/workspace/validation.csv")
# Convert sale_date to ordinal (numeric) format
train_data['sale_date'] = pd.to_datetime(train_data['sale_date']).map(pd.Timestamp.toordinal)
validation_data['sale_date'] = pd.to_datetime(validation_data['sale_date']).map(pd.Timestamp.toordinal)

X_train = train_data.drop(columns=['sale_price', 'house_id'])
y_train = train_data['sale_price']
X_validation = validation_data.drop(columns=['house_id'])

# Identify categorical columns
categorical_cols = ['city', 'house_type']

# Preprocess the data (One-hot encoding for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough')

# Create a pipeline with the preprocessor and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Fit the model
model.fit(X_train, y_train)

# Make predictions on the validation data
predictions = model.predict(X_validation)

# Create the result dataframe
base_result = pd.DataFrame({
    'house_id': validation_data['house_id'],
    'price': predictions
})

# Ensure the price column is rounded to one decimal place
base_result['price'] = base_result['price'].round(1)

# Display the resulting dataframe
print(base_result.head())



   house_id     price
0   1331375  122032.1
1   1630115  303641.2
2   1645745  384158.9
3   1336775  124233.6
4   1888274  271904.8


In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the training and validation data
train_data = pd.read_csv("C:/Users/pc/Documents/workspace/train.csv")
validation_data = pd.read_csv("C:/Users/pc/Documents/workspace/validation.csv")

# Convert sale_date to ordinal (numeric) format
train_data['sale_date'] = pd.to_datetime(train_data['sale_date']).map(pd.Timestamp.toordinal)
validation_data['sale_date'] = pd.to_datetime(validation_data['sale_date']).map(pd.Timestamp.toordinal)

# Define the features and target
X_train = train_data.drop(columns=['sale_price', 'house_id'])
y_train = train_data['sale_price']
X_validation = validation_data.drop(columns=['house_id'])

# Identify categorical columns
categorical_cols = ['city', 'house_type']

# Preprocess the data (One-hot encoding for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)],
    remainder='passthrough')

# Create a pipeline with the preprocessor and RandomForestRegressor model
comparison_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=0))
])

# Fit the model
comparison_model.fit(X_train, y_train)

# Make predictions on the validation data
comparison_predictions = comparison_model.predict(X_validation)

# Create the result dataframe
compare_result = pd.DataFrame({
    'house_id': validation_data['house_id'],
    'price': comparison_predictions
})

# Ensure the price column is rounded to one decimal place
compare_result['price'] = compare_result['price'].round(1)

# Display the resulting dataframe
print(compare_result.head())


   house_id     price
0   1331375   80285.5
1   1630115  306655.9
2   1645745  404077.8
3   1336775  108655.8
4   1888274  270539.2
