# LOADING AND CLEANING DATASET 

In [15]:
import pandas as pd
import numpy as np

# -------------------------------
# 1. Load the dataset
# -------------------------------
input_path = 'collected.csv'
df = pd.read_csv(input_path)

# -------------------------------
# 2. Omit rows with many missing features until we have 1000 entries
# -------------------------------
df['missing_count'] = df.isnull().sum(axis=1)
df_sorted = df.sort_values('missing_count')
cleaned_df = df_sorted.head(1000).copy()
cleaned_df.drop(columns=['missing_count'], inplace=True)

# -------------------------------
# 3. Drop unwanted features (columns) such as 'title' and 'address'
# -------------------------------
cols_to_drop = ['title', 'address']
cleaned_df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# -------------------------------
# 4. Handle missing values without omitting data
# -------------------------------
for col in cleaned_df.columns:
    if cleaned_df[col].dtype == 'object':  
        # Categorical column: Fill missing values with the most frequent value (mode)
        cleaned_df[col].fillna(cleaned_df[col].mode()[0], inplace=True)
    else:
        # Numerical column: Fill missing values with the median
        cleaned_df[col].fillna(cleaned_df[col].median(), inplace=True)

# -------------------------------
# 5. Convert 'area' column to square feet without omitting the data
# -------------------------------
def convert_to_sqft(area):
    if isinstance(area, str):
        area = area.lower().strip()
        if 'sq.m' in area:
            num = float(area.replace('sq.m', '').strip())
            return num * 10.7639  # Convert to square feet
        elif 'aana' in area:
            num = float(area.replace('aana', '').strip())
            return num * 342.25
        elif 'ropani' in area:
            num = float(area.replace('ropani', '').strip())
            return num * 5476
        elif 'dhur' in area:
            num = float(area.replace('dhur', '').strip())
            return num * 182.25
        elif 'sq.ft' in area:
            return float(area.replace('sq.ft', '').strip())
    return area  # Return original value if conversion fails

if 'area' in cleaned_df.columns:
    cleaned_df['area_sqft'] = cleaned_df['area'].apply(convert_to_sqft)
    cleaned_df.drop(columns=['area'], inplace=True)

# -------------------------------
# 6. One-hot encode the 'city' column (kathmandu, lalitpur, bhaktapur)
# -------------------------------
if 'city' in cleaned_df.columns:
    cleaned_df['city'] = cleaned_df['city'].astype(str).str.lower()
    allowed_cities = ['kathmandu', 'lalitpur', 'bhaktapur']
    cleaned_df['city'] = cleaned_df['city'].apply(lambda x: x if x in allowed_cities else 'other')
    
    # One-hot encode city values
    cleaned_df['kathmandu'] = (cleaned_df['city'] == 'kathmandu').astype(int)
    cleaned_df['lalitpur'] = (cleaned_df['city'] == 'lalitpur').astype(int)
    cleaned_df['bhaktapur'] = (cleaned_df['city'] == 'bhaktapur').astype(int)
    cleaned_df.drop(columns=['city'], inplace=True)  # Drop the original city column

# -------------------------------
# 7. Binary encoding of amenities (each with a separate column)
# -------------------------------
amenity_cols = ['parking', 'water supply', 'frontyard', 'backyard', 'lawn']
for col in amenity_cols:
    if col in cleaned_df.columns:
        cleaned_df[col] = cleaned_df[col].apply(lambda x: 1 if str(x).strip().lower() in ['yes', 'true', '1'] else 0)

# -------------------------------
# 8. Convert all columns to numeric values (without omitting the data)
# -------------------------------
for col in cleaned_df.columns:
    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='ignore')

# -------------------------------
# 9. Save the cleaned dataset to a new CSV file
# -------------------------------
output_path= 'cleaned.csv'
cleaned_df.to_csv(output_path,index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_df[col].fillna(cleaned_df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_df[col].fillna(cleaned_df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

# Linear Regression Model

In [30]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
file_path = "cleaned.csv"
df = pd.read_csv(file_path)

# Data overview
print(df.info(), "\n")
print(df.head(), "\n")

# Handle missing values by replacing them with the mean of the respective column
df = df.dropna()

# Drop all non-numeric features
#df = df.select_dtypes(include=[np.number])

# Select features and target
features = ['Bedroom','Bathroom','Floors','Parking','Year','Views']
target = 'Price'

if not all(col in df.columns for col in features + [target]):
    raise ValueError("Missing required columns in the dataset!")

X = df[features].values
y = df[target].values.reshape(-1, 1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add bias term
X_train_scaled = np.c_[np.ones(X_train_scaled.shape[0]), X_train_scaled]
X_test_scaled = np.c_[np.ones(X_test_scaled.shape[0]), X_test_scaled]

# Initialize parameters
theta = np.zeros((X_train_scaled.shape[1], 1))

# Gradient Descent
def gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    cost_history = []
    
    for i in range(iterations):
        predictions = X.dot(theta)
        gradient = (1/m) * X.T.dot(predictions - y)
        theta -= alpha * gradient
        cost = (1/(2*m)) * np.sum((predictions - y)**2)
        cost_history.append(cost)
        
        if i % 100 == 0:
            print(f"Iteration {i}: Cost {cost}")

    return theta, cost_history

# Train model
alpha, iterations = 0.01, 1000
theta, cost_history = gradient_descent(X_train_scaled, y_train, theta, alpha, iterations)

# Prediction function
def predict(X, theta):
    return X.dot(theta)

# Make predictions
y_pred = predict(X_test_scaled, theta)

# Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R² Score: {r2}")

# Plot cost function convergence
plt.figure(figsize=(8, 5))
plt.plot(range(len(cost_history)), cost_history, color='blue')
plt.xlabel("Iterations")
plt.ylabel("Cost")
plt.title("Gradient Descent Convergence")
plt.show()

# Visualization: Actual vs Predicted Prices
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test.flatten(), y=y_pred.flatten(), alpha=0.7)
plt.xlabel("Actual Prices (NPR)")
plt.ylabel("Predicted Prices (NPR)")
plt.title("Actual vs Predicted House Prices")
plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       1000 non-null   object 
 1   Address     1000 non-null   object 
 2   City        1000 non-null   object 
 3   Price       1000 non-null   int64  
 4   Bedroom     1000 non-null   int64  
 5   Bathroom    1000 non-null   int64  
 6   Floors      1000 non-null   float64
 7   Parking     1000 non-null   int64  
 8   Face        1000 non-null   object 
 9   Year        1000 non-null   float64
 10  Views       1000 non-null   object 
 11  Area        1000 non-null   object 
 12  Road        1000 non-null   object 
 13  Road Width  1000 non-null   object 
 14  Road Type   1000 non-null   object 
 15  Build Area  1000 non-null   object 
 16  Posted      1000 non-null   object 
 17  Amenities   1000 non-null   object 
dtypes: float64(2), int64(4), object(12)
memory usage: 140.8+ KB
None 

  

ValueError: could not convert string to float: '2.8K'