In [7]:
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score

# Read the raw table from disk
data = pd.read_csv('2019&2020.csv')

# Separate the regression target from the feature columns
y = data['price_per_sqft']
X = data.drop(columns='price_per_sqft')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# One-hot encode `location`; every other column is passed through untouched.
# Later cells reuse this fitted transformer via the module-level name.
column_trans = make_column_transformer(
    (OneHotEncoder(), ['location']),
    remainder='passthrough',
)

# Fit the encoder on the training rows only, then fit the linear model on the encoded matrix
lr = LinearRegression().fit(column_trans.fit_transform(X_train), y_train)

# Score the fitted model on the held-out rows
y_pred_lr = lr.predict(column_trans.transform(X_test))
r2 = r2_score(y_test, y_pred_lr)
print(f'R^2 score for Linear Regression: {r2}')

R^2 score for Linear Regression: -0.1653481210206882


In [22]:
import pandas as pd

# Build the query rows for 2021: one row per locality, all carrying the same year
new_data = pd.DataFrame({
    'location': [
        'Nipania',
        'Bhicholi Mardana',
        'Rau',
        'Bhawrasla',
        'Mahalakshmi Nagar',
        'Manglia',
        'Lasudia Mori',
        'Palakhedi',
        'Omex City',
        'Vijay Nagar',
    ],
    'year': [2021] * 10,
})

# Encode with the already-fitted transformer (transform only, no refit)
new_data_transformed = column_trans.transform(new_data)

# Attach the linear model's predictions as a new column
new_data['price_per_sqft'] = lr.predict(new_data_transformed)

# Show the resulting table
print(new_data)

            location  year  price_per_sqft
0            Nipania  2021    18716.327708
1   Bhicholi Mardana  2021     6735.894770
2                Rau  2021     5590.394779
3          Bhawrasla  2021     5026.795175
4  Mahalakshmi Nagar  2021    11806.894770
5            Manglia  2021     1059.193513
6       Lasudia Mori  2021     6918.661042
7          Palakhedi  2021     1193.193513
8          Omex City  2021     2331.894779
9        Vijay Nagar  2021    11920.661042


In [34]:
def predictions(model, transformer=None, locations=None, year=2021):
    """
    Print predicted `price_per_sqft` values for a set of locations in a given year.

    Parameters:
    model: Fitted estimator exposing `predict`; it receives the output of
        `transformer.transform`.
    transformer: Fitted transformer exposing `transform`. When None, the
        module-level `column_trans` is used, so that name must already be
        defined and fitted in the notebook (this is the original behaviour).
    locations (sequence of str): Location names to predict for. When None, the
        ten default localities below are used.
    year (int): Value placed in the `year` column of every row. Defaults to 2021.

    Returns:
    None. The table of locations, year and predictions is printed.
    """
    if locations is None:
        locations = [
            'Nipania', 'Bhicholi Mardana', 'Rau', 'Bhawrasla', 'Mahalakshmi Nagar',
            'Manglia', 'Lasudia Mori', 'Palakhedi', 'Omex City', 'Vijay Nagar',
        ]
    else:
        # Materialise once so any iterable works and len() below is valid
        locations = list(locations)

    if transformer is None:
        # Fall back to the notebook-global transformer for backward compatibility
        transformer = column_trans

    new_data = pd.DataFrame({
        'location': locations,
        'year': [year] * len(locations),
    })

    # Encode the query rows with the already-fitted transformer (no refit)
    new_data_transformed = transformer.transform(new_data)

    # Attach the model's predictions as a new column
    new_data['price_per_sqft'] = model.predict(new_data_transformed)

    # Print the predicted prices
    print(new_data)

In [38]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

def find_best_model(X, y):
    """
    Trains and evaluates several machine learning models for predicting the `price_per_sqft` based on the `location` and `year`.

    The data is split 80/20 (fixed seed), `location` is one-hot encoded with an
    encoder fitted on the training rows only, and each candidate model is scored
    by R^2 on the held-out rows. For every model the 2021 predictions table is
    printed via `predictions(model)`, followed by the model's R^2 score.

    Parameters:
    X (pandas.DataFrame): The input data with columns for `location` and `year`.
    y (pandas.Series): The target data with the `price_per_sqft`.

    Returns:
    (tuple): The best model and its R^2 score.
    """
    # Preprocess the location column using OneHotEncoder
    column_trans = make_column_transformer(
        (OneHotEncoder(), ['location']),
        remainder='passthrough'
    )

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Encode once up front: the encoded matrices are identical for every model,
    # so refitting the transformer inside the loop was redundant work.
    X_train_encoded = column_trans.fit_transform(X_train)
    X_test_encoded = column_trans.transform(X_test)

    # Candidate models; the random forest is seeded so repeated runs give the same score
    models = [
        ('Linear Regression', LinearRegression()),
        ('Lasso', Lasso()),
        ('Ridge', Ridge()),
        ('Random Forest', RandomForestRegressor(random_state=0)),
        ('Support Vector Regression', SVR())
    ]

    best_model = None
    # R^2 has no lower bound, so start from -inf rather than -1; otherwise a run
    # where every model scores below -1 would return (None, -1).
    best_score = float('-inf')

    for name, model in models:
        model.fit(X_train_encoded, y_train)
        y_pred = model.predict(X_test_encoded)
        score = r2_score(y_test, y_pred)
        # NOTE(review): predictions() encodes its rows with the module-level
        # `column_trans`, not the local one fitted above. The two only agree when
        # the global transformer was fitted on the same split — confirm before
        # calling this function with different data.
        predictions(model)
        print(f'R^2 score for {name}: {score}')

        if score > best_score:
            best_model = model
            best_score = score

    return best_model, best_score

In [39]:
# Reload the raw table and separate the target from the feature columns
data = pd.read_csv('2019&2020.csv')
y = data['price_per_sqft']
X = data.drop(columns='price_per_sqft')

# Compare the candidate models and report the one with the highest test R^2
best_model, best_score = find_best_model(X, y)
print(f'Best model: {best_model.__class__.__name__}, R^2 score: {best_score}')

            location  year  price_per_sqft
0            Nipania  2021    18716.327708
1   Bhicholi Mardana  2021     6735.894770
2                Rau  2021     5590.394779
3          Bhawrasla  2021     5026.795175
4  Mahalakshmi Nagar  2021    11806.894770
5            Manglia  2021     1059.193513
6       Lasudia Mori  2021     6918.661042
7          Palakhedi  2021     1193.193513
8          Omex City  2021     2331.894779
9        Vijay Nagar  2021    11920.661042
R^2 score for Linear Regression: -0.1653481210206882
            location  year  price_per_sqft
0            Nipania  2021    18713.015145
1   Bhicholi Mardana  2021     6767.976968
2                Rau  2021     5638.663489
3          Bhawrasla  2021     5062.362852
4  Mahalakshmi Nagar  2021    11807.007839
5            Manglia  2021     1144.806256
6       Lasudia Mori  2021     6940.339691
7          Palakhedi  2021     1278.890244
8          Omex City  2021     2380.116382
9        Vijay Nagar  2021    11917.540327
R