In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('./datasets/housing/nyc-rolling-sales.csv')
df.dtypes

Unnamed: 0                         int64
BOROUGH                            int64
NEIGHBORHOOD                      object
BUILDING CLASS CATEGORY           object
TAX CLASS AT PRESENT              object
BLOCK                              int64
LOT                                int64
EASE-MENT                         object
BUILDING CLASS AT PRESENT         object
ADDRESS                           object
APARTMENT NUMBER                  object
ZIP CODE                           int64
RESIDENTIAL UNITS                  int64
COMMERCIAL UNITS                   int64
TOTAL UNITS                        int64
LAND SQUARE FEET                  object
GROSS SQUARE FEET                 object
YEAR BUILT                         int64
TAX CLASS AT TIME OF SALE          int64
BUILDING CLASS AT TIME OF SALE    object
SALE PRICE                        object
SALE DATE                         object
dtype: object

In [2]:
def convert(columns, convertTo, data):
    if convertTo == 'string':
        for col in columns:
            data[col] = data[col].astype(str).str.strip()
    elif convertTo == 'int64':
        for col in columns:
            data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0).astype(np.int64)
    return data

# Columns to convert to int64
int_columns = ['SALE PRICE', 'LAND SQUARE FEET', 'GROSS SQUARE FEET']

# Columns to convert to string
string_columns = ['NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'TAX CLASS AT PRESENT']

# Apply the convert function to the DataFrame
df = convert(int_columns, 'int64', df)
df = convert(string_columns, 'string', df)

In [3]:
df.dtypes

Unnamed: 0                         int64
BOROUGH                            int64
NEIGHBORHOOD                      object
BUILDING CLASS CATEGORY           object
TAX CLASS AT PRESENT              object
BLOCK                              int64
LOT                                int64
EASE-MENT                         object
BUILDING CLASS AT PRESENT         object
ADDRESS                           object
APARTMENT NUMBER                  object
ZIP CODE                           int64
RESIDENTIAL UNITS                  int64
COMMERCIAL UNITS                   int64
TOTAL UNITS                        int64
LAND SQUARE FEET                   int64
GROSS SQUARE FEET                  int64
YEAR BUILT                         int64
TAX CLASS AT TIME OF SALE          int64
BUILDING CLASS AT TIME OF SALE    object
SALE PRICE                         int64
SALE DATE                         object
dtype: object

In [4]:
df.drop(['SALE DATE', 'APARTMENT NUMBER', 'ADDRESS'], axis=1)

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,10009,5,0,5,1633,6440,1900,2,C2,6625000
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,10009,28,3,31,4616,18690,1900,2,C7,0
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,10009,16,1,17,2212,7803,1900,2,C7,0
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,10009,10,0,10,2272,6794,1913,2,C4,3936272
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,10009,6,0,6,2369,4615,1900,2,C2,8000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84543,8409,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,34,,B9,10309,2,0,2,2400,2575,1998,1,B9,450000
84544,8410,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,78,,B9,10309,2,0,2,2498,2377,1998,1,B9,550000
84545,8411,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7351,60,,B2,10309,2,0,2,4000,1496,1925,1,B2,460000
84546,8412,5,WOODROW,22 STORE BUILDINGS,4,7100,28,,K6,10309,0,7,7,208033,64117,2001,4,K6,11693337


In [10]:
df = df[:10000]
len(df)

1000

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Assuming df contains your dataset
X = df.drop('SALE PRICE', axis=1)
y = df['SALE PRICE']

cat_attribs = ['NEIGHBORHOOD', 'TAX CLASS AT PRESENT', 'BUILDING CLASS AT PRESENT', 'BUILDING CLASS AT TIME OF SALE']
cat_preprocessor = ColumnTransformer([('one_hot', OneHotEncoder(), cat_attribs)])

data_prepared = cat_preprocessor.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_prepared, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [12]:
housing_predictions = model.predict(X_test)
r2_score = model.score(X_test, y_test)
r2_score

0.25