In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv(r'C:\Users\Sahan\Documents\Project_3\CSV\test.csv')

In [219]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33461 entries, 0 to 33460
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           33461 non-null  int64  
 1   Borough         33461 non-null  object 
 2   property_type   33461 non-null  object 
 3   Average Income  33461 non-null  int64  
 4   street_name     33461 non-null  object 
 5   Average Rent    33461 non-null  object 
 6   bedroom_number  33461 non-null  float64
 7   Mortgage        33461 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 2.0+ MB


In [220]:
df

Unnamed: 0,price,Borough,property_type,Average Income,street_name,Average Rent,bedroom_number,Mortgage
0,889000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,59829.70
1,699000,Bronx,SINGLE_FAMILY,66878,E 239th Street,"$1,401",4.0,47042.70
2,179000,Bronx,CONDO,66878,Webster Ave APT 2E,"$1,401",1.0,12046.70
3,599000,Bronx,SINGLE_FAMILY,66878,E 237th Street,"$1,401",3.0,40312.70
4,925000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,62252.50
...,...,...,...,...,...,...,...,...
33456,800000,Manhattan,CONDO,175743,W 122nd St #8I,"$2,024",1.0,53840.00
33457,330000,Manhattan,CONDO,175743,W 138th St #5I,"$2,024",1.0,22209.00
33458,418800,Manhattan,CONDO,175743,Park Ter E #6,"$2,024",1.0,28185.24
33459,410000,Manhattan,CONDO,175743,Park Ter E APT D76,"$2,024",1.0,27593.00


In [221]:
# Step 1: Calculate Monthly Income
df['Monthly_Income'] = df['Average Income'] / 12

# Step 2: Define the affordability threshold
affordability_threshold = 0.5  # 50% of income
df['can_afford'] = (df['Mortgage'] <= (df['Monthly_Income'] * affordability_threshold)).astype(int)

# Display the updated DataFrame to confirm changes
print(df[['Average Income', 'Mortgage', 'Monthly_Income', 'can_afford']].head())


   Average Income  Mortgage  Monthly_Income  can_afford
0           66878   59829.7     5573.166667           0
1           66878   47042.7     5573.166667           0
2           66878   12046.7     5573.166667           0
3           66878   40312.7     5573.166667           0
4           66878   62252.5     5573.166667           0


In [222]:
df

Unnamed: 0,price,Borough,property_type,Average Income,street_name,Average Rent,bedroom_number,Mortgage,Monthly_Income,can_afford
0,889000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,59829.70,5573.166667,0
1,699000,Bronx,SINGLE_FAMILY,66878,E 239th Street,"$1,401",4.0,47042.70,5573.166667,0
2,179000,Bronx,CONDO,66878,Webster Ave APT 2E,"$1,401",1.0,12046.70,5573.166667,0
3,599000,Bronx,SINGLE_FAMILY,66878,E 237th Street,"$1,401",3.0,40312.70,5573.166667,0
4,925000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,62252.50,5573.166667,0
...,...,...,...,...,...,...,...,...,...,...
33456,800000,Manhattan,CONDO,175743,W 122nd St #8I,"$2,024",1.0,53840.00,14645.250000,0
33457,330000,Manhattan,CONDO,175743,W 138th St #5I,"$2,024",1.0,22209.00,14645.250000,0
33458,418800,Manhattan,CONDO,175743,Park Ter E #6,"$2,024",1.0,28185.24,14645.250000,0
33459,410000,Manhattan,CONDO,175743,Park Ter E APT D76,"$2,024",1.0,27593.00,14645.250000,0


In [223]:
df

Unnamed: 0,price,Borough,property_type,Average Income,street_name,Average Rent,bedroom_number,Mortgage,Monthly_Income,can_afford
0,889000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,59829.70,5573.166667,0
1,699000,Bronx,SINGLE_FAMILY,66878,E 239th Street,"$1,401",4.0,47042.70,5573.166667,0
2,179000,Bronx,CONDO,66878,Webster Ave APT 2E,"$1,401",1.0,12046.70,5573.166667,0
3,599000,Bronx,SINGLE_FAMILY,66878,E 237th Street,"$1,401",3.0,40312.70,5573.166667,0
4,925000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,62252.50,5573.166667,0
...,...,...,...,...,...,...,...,...,...,...
33456,800000,Manhattan,CONDO,175743,W 122nd St #8I,"$2,024",1.0,53840.00,14645.250000,0
33457,330000,Manhattan,CONDO,175743,W 138th St #5I,"$2,024",1.0,22209.00,14645.250000,0
33458,418800,Manhattan,CONDO,175743,Park Ter E #6,"$2,024",1.0,28185.24,14645.250000,0
33459,410000,Manhattan,CONDO,175743,Park Ter E APT D76,"$2,024",1.0,27593.00,14645.250000,0


In [224]:
# Mortgage parameters
annual_interest_rate = 0.0673  # NYC mortgage rate
monthly_interest_rate = annual_interest_rate / 12
loan_term_years = 30
n_payments = loan_term_years * 12  # Total number of payments
taxes_and_fees = 700  # Additional monthly costs

# Step 2: Calculate Monthly Mortgage Payment
df['Monthly_Mortgage'] = (
    df['price'] * (monthly_interest_rate * (1 + monthly_interest_rate)**n_payments) / 
    ((1 + monthly_interest_rate)**n_payments - 1)
) + taxes_and_fees

# Display the updated DataFrame to confirm changes
print(df[['price', 'Monthly_Mortgage']].head())


    price  Monthly_Mortgage
0  889000       6454.223278
1  699000       5224.411779
2  179000       1858.611886
3  599000       4577.142569
4  925000       6687.240194


In [225]:
# Step 3: Calculate Monthly Income
df['Monthly_Income'] = df['Average Income'] / 12

# Step 4: Define the affordability threshold
affordability_threshold = 0.4  # 40% of income
df['can_afford'] = (df['Monthly_Mortgage'] <= (df['Monthly_Income'] * affordability_threshold)).astype(int)

# Display the updated DataFrame
print(df[['Average Income', 'Monthly_Income', 'Monthly_Mortgage', 'can_afford']].head())


   Average Income  Monthly_Income  Monthly_Mortgage  can_afford
0           66878     5573.166667       6454.223278           0
1           66878     5573.166667       5224.411779           0
2           66878     5573.166667       1858.611886           1
3           66878     5573.166667       4577.142569           0
4           66878     5573.166667       6687.240194           0


In [226]:
df

Unnamed: 0,price,Borough,property_type,Average Income,street_name,Average Rent,bedroom_number,Mortgage,Monthly_Income,can_afford,Monthly_Mortgage
0,889000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,59829.70,5573.166667,0,6454.223278
1,699000,Bronx,SINGLE_FAMILY,66878,E 239th Street,"$1,401",4.0,47042.70,5573.166667,0,5224.411779
2,179000,Bronx,CONDO,66878,Webster Ave APT 2E,"$1,401",1.0,12046.70,5573.166667,1,1858.611886
3,599000,Bronx,SINGLE_FAMILY,66878,E 237th Street,"$1,401",3.0,40312.70,5573.166667,0,4577.142569
4,925000,Bronx,MULTI_FAMILY,66878,E 235th Street,"$1,401",6.0,62252.50,5573.166667,0,6687.240194
...,...,...,...,...,...,...,...,...,...,...,...
33456,800000,Manhattan,CONDO,175743,W 122nd St #8I,"$2,024",1.0,53840.00,14645.250000,0,5878.153681
33457,330000,Manhattan,CONDO,175743,W 138th St #5I,"$2,024",1.0,22209.00,14645.250000,1,2835.988394
33458,418800,Manhattan,CONDO,175743,Park Ter E #6,"$2,024",1.0,28185.24,14645.250000,1,3410.763452
33459,410000,Manhattan,CONDO,175743,Park Ter E APT D76,"$2,024",1.0,27593.00,14645.250000,1,3353.803762


In [227]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33461 entries, 0 to 33460
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             33461 non-null  int64  
 1   Borough           33461 non-null  object 
 2   property_type     33461 non-null  object 
 3   Average Income    33461 non-null  int64  
 4   street_name       33461 non-null  object 
 5   Average Rent      33461 non-null  object 
 6   bedroom_number    33461 non-null  float64
 7   Mortgage          33461 non-null  float64
 8   Monthly_Income    33461 non-null  float64
 9   can_afford        33461 non-null  int32  
 10  Monthly_Mortgage  33461 non-null  float64
dtypes: float64(4), int32(1), int64(2), object(4)
memory usage: 2.7+ MB


In [228]:
# Drop unnecessary columns
df = df.drop(columns=['Average Rent', 'street_name'])




In [229]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33461 entries, 0 to 33460
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   price             33461 non-null  int64  
 1   Borough           33461 non-null  object 
 2   property_type     33461 non-null  object 
 3   Average Income    33461 non-null  int64  
 4   bedroom_number    33461 non-null  float64
 5   Mortgage          33461 non-null  float64
 6   Monthly_Income    33461 non-null  float64
 7   can_afford        33461 non-null  int32  
 8   Monthly_Mortgage  33461 non-null  float64
dtypes: float64(4), int32(1), int64(2), object(2)
memory usage: 2.2+ MB


In [230]:
print(df.columns)  # Just before defining X


Index(['price', 'Borough', 'property_type', 'Average Income', 'bedroom_number',
       'Mortgage', 'Monthly_Income', 'can_afford', 'Monthly_Mortgage'],
      dtype='object')


In [231]:
df.columns = df.columns.str.strip()
# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Confirm column names
print(df.columns)

# Define features and target
X = df[['Monthly_Income', 'Monthly_Mortgage'] + [col for col in df.columns if 'Borough_' in col]]
y = df['can_afford']


Index(['price', 'Borough', 'property_type', 'Average Income', 'bedroom_number',
       'Mortgage', 'Monthly_Income', 'can_afford', 'Monthly_Mortgage'],
      dtype='object')


In [232]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load your DataFrame
df = pd.read_csv(r'C:\Users\Sahan\Documents\Project_3\CSV\test.csv')

# Calculate Monthly Income
df['Monthly_Income'] = df['Average Income'] / 12

# Calculate Monthly Mortgage Payment
annual_interest_rate = 0.0673  # NYC mortgage rate
monthly_interest_rate = annual_interest_rate / 12
loan_term_years = 30
n_payments = loan_term_years * 12  # Total number of payments
taxes_and_fees = 700  # Additional monthly costs

df['Monthly_Mortgage'] = (
    df['price'] * (monthly_interest_rate * (1 + monthly_interest_rate) ** n_payments) /
    ((1 + monthly_interest_rate) ** n_payments - 1)
) + taxes_and_fees

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Confirm column names
print(df.columns)

# Define features and target
X = df[['Monthly_Income', 'Monthly_Mortgage'] + [col for col in df.columns if 'Borough_' in col]]
y = df['can_afford']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Index(['price', 'Borough', 'property_type', 'Average Income', 'street_name',
       'Average Rent', 'bedroom_number', 'Mortgage', 'Monthly_Income',
       'Monthly_Mortgage'],
      dtype='object')


KeyError: 'can_afford'

In [233]:
# Step 2: Define the affordability threshold
affordability_threshold = 0.4  # 40% of income
df['can_afford'] = (df['Monthly_Mortgage'] <= (df['Monthly_Income'] * affordability_threshold)).astype(int)


In [234]:
df.columns = df.columns.str.strip()  # This will also help for can_afford


In [235]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load your DataFrame
df = pd.read_csv(r'C:\Users\Sahan\Documents\Project_3\CSV\test.csv')

# Calculate Monthly Income
df['Monthly_Income'] = df['Average Income'] / 12

# Calculate Monthly Mortgage Payment
annual_interest_rate = 0.0673  # NYC mortgage rate
monthly_interest_rate = annual_interest_rate / 12
loan_term_years = 30
n_payments = loan_term_years * 12  # Total number of payments
taxes_and_fees = 700  # Additional monthly costs

df['Monthly_Mortgage'] = (
    df['price'] * (monthly_interest_rate * (1 + monthly_interest_rate) ** n_payments) /
    ((1 + monthly_interest_rate) ** n_payments - 1)
) + taxes_and_fees

# Step 2: Define the affordability threshold
affordability_threshold = 0.4  # 40% of income
df['can_afford'] = (df['Monthly_Mortgage'] <= (df['Monthly_Income'] * affordability_threshold)).astype(int)

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Check available columns after creating can_afford
print(df.columns)  # Should include 'can_afford'

# Define features and target
X = df[['Monthly_Income', 'Monthly_Mortgage'] + [col for col in df.columns if 'Borough_' in col]]
y = df['can_afford']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Index(['price', 'Borough', 'property_type', 'Average Income', 'street_name',
       'Average Rent', 'bedroom_number', 'Mortgage', 'Monthly_Income',
       'Monthly_Mortgage', 'can_afford'],
      dtype='object')
[[5001    0]
 [   0 1692]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5001
           1       1.00      1.00      1.00      1692

    accuracy                           1.00      6693
   macro avg       1.00      1.00      1.00      6693
weighted avg       1.00      1.00      1.00      6693



In [236]:
from sklearn.model_selection import cross_val_score

# Cross-Validation
model = RandomForestClassifier(random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)  # Change cv for more folds if needed

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean()}")


Cross-Validation Scores: [0.91558345 1.         0.99267782 1.         1.        ]
Mean CV Score: 0.9816522539316977


In [237]:
from sklearn.model_selection import GridSearchCV

# Define a parameter grid
param_grid = {
    'n_estimators': [100, 200],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Tree depth
    'min_samples_split': [2, 5],  # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2]  # Minimum samples at a leaf node
}

# Set up the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search
grid_search.fit(X, y)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:   13.5s finished


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Score: 0.9818614589526184


In [238]:
# Use the best estimator found in the grid search
best_model = grid_search.best_estimator_

# Optionally: Fit it on the entire dataset (if you're not holding out a test set)
best_model.fit(X, y)


RandomForestClassifier(n_estimators=200, random_state=42)

In [239]:
# Assuming you have X_test and y_test defined
y_pred = best_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[5001    0]
 [   0 1692]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5001
           1       1.00      1.00      1.00      1692

    accuracy                           1.00      6693
   macro avg       1.00      1.00      1.00      6693
weighted avg       1.00      1.00      1.00      6693



In [242]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Fit the logistic regression model on the resampled data
model_smote = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
model_smote.fit(X_resampled, y_resampled)

# Predict on the test set
y_pred_smote = model_smote.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred_smote))
print(classification_report(y_test, y_pred_smote))

[[7551    0]
 [   0 2488]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7551
           1       1.00      1.00      1.00      2488

    accuracy                           1.00     10039
   macro avg       1.00      1.00      1.00     10039
weighted avg       1.00      1.00      1.00     10039



In [243]:
# Inspect coefficients
coefficients = model_l2.coef_[0]
features = X.columns
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
print(coef_df.sort_values(by='Coefficient', ascending=False))


            Feature  Coefficient
0    Monthly_Income     0.321719
1  Monthly_Mortgage    -0.805729


In [244]:
plt.figure(figsize=(20, 10))
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True, class_names=['Not Affordable', 'Affordable'])
plt.title("Decision Tree from Random Forest")
plt.show()


NameError: name 'tree' is not defined

<Figure size 1440x720 with 0 Axes>