#IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV


- READING CSV FILES

In [2]:
import pandas as pd
df_1= pd.read_csv('kijiji.csv')
df_2 = pd.read_csv('CSDpop.csv')

In [3]:
df_1.columns

Index(['CSDUID', 'CSDNAME', 'Latitude', 'Longitude', 'adId', 'Title', 'Type',
       'Price', 'Location', 'Bedrooms', 'Bathrooms', 'Hydro', 'Heat', 'Water',
       'Size', 'Agreement Type', 'URL', 'Date Posted'],
      dtype='object')

In [4]:
df_1 = df_1.drop(['CSDNAME','Longitude', 'Location', 'Title','adId','URL','Date Posted'],axis=1)

In [5]:
df_1['Size'] = pd.to_numeric(df_1['Size'], errors='coerce' )

-  This code is setting any "Size" values in the DataFrame df_1 that are less than 200 or greater than 7000 to NaN

In [6]:
import numpy as np
for index, row in df_1.iterrows ():
   if row["Size"] < 200 or row["Size"] > 7000:
      df_1.at [index, 'Size'] = np. nan

- FILLING NULL VALUES WITH MEAN IN SIZE COLUMN.

In [7]:
df_1['Size'].fillna(value=df_1.Size.mean(), inplace=True)

In [8]:
df_1['Price'] = df_1['Price'].str.replace('$','')
df_1['Price'] = df_1['Price'].str.replace(',','')
df_1['Price'] = pd.to_numeric(df_1['Price'], errors='coerce' )

In [9]:
merged_df_1 = df_1.merge(df_2, on ='CSDUID', how= 'left')

In [10]:
merged_df_1.columns

Index(['CSDUID', 'Latitude', 'Type', 'Price', 'Bedrooms', 'Bathrooms', 'Hydro',
       'Heat', 'Water', 'Size', 'Agreement Type', 'Population'],
      dtype='object')

In [11]:
merged_df_1 = merged_df_1.drop(['CSDUID'],axis=1)

In [12]:
merged_df_1['Type'].value_counts()

Type
Apartment         9685
House             5526
Basement          3689
Condo             3419
Townhouse         2399
Duplex/Triplex    1014
Name: count, dtype: int64

- This code categorizes the prices in the "Price" column of merged_df_1 into three categories ("low", "Medium", "High") based on the defined conditions, and stores the result in a new column "Price_Category".

In [13]:
def priceConvert(p):
  if p <= 1400:
    return "low"
  elif p <= 2000 and p >= 1400:
    return "Medium"
  elif p > 2000:
    return "High"

merged_df_1["Price_Category"] = merged_df_1["Price"].apply(priceConvert)

In [14]:
merged_df_1['Price_Category'].head()

0    Medium
1       low
2      High
3       low
4      High
Name: Price_Category, dtype: object

In [15]:
merged_df_1.head()

Unnamed: 0,Latitude,Type,Price,Bedrooms,Bathrooms,Hydro,Heat,Water,Size,Agreement Type,Population,Price_Category
0,45.256161,Apartment,1425.0,Bedrooms: 2,Bathrooms: 1,N,Y,Y,950.0,1 Year,14211,Medium
1,45.192003,Apartment,1100.0,Bedrooms: 2,Bathrooms: 1,N,N,Y,780.0,1 Year,14211,low
2,45.146672,House,2700.0,Bedrooms: 3,Bathrooms: 2.5,N,Y,Y,693.228433,1 Year,14211,High
3,45.143726,House,1200.0,Bedrooms: 2,Bathrooms: 1.5,N,N,Y,900.0,1 Year,14211,low
4,45.068973,House,4000.0,Bedrooms: 2 + Den,Bathrooms: 3,N,Y,Y,693.228433,1 Year,14211,High


In [16]:
merged_df_1['Type'] = merged_df_1['Type'].astype('category').cat.codes

- This process transforms categorical columns into a format that machine learning algorithms can better understand, where each category is represented by a binary (0 or 1) value in its own column.

In [17]:
one_hot_encoded1 = pd.get_dummies(merged_df_1['Type'], prefix='PropType')
one_hot_encoded2 = pd.get_dummies(merged_df_1['Bedrooms'], prefix='Bedrooms')
one_hot_encoded3 = pd.get_dummies(merged_df_1['Bathrooms'], prefix='Bathrooms')
one_hot_encoded4 = pd. get_dummies(merged_df_1['Hydro'], prefix='Hydro')
one_hot_encoded5 = pd. get_dummies(merged_df_1['Heat'], prefix='Heat')
one_hot_encoded6 = pd. get_dummies(merged_df_1['Water'], prefix='Water')
one_hot_encoded7 = pd. get_dummies(merged_df_1['Agreement Type'], prefix='Agreement')


#Concatenate the one-hot encoded columns with the original Dataframe
df_encoded = pd.concat([merged_df_1, one_hot_encoded1,one_hot_encoded2, one_hot_encoded3, one_hot_encoded4, one_hot_encoded5, one_hot_encoded6,one_hot_encoded7], axis=1)

In [18]:
df_encoded = df_encoded.drop(['Type','Bedrooms','Bathrooms','Hydro','Heat','Water','Agreement Type'], axis=1)

In [19]:
df_encoded.head()

Unnamed: 0,Latitude,Price,Size,Population,Price_Category,PropType_0,PropType_1,PropType_2,PropType_3,PropType_4,...,Bathrooms_Bathrooms: 6+,Hydro_N,Hydro_Y,Heat_N,Heat_Y,Water_N,Water_Y,Agreement_1 Year,Agreement_Month-to-month,Agreement_Not Available
0,45.256161,1425.0,950.0,14211,Medium,True,False,False,False,False,...,False,True,False,False,True,False,True,True,False,False
1,45.192003,1100.0,780.0,14211,low,True,False,False,False,False,...,False,True,False,True,False,False,True,True,False,False
2,45.146672,2700.0,693.228433,14211,High,False,False,False,False,True,...,False,True,False,False,True,False,True,True,False,False
3,45.143726,1200.0,900.0,14211,low,False,False,False,False,True,...,False,True,False,True,False,False,True,True,False,False
4,45.068973,4000.0,693.228433,14211,High,False,False,False,False,True,...,False,True,False,False,True,False,True,True,False,False


In [20]:
y = df_encoded["Price_Category"]
X = df_encoded.drop(["Price_Category"], axis=1)

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A scaler: StandardScaler()
scaler = StandardScaler()
# Fit the scaler to the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the testing data using the fitted scaler
X_test = scaler.transform(X_test)

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the testing data using the fitted scaler
X_test = scaler.transform(X_test)

In [23]:
X_scaled = X

In [24]:
y.dtypes

dtype('O')

# This code trains a logistic regression model, evaluates its performance using accuracy, and provides a detailed classification report containing additional performance metrics.







In [25]:
from sklearn.metrics import classification_report
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracylg = accuracy_score(y_test, y_pred)
print("Logistic Regression :", accuracylg)
report = classification_report(y_test, y_pred)
print("Classification Report:", report)

Logistic Regression : 0.9953370895667379
Classification Report:               precision    recall  f1-score   support

        High       1.00      0.99      1.00      2559
      Medium       0.99      1.00      0.99      1868
         low       0.99      1.00      1.00       720

    accuracy                           1.00      5147
   macro avg       0.99      1.00      1.00      5147
weighted avg       1.00      1.00      1.00      5147



- This code assesses the performance of the logistic regression model using k-fold cross-validation, where the data is split into 5 folds, shuffled, and the model is trained and evaluated on each fold

In [None]:
kf1 = KFold(n_splits=9 shuffle=True, random_state=42)
cv_scores_kf1 = cross_val_score(logreg, X_scaled, y, cv=kf1)
print("Cross-validation scores (KFold 1):", np.mean(cv_scores_kf1))

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define a smaller parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],          # Reduced regularization parameter options
    'penalty': ['l1', 'l2'],    # Penalty term
    'solver': ['liblinear']     # Use only 'liblinear' solver
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=None, scoring='accuracy')
grid_search.fit(X_scaled, y)

# Get the best logistic regression model
best_logreg_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_logreg_model.predict(X_test)

# Calculate accuracy
accuracy_lgscv = accuracy_score(y_test, y_pred)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Get the mean cross-validation score for the best model
print("Mean Cross-validation Score:", grid_search.best_score_)


Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Mean Cross-validation Score: 0.9800638983039713




In [28]:

from sklearn.neighbors import KNeighborsClassifier

# Create KNN classifier
knn = KNeighborsClassifier()

# Fit the model
knn.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn.predict(X_test)

# Calculate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Print accuracy
print("KNN Accuracy:", accuracy_knn)

# Generate classification report
report_knn = classification_report(y_test, y_pred_knn)
print("KNN Classification Report:")
print(report_knn)


KNN Accuracy: 0.8612784146104527
KNN Classification Report:
              precision    recall  f1-score   support

        High       0.91      0.93      0.92      2559
      Medium       0.81      0.83      0.82      1868
         low       0.83      0.71      0.76       720

    accuracy                           0.86      5147
   macro avg       0.85      0.82      0.83      5147
weighted avg       0.86      0.86      0.86      5147



In [29]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier

# Define KNN classifier
knn = KNeighborsClassifier()

# Initialize KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores_knn = cross_val_score(knn, X_scaled, y, cv=kf)

# Print mean cross-validation score
print("Mean Cross-validation Score (KNN):", np.mean(cv_scores_knn))


Mean Cross-validation Score (KNN): 0.9865925165845102


In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid for KNN hyperparameter tuning
param_grid_knn = {
    'n_neighbors': [3, 5, 7],   # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm to compute nearest neighbors
}

# Perform grid search with cross-validation for KNN
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=None, scoring='accuracy')
grid_search_knn.fit(X_scaled, y)

# Get the best KNN model
best_knn_model = grid_search_knn.best_estimator_

# Predict on the test set
y_pred_knn = best_knn_model.predict(X_test)

# Calculate accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)

# Print the best parameters found for KNN
print("Best Parameters (KNN):", grid_search_knn.best_params_)

# Get the mean cross-vali




Best Parameters (KNN): {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'uniform'}


In [31]:
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree classifier
dt = DecisionTreeClassifier()

# Fit the model
dt.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Print accuracy
print("Decision Tree Accuracy:", accuracy_dt)

# Generate classification report
report_dt = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:")
print(report_dt)


Decision Tree Accuracy: 1.0
Decision Tree Classification Report:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00      2559
      Medium       1.00      1.00      1.00      1868
         low       1.00      1.00      1.00       720

    accuracy                           1.00      5147
   macro avg       1.00      1.00      1.00      5147
weighted avg       1.00      1.00      1.00      5147



In [32]:
from sklearn.tree import DecisionTreeClassifier

# Define Decision Tree classifier
dt = DecisionTreeClassifier()

# Perform cross-validation
cv_scores_dt = cross_val_score(dt, X_scaled, y, cv=kf)

# Print mean cross-validation score
print("Mean Cross-validation Score (Decision Tree):", np.mean(cv_scores_dt))


Mean Cross-validation Score (Decision Tree): 1.0


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid for Decision Tree hyperparameter tuning
param_grid_dt = {
    'criterion': ['gini', 'entropy'],   # Split criterion
    'max_depth': [None, 10, 20, 30],     # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],     # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]        # Minimum number of samples required at each leaf node
}

# Perform grid search with cross-validation for Decision Tree
grid_search_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=None, scoring='accuracy')
grid_search_dt.fit(X_scaled, y)

# Get the best Decision Tree model
best_dt_model = grid_search_dt.best_estimator_

# Predict on the test set
y_pred_dt = best_dt_model.predict(X_test)

# Calculate accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Print the best parameters found for Decision Tree
print("Best Parameters (Decision Tree):", grid_search_dt.best_params_)

# Get the mean cross-validation score for the best Decision Tree model
print("Mean Cross-validation Score (Decision Tree):", grid_search_dt.best_score_)


Best Parameters (Decision Tree): {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mean Cross-validation Score (Decision Tree): 1.0




In [34]:
from sklearn.ensemble import RandomForestClassifier

# Create Random Forest classifier
rf = RandomForestClassifier()

# Fit the model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Print accuracy
print("Random Forest Accuracy:", accuracy_rf)

# Generate classification report
report_rf = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:")
print(report_rf)


Random Forest Accuracy: 1.0
Random Forest Classification Report:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00      2559
      Medium       1.00      1.00      1.00      1868
         low       1.00      1.00      1.00       720

    accuracy                           1.00      5147
   macro avg       1.00      1.00      1.00      5147
weighted avg       1.00      1.00      1.00      5147



In [35]:
from sklearn.ensemble import RandomForestClassifier

# Define Random Forest classifier
rf = RandomForestClassifier()

# Perform cross-validation
cv_scores_rf = cross_val_score(rf, X_scaled, y, cv=kf)

# Print mean cross-validation score
print("Mean Cross-validation Score (Random Forest):", np.mean(cv_scores_rf))


Mean Cross-validation Score (Random Forest): 0.9996502439623682


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define a smaller parameter grid for Random Forest hyperparameter tuning
param_grid_rf = {
    'n_estimators': [50, 100],           # Number of trees in the forest
    'max_depth': [None, 10],             # Maximum depth of the tree
    'min_samples_split': [2, 5],         # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2]           # Minimum number of samples required at each leaf node
}

# Perform grid search with cross-validation for Random Forest
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=None, scoring='accuracy')
grid_search_rf.fit(X_scaled, y)

# Get the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_

# Predict on the test set
y_pred_rf = best_rf_model.predict(X_test)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Print the best parameters found for Random Forest
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

# Get the mean cross-validation score for the best Random Forest model
print("Mean Cross-validation Score for Random Forest:", grid_search_rf.best_score_)



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create Gradient Boosting classifier
gb = GradientBoostingClassifier()

# Fit the model
gb.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb.predict(X_test)

# Calculate accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Print accuracy
print("Gradient Boosting Accuracy:", accuracy_gb)

# Generate classification report
report_gb = classification_report(y_test, y_pred_gb)
print("Gradient Boosting Classification Report:")
print(report_gb)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Define Gradient Boosting classifier
gb = GradientBoostingClassifier()

# Perform cross-validation
cv_scores_gb = cross_val_score(gb, X_scaled, y, cv=kf)

# Print mean cross-validation score
print("Mean Cross-validation Score (Gradient Boosting):", np.mean(cv_scores_gb))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Define a smaller parameter grid for Gradient Boosting hyperparameter tuning
param_grid_gb = {
    'n_estimators': [50, 100],          # Number of boosting stages
    'learning_rate': [0.01, 0.1],       # Learning rate shrinks the contribution of each tree
    'max_depth': [3, 5],                # Maximum depth of the individual trees
    'min_samples_split': [2, 5],        # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2]          # Minimum number of samples required at each leaf node
}

# Perform grid search with cross-validation for Gradient Boosting
grid_search_gb = GridSearchCV(GradientBoostingClassifier(), param_grid_gb, cv=None, scoring='accuracy')
grid_search_gb.fit(X_scaled, y)

# Get the best Gradient Boosting model
best_gb_model = grid_search_gb.best_estimator_

# Predict on the test set
y_pred_gb = best_gb_model.predict(X_test)

# Calculate accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)

# Print the best parameters found for Gradient Boosting
print("Best Parameters for Gradient Boosting:", grid_search_gb.best_params_)

# Get the mean cross-validation score for the best Gradient Boosting model
print("Mean Cross-validation Score for Gradient Boosting:", grid_search_gb.best_score_)
