In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report



In [None]:
# Load the Perth housing CSV (path is relative to the notebook's working directory)
perth_csv_path = Path("Resources/all_perth_310121.csv")
perth_houses_df = pd.read_csv(perth_csv_path)

# Preview the first five rows of the raw data
perth_houses_df.head(5)

### Data Preprocessing and Cleaning

In [None]:
# Summary statistics, transposed so that each described column becomes a row
perth_houses_df.describe().transpose()

In [None]:
# Print a concise summary of the DataFrame (column names, non-null counts and dtypes)
perth_houses_df.info()

In [None]:
# Count the missing values in each column of the raw data
perth_houses_df.isna().sum()

In [None]:
# Replace missing GARAGE values with the hard-coded figure 2.
# Only GARAGE is imputed here; BUILD_YEAR is left untouched by this cell.
garage_filled = perth_houses_df['GARAGE'].fillna(2)
perth_houses_df['GARAGE'] = garage_filled

# Show the last five rows to inspect the result
perth_houses_df.tail()

In [None]:
# Cast GARAGE to integer (only GARAGE is converted in this cell).
# The missing values must already have been filled, because NaN cannot be cast to int.
garage_as_int = perth_houses_df['GARAGE'].astype(int)
perth_houses_df['GARAGE'] = garage_as_int

# Preview the first five rows
perth_houses_df.head(5)

In [None]:
# How many houses fall under each GARAGE value
garage_counts = perth_houses_df["GARAGE"].value_counts()
garage_counts

In [None]:
# Disabled experiment: keep only the GARAGE values whose count is at least 2 (i.e. drop counts of <2)
# value_counts = perth_houses_df['GARAGE'].value_counts()
# filter_classification = value_counts[value_counts >1]
# print(filter_classification)

In [None]:
# How many houses fall under each BEDROOMS value
bedroom_counts = perth_houses_df["BEDROOMS"].value_counts()
bedroom_counts


In [None]:
# How many houses fall under each BATHROOMS value
bathroom_counts = perth_houses_df["BATHROOMS"].value_counts()
bathroom_counts


In [None]:
# Columns that are not used as inputs for the machine-learning model
unused_columns = [
    'ADDRESS',
    'CBD_DIST',
    'NEAREST_STN_DIST',
    'BUILD_YEAR',
    'DATE_SOLD',
    'LATITUDE',
    'LONGITUDE',
    'NEAREST_SCH_DIST',
    'NEAREST_SCH_RANK',
]

# Build a new frame without those columns; the raw frame is left unchanged
perth_houses_cleaned_df = perth_houses_df.drop(columns=unused_columns)
perth_houses_cleaned_df.head(5)


In [None]:
# Confirm how many missing values remain in each column of the cleaned frame
perth_houses_cleaned_df.isna().sum()

### Separate the Features and the Target

In [None]:
# Features (X): every column of the cleaned frame except PRICE
X = perth_houses_cleaned_df.drop("PRICE", axis=1)

# Target (y): the PRICE column
y = perth_houses_cleaned_df["PRICE"]

In [None]:
# Peek at the first five target values
y.iloc[:5]

In [None]:
# Peek at the first five rows of the feature frame
X.iloc[:5]

In [None]:
# Convert the categorical feature columns into indicator (dummy) columns
X = pd.get_dummies(data=X)

# Preview the first five rows of the encoded features
X.iloc[:5]

In [None]:
# Draw the distribution of the target values (y) as a 10-bin histogram
price_fig, price_ax = plt.subplots()
price_ax.hist(y, bins=10, edgecolor='black')

# Label the axes and give the figure a title
price_ax.set_xlabel('House Prices $m')
price_ax.set_ylabel('Frequency')
price_ax.set_title('Histogram of Perth House Prices')

plt.show()



### Split Data into Training and Test Datasets

In [None]:
# Partition features and target into training and test subsets.
# random_state is fixed so the shuffle is reproducible between runs.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Dimensions of the training feature set
X_train.shape

### Create a Logistic Regression Model with the Cleaned Data

In [None]:
# Build the (not yet fitted) logistic-regression estimator from its settings
lr_settings = {"solver": 'lbfgs', "max_iter": 500, "random_state": 1}
logistic_regression_model = LogisticRegression(**lr_settings)

# Display the configured estimator
logistic_regression_model

In [None]:
# Train the estimator on the training split.
# NOTE(review): y is the PRICE column and LogisticRegression is a classifier, so each
# distinct price is presumably treated as its own class - confirm this is intended
# (otherwise bin PRICE into classes or use a regression estimator).
logistic_regression_model.fit(X_train, y_train)

# fit() returns the estimator itself, so lr_model refers to the same fitted object
lr_model = logistic_regression_model

In [None]:
# Predict the target for every row of the test features and display the result
test_predictions = lr_model.predict(X_test)
test_predictions

In [None]:
 # Score the model
# Evaluate the fitted model on both splits, then report the two scores
train_score = lr_model.score(X_train, y_train)
test_score = lr_model.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:

# Compare, feature by feature, the values held in the training and test feature sets.

# Number of feature columns (identical for X_train and X_test, which come from the same split)
num_features = X_train.shape[1]

# Create a figure and axes for the line chart
fig, ax = plt.subplots()

# The two splits have different numbers of rows, so each needs its own x-axis positions;
# reusing the training indices for the test values makes ax.plot fail on mismatched lengths.
train_indices = np.arange(len(X_train))
test_indices = np.arange(len(X_test))

# Iterate over each feature column by position
for i in range(num_features):
    # X_train / X_test are pandas DataFrames, so a column must be selected positionally
    # with .iloc; NumPy-style X_train[:, i] is not valid DataFrame indexing and raises.
    # Converting to float lets boolean dummy columns be drawn as 0/1 lines.
    train_values = X_train.iloc[:, i].to_numpy(dtype=float)
    test_values = X_test.iloc[:, i].to_numpy(dtype=float)

    # Plot the feature values as lines, one per split
    ax.plot(train_indices, train_values, label='x_train Feature {}'.format(i+1))
    ax.plot(test_indices, test_values, label='x_test Feature {}'.format(i+1))

# Set the title, x-label, and y-label
ax.set_title('Comparison of x_train and x_test')
ax.set_xlabel('Index')
ax.set_ylabel('Feature Values')

# Display the legend (one entry per feature and split)
ax.legend()

# Show the plot
plt.show()
