## Housing Project
### Shoshi Finkel

In [384]:
import pandas as pd
import pymysql
import csv
import psutil
import psycopg2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score, mean_absolute_error

In [385]:
# Read and explore the data.
df = pd.read_csv('Housing.csv')

In [386]:
# Create a connection.
conn = pymysql.connect(host='localhost', port=3306, user='root', password='Sf@323895474', db='sys')
cursor = conn.cursor()

In [387]:
# Create a new Database.
cursor.execute('CREATE DATABASE IF NOT EXISTS HousingDB;')
cursor.execute('USE HousingDB;')

0

In [388]:
# cursor.execute('''DROP TABLE IF EXISTS HouseFeatures''')

In [389]:
# Create a table.
cursor.execute('''CREATE TABLE IF NOT EXISTS HouseFeatures(
    house_id MEDIUMINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    price BIGINT,
    area INT,
    bedrooms SMALLINT,
    bathrooms SMALLINT,
    stories SMALLINT,
    mainroad NVARCHAR(3),
    guestroom NVARCHAR(3),
    basement NVARCHAR(3),
    hotwaterheating NVARCHAR(3),
    airconditioning NVARCHAR(3),
    parking SMALLINT,
    prefarea NVARCHAR(3),
    furnishingstatus NVARCHAR(20)
)''')

0

In [390]:
# Load the dataset from the csv file to the new table.
with open ('Housing.csv', 'r') as f:
    reader = csv.reader(f)
    columns = next(reader) 
    query = 'insert into HouseFeatures({0}) values ({1})'
    query = query.format(','.join(columns), ','.join(['%s'] * len(columns)))
    cursor = conn.cursor()
    for data in reader:
        cursor.execute(query, data)

In [391]:
# Read the db into pandas df.
df = pd.read_sql_query("SELECT * FROM HouseFeatures", conn)

  df = pd.read_sql_query("SELECT * FROM HouseFeatures", conn)


In [392]:
df.head()

Unnamed: 0,house_id,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7631,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,7632,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,7633,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,7634,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,7635,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [393]:
#Print out the number of features and observations in the dataset.
df.shape

(545, 14)

In [394]:
# Print data types and null counts for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   house_id          545 non-null    int64 
 1   price             545 non-null    int64 
 2   area              545 non-null    int64 
 3   bedrooms          545 non-null    int64 
 4   bathrooms         545 non-null    int64 
 5   stories           545 non-null    int64 
 6   mainroad          545 non-null    object
 7   guestroom         545 non-null    object
 8   basement          545 non-null    object
 9   hotwaterheating   545 non-null    object
 10  airconditioning   545 non-null    object
 11  parking           545 non-null    int64 
 12  prefarea          545 non-null    object
 13  furnishingstatus  545 non-null    object
dtypes: int64(7), object(7)
memory usage: 59.7+ KB


In [395]:
# Count the number of nulls.
df.isnull().sum()

house_id            0
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [396]:
#Describe the data
df.describe()

Unnamed: 0,house_id,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,7903.0,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,157.47222,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,7631.0,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,7767.0,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,7903.0,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,8039.0,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,8175.0,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [397]:
# Drop the house_id column.
df.drop('house_id', axis=1, inplace=True)

In [398]:
# Encode the categorical columns.
cat_cols = ['bedrooms', 'bathrooms', 'stories',
       'mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
def make_categories(df, col_list):
    for col in col_list:
        category=pd.Categorical(df[col])
        df[col]=category.codes
    return df
make_categories(df, cat_cols)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,3,1,2,1,0,0,0,1,2,1,0
1,12250000,8960,3,3,3,1,0,0,0,1,3,0,0
2,12250000,9960,2,1,1,1,0,1,0,0,2,1,1
3,12215000,7500,3,1,1,1,0,1,0,1,3,1,0
4,11410000,7420,3,0,1,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,1,0,0,1,0,1,0,0,2,0,2
541,1767150,2400,2,0,0,0,0,0,0,0,0,0,1
542,1750000,3620,1,0,0,1,0,0,0,0,0,0,2
543,1750000,2910,2,0,0,0,0,0,0,0,0,0,0


In [399]:
# Find the correlated columns.
correlation = df.corr()
correlation['price']

price               1.000000
area                0.535997
bedrooms            0.366494
bathrooms           0.517545
stories             0.420712
mainroad            0.296898
guestroom           0.255517
basement            0.187057
hotwaterheating     0.093073
airconditioning     0.452954
parking             0.384394
prefarea            0.329777
furnishingstatus   -0.304721
Name: price, dtype: float64

In [400]:
# Save the correlated columns.
#df = df[['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']]

In [401]:
# Split the data into features and labels.
X = df.drop('price', axis = 1)
y = df['price']

In [402]:
# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=417)

In [403]:
#Normalize the data.
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.fit_transform(X_test)

In [404]:
# Instantiate, fit and predict with a Random Forest Classifier model.
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)
rf_accuracy = accuracy_score(rf_pred, y_test)
print(rf_accuracy)

0.036585365853658534


In [405]:
# Track the cpu and virtual memory.
cpu_percent_before = psutil.cpu_percent()
virtual_memory_used_before = psutil.virtual_memory().used
virtual_memory_free_before = psutil.virtual_memory().free
virtual_memory_percent_before = psutil.virtual_memory().percent
available_memory_before = psutil.virtual_memory().available * 100 / psutil.virtual_memory().total

In [406]:
# Instantiate linear Regression model fit linear regression mode.

#lr = LinearRegression(fit_intercept = False) #Tweak the model's parameters- this didn't do better.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Obtain predictions for test set.
predictions = lr.predict(X_test_scaled)

# Obtain R Squared Score of test set.
r = lr.score(X_test_scaled, y_test)
print("R-square score:",r)

# Obtain the RMSE of test set.
RMSE = mean_squared_error(y_test, predictions, squared = False)
print("RMSE:",RMSE)

# Obtain the MAE of test set.
MAE = mean_absolute_error(y_test, predictions)
print("MAE:",MAE)



R-square score: 0.7132323844497191
RMSE: 1024772.59976935
MAE: 802494.3089542751


In [407]:
# Track the cpu and memory after model.
cpu_percent_after = psutil.cpu_percent()
virtual_memory_percent_after = psutil.virtual_memory().percent
virtual_memory_used_after = psutil.virtual_memory().used
virtual_memory_free_after = psutil.virtual_memory().free
available_memory_after = psutil.virtual_memory().available * 100 / psutil.virtual_memory().total

In [408]:
# Calculate the model usage.
cpu_percent = cpu_percent_after - cpu_percent_before
virtual_memory_used = virtual_memory_used_after - virtual_memory_used_before
virtual_memory_free = virtual_memory_free_after - virtual_memory_free_before
virtual_memory_percent = virtual_memory_percent_after - virtual_memory_percent_before
available_memory = available_memory_after - available_memory_before

print('cpu_percent:', cpu_percent, '\navailable_memory:', available_memory, '\nvirtual_memory_percent:', virtual_memory_percent, '\nvirtual_memory_used:', virtual_memory_used, '\nvirtual_memory_free:', virtual_memory_free)


cpu_percent: 7.1 
available_memory: -0.0016479223576535418 
virtual_memory_percent: 0.0 
virtual_memory_used: 1179648 
virtual_memory_free: -638976


In [409]:
# Close the connection.
conn.close()