## PART A

In [None]:
#importing all necessary libraries
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D 
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import L2


In [None]:
# Notebook-wide configuration for Part A.
IMG_SIZE = 50  # images are resized to IMG_SIZE x IMG_SIZE pixels before training
DATADIR = "C:/Assignment_images"  # training images, one sub-folder per category
test_dir = "C:/test_set"  # held-out images, same sub-folder layout
CATEGORIES = ["cats", "dogs"]  # a category's list index is used as its class label


In [None]:
# creating a training data set
training_data = []
def create_training_data():
    """Load the training images into the module-level ``training_data`` list.

    For every category in CATEGORIES, each file in ``DATADIR/<category>`` is
    read with OpenCV, resized to IMG_SIZE x IMG_SIZE, converted to float and
    divided by 255.0, then appended as ``[image, class_num]`` where
    ``class_num`` is the category's index in CATEGORIES.

    Files that OpenCV cannot decode or resize are skipped. The number of
    skipped files per category is printed so that data problems are visible
    instead of being silently swallowed.
    """
    for category in CATEGORIES:
        path = os.path.join(DATADIR, category)  # path to cats or dogs directory
        class_num = CATEGORIES.index(category)
        skipped = 0

        for img in os.listdir(path):
            # cv2.imread signals an unreadable file by returning None, not by raising
            img_array = cv2.imread(os.path.join(path, img))
            if img_array is None:
                skipped += 1
                continue
            try:
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            except cv2.error:
                skipped += 1
                continue
            image_std = new_array.astype('float') / 255.0
            training_data.append([image_std, class_num])

        if skipped:
            print("Skipped {} unreadable file(s) in {}".format(skipped, path))
create_training_data()

In [None]:
# creating a testing data set
testing_data = []

def create_testing_data():
    """Load the test images into the module-level ``testing_data`` list.

    For every category in CATEGORIES, each file in ``test_dir/<category>`` is
    read with OpenCV, resized to IMG_SIZE x IMG_SIZE, converted to float and
    divided by 255.0, then appended as ``[image, class_num]`` where
    ``class_num`` is the category's index in CATEGORIES.

    Files that OpenCV cannot decode or resize are skipped. The number of
    skipped files per category is printed so that data problems are visible
    instead of being silently swallowed.
    """
    for category in CATEGORIES:
        test_path = os.path.join(test_dir, category)  # path to cats or dogs directory
        class_num = CATEGORIES.index(category)
        skipped = 0

        for img in os.listdir(test_path):
            # cv2.imread signals an unreadable file by returning None, not by raising
            img_array = cv2.imread(os.path.join(test_path, img))
            if img_array is None:
                skipped += 1
                continue
            try:
                test_new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            except cv2.error:
                skipped += 1
                continue
            image_std = test_new_array.astype('float') / 255.0
            testing_data.append([image_std, class_num])

        if skipped:
            print("Skipped {} unreadable file(s) in {}".format(skipped, test_path))
create_testing_data()

In [None]:
import random

# The loader appends one whole category after the other, so the samples are
# shuffled in place before training. A dedicated, seeded generator keeps the
# order identical on every Restart & Run All without touching the global
# random state.
random.Random(42).shuffle(training_data)

In [None]:
# Start with four independent empty containers for the feature/label splits.
train_features, train_label, test_features, test_label = [], [], [], []

In [None]:
# Each entry of training_data / testing_data is an [image, label] pair.
# Images are stacked into (n_samples, IMG_SIZE, IMG_SIZE, 3) arrays and the
# labels into flat arrays.
image_batch_shape = (-1, IMG_SIZE, IMG_SIZE, 3)

train_features = np.array([sample[0] for sample in training_data]).reshape(image_batch_shape)
train_label = np.array([sample[1] for sample in training_data])

test_features = np.array([sample[0] for sample in testing_data]).reshape(image_batch_shape)
test_label = np.array([sample[1] for sample in testing_data])

In [None]:
from tensorflow.keras.optimizers import Adam

# Adam optimizer with an explicit learning rate, passed to model.compile below.
opt_1 = Adam(learning_rate=1e-3)

In [None]:
# Build the CNN: two conv -> relu -> max-pool stages, then a small dense head
# ending in a single sigmoid unit for the binary cats/dogs label.
model = Sequential([
    Conv2D(64, (3, 3), input_shape=train_features.shape[1:]),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3)),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),  # converts the 3D feature maps into 1D vectors

    Dense(64),
    Activation("relu"),

    Dense(1),
    Activation("sigmoid"),
])

model.compile(
    loss="binary_crossentropy",
    optimizer=opt_1,
    metrics=["accuracy"],
)

# Train for 10 epochs; the test arrays are passed as the validation data.
history = model.fit(
    train_features,
    train_label,
    batch_size=32,
    epochs=10,
    validation_data=(test_features, test_label),
)
model.save('2_000.h5')

In [None]:
# Plot the training curves (accuracy and loss per epoch) from a pickled history.
def _plot_history_metric(history, metric, title, png_path):
    """Draw the train/validation curves of one metric, save them, then show them."""
    fig, ax = plt.subplots()
    ax.plot(history[metric])
    ax.plot(history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'val'], loc='upper left')
    # Save BEFORE showing: once plt.show() has displayed the figure, a later
    # plt.savefig() would write a blank image.
    fig.savefig(png_path)
    plt.show()


def plotter(history_file,
            accuracy_png='2_000_10epoch_accuracy.png',
            loss_png='2_000_10epoch_loss.png'):
    """Plot and save the accuracy and loss curves stored in ``history_file``.

    Parameters
    ----------
    history_file : str
        Path to a pickle file holding a dict with the keys 'accuracy',
        'val_accuracy', 'loss' and 'val_loss', each a per-epoch sequence.
    accuracy_png : str, optional
        Output path of the accuracy chart.
    loss_png : str, optional
        Output path of the loss chart.
    """
    # Local import: the function no longer depends on a later cell importing pickle.
    import pickle

    # NOTE: pickle.load can execute arbitrary code. Only pass files written by
    # this notebook, never files from an untrusted source.
    with open(history_file, 'rb') as file:
        history = pickle.load(file)

    _plot_history_metric(history, 'accuracy', 'model accuracy', accuracy_png)
    _plot_history_metric(history, 'loss', 'model loss', loss_png)

In [None]:
import pickle

# Persist the per-epoch metrics recorded by model.fit, then plot them from disk.
history_file = '2_000_history.pickle'
with open(history_file, 'wb') as history_out:
    pickle.dump(history.history, history_out)

plotter(history_file)

## PART B 

In [None]:
# Import necessary modules
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, svm, metrics
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [None]:
# Load the Part B housing data. The path is relative, so it is resolved
# against the notebook's current working directory.
df = pd.read_excel("Downloads/cw3_housing_data_part_b.xlsx")

### Visualization and description of data set

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Pull the target (price) and a single feature (sqft_living15) out as NumPy arrays.
y = df['price'].values
X = df['sqft_living15'].values

# Shapes straight out of the DataFrame.
print(f"Dimensions of y before reshaping: {y.shape}")
print(f"Dimensions of X before reshaping: {X.shape}")

# Turn both into single-column 2-D arrays.
y = y.reshape(-1, 1)
X = X.reshape(-1, 1)

# Shapes after the reshape.
print(f"Dimensions of y after reshaping: {y.shape}")
print(f"Dimensions of X after reshaping: {X.shape}")

In [None]:
# Heatmap of the pairwise column correlations.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, square=True, cmap='RdYlGn')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

### LINEAR REGRESSION 

In [None]:
# Single-feature linear regression: price as a function of sqft_living.
y = df['price'].values
X = df.drop('price', axis=1)

# Reshape to 2-D single-column arrays of shape (n_samples, 1); the feature
# matrix passed to fit/predict must be 2-D.
y = y.reshape(-1, 1)
X_sqft_living = X['sqft_living'].values.reshape(-1, 1)

_ = plt.scatter(X['sqft_living'], y, color='blue')
_ = plt.ylabel('Price Expectancy')
_ = plt.xlabel('Sqft_living')

# -----------------------
# Import LinearRegression
from sklearn.linear_model import LinearRegression

# Create the regressor: reg
reg = LinearRegression()

# Create the prediction space: evenly spaced sqft_living values spanning the
# observed range. ndarray.min()/max() return scalars directly, instead of
# running Python's builtin min/max row by row over a 2-D array.
prediction_space = np.linspace(X_sqft_living.min(), X_sqft_living.max()).reshape(-1, 1)

# Fit the model to the data
reg.fit(X_sqft_living, y)

# Compute predictions over the prediction space: y_pred
y_pred = reg.predict(prediction_space)

# Print R^2 (computed on the same data the model was fitted on)
print(reg.score(X_sqft_living, y))

# Plot regression line
plt.plot(prediction_space, y_pred, color='black', linewidth=3)
plt.show()

In [None]:
# Hold out 30% of the rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Linear regression on every column of X.
reg_all = LinearRegression()
reg_all.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = reg_all.predict(X_test)

# Report R^2 and RMSE on the held-out rows.
r_squared = reg_all.score(X_test, y_test)
print(f"R^2: {r_squared}")
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")

### DECISION TREE REGRESSION

In [None]:
# Feature matrix (five selected columns) and price target as NumPy arrays.
tree_feature_columns = ['bedrooms', 'bathrooms', 'sqft_living', 'view', 'condition']
X = df[tree_feature_columns].values
y = df['price'].values

# 70% of the rows go to training, the rest to testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=1
)

In [None]:
# price is a continuous target, so a regression tree is fitted here; a
# classifier would treat every distinct price as its own class.
# random_state is fixed so repeated runs give the same tree.
dtc = DecisionTreeRegressor(random_state=42)
dtc.fit(X_train, y_train)
# For a regressor, score() is the R^2 on the held-out rows.
dtc.score(X_test, y_test)

In [None]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)

clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
max_depth=3, min_samples_leaf=5)
clf_entropy.fit(X_train, y_train)

In [None]:
y_pred = clf_gini.predict(X_test)
y_pred

In [None]:
print ("Accuracy is :", accuracy_score(y_test,y_pred)*100)

### RANDOM FOREST REGRESSION 

In [None]:
# Columns of interest for the random forest section and the matching slice of df.
selected_features = ['bedrooms', 'bathrooms', 'sqft_living', 'view', 'condition']
t_val_subset = df.loc[:, selected_features]

In [None]:
# Re-split the current X / y: 20% of the rows are held out (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=13,
)

In [None]:
# 3000 trees are expensive to fit one after another; n_jobs=-1 builds them in
# parallel on all available cores. random_state stays fixed so the forest is
# reproducible.
clf = RandomForestRegressor(n_estimators = 3000, random_state = 10, n_jobs = -1)
# Train the model on training data
clf.fit(X_train, y_train)

In [None]:
# Use the forest's predict method on the test data
prediction_s = clf.predict(X_test)
# Absolute error of every prediction, in the same unit as the price column
errors_s = np.abs(prediction_s - y_test)
# Print out the mean absolute error (MAE). The target is a price, so the
# former 'degrees.' unit label was wrong and is no longer printed.
print('Mean Absolute Error:', round(np.mean(errors_s), 2))

In [None]:
# Calculate mean absolute percentage error (MAPE): each absolute error is
# expressed as a percentage of the true value, i.e. error DIVIDED by y_test.
# NOTE(review): assumes y_test contains no zero prices - confirm against the data.
mape = 100 * (errors_s / y_test)
# Calculate and display accuracy as 100% minus the mean percentage error
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

### SVR

In [None]:
# Support vector regressor with a linear kernel, fitted on the current split.
svr_linear = SVR(
    kernel='linear',
    gamma='scale',
    C=1.0,
    epsilon=0.1,
)
svr_linear.fit(X_train, y_train)

In [None]:
# R^2 of the linear-kernel SVR on the held-out rows.
svr_linear.score(X_test,y_test)

In [None]:
# Ordinary linear regression on the same split, as a baseline for the SVR models.
linear = LinearRegression()
linear.fit(X_train,y_train)

In [None]:
# R^2 of the linear regression baseline on the held-out rows.
linear.score(X_test,y_test)

In [None]:
# Support vector regressor with an RBF kernel, fitted on the current split.
svr_rbf = SVR(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    epsilon=0.1,
)
svr_rbf.fit(X_train, y_train)

In [None]:
# R^2 of the RBF-kernel SVR on the held-out rows.
svr_rbf.score(X_test,y_test)

In [None]:
# Root mean squared error of both SVR models on the held-out rows.
svr_linear_rmse = np.sqrt(mean_squared_error(y_test, svr_linear.predict(X_test)))
svr_rbf_rmse = np.sqrt(mean_squared_error(y_test, svr_rbf.predict(X_test)))

print("RMSE for linear SVR:", svr_linear_rmse)
print("RMSE for RBF kernelized SVR:", svr_rbf_rmse)