First, connect to the database and fetch all available data.

In [2]:
import psycopg2

# Connection parameters. They are left blank in this file and must be
# filled in before the cell is run.
dbname = ""
user = ""
password = ""
host = ""
port = ""

try:
    connection = psycopg2.connect(
        dbname=dbname, user=user, password=password, host=host, port=port
    )
    print("Connected to the PostgreSQL database!")
except Exception as error:  # psycopg2.Error is itself an Exception subclass
    print("Error while connecting to PostgreSQL:", error)
    connection = None


if connection:
    # try/finally guarantees the connection is closed even when one of the
    # queries raises; the cursor context managers do the same for cursors.
    try:
        # One row per pet: (breed, age, weight, gender, date_in, date_out).
        query = "select breed, age, weight, gender, date_in, date_out from webscraper_pet;"
        with connection.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()

        # "select distinct" alone returns rows in an unspecified order, which
        # would make the breed -> integer mapping below differ between runs.
        # Ordering by breed keeps the encoding stable.
        query = "select distinct breed from webscraper_pet order by breed;"
        with connection.cursor() as cursor:
            cursor.execute(query)
            breeds = cursor.fetchall()

        # Map every breed name to an integer id (each fetched row is a 1-tuple).
        breeds = {breed[0]: i for i, breed in enumerate(breeds)}

        # Inverse lookup: integer id -> breed name.
        breeds_reverse = {v: k for k, v in breeds.items()}

        print(len(results))
        print(len(breeds))

        print(results[:10])
    finally:
        connection.close()
        print("PostgreSQL connection is closed.")


Connected to the PostgreSQL database!
28768
210
[('Mieszaniec', 6, 12, 'm', datetime.date(2023, 5, 27), datetime.date(2023, 5, 31)), ('Amstaff/Pitbull', 24, 25, 'f', datetime.date(2022, 9, 13), datetime.date(2023, 6, 1)), ('Mieszaniec', 12, 16, 'f', datetime.date(2023, 5, 22), datetime.date(2023, 5, 30)), ('Mieszaniec', 12, 9, 'f', datetime.date(2023, 5, 15), datetime.date(2023, 6, 1)), ('Europejska', 12, 4, 'f', datetime.date(2023, 5, 30), datetime.date(2023, 6, 1)), ('Mieszaniec', 48, 8, 'm', datetime.date(2023, 4, 4), datetime.date(2023, 6, 1)), ('Europejska', 24, 3, 'f', datetime.date(2023, 5, 8), datetime.date(2023, 5, 30)), ('Amstaff', 72, 35, 'f', datetime.date(2022, 11, 4), datetime.date(2023, 6, 1)), ('Bulterier', 24, 25, 'm', datetime.date(2022, 8, 31), datetime.date(2023, 6, 1)), ('Mieszaniec', 120, 9, 'm', datetime.date(2023, 5, 11), datetime.date(2023, 6, 1))]
PostgreSQL connection is closed.


Secondly, prepare the data for a k-nearest-neighbours (k-NN) classifier.
`results_preprocessed` becomes a list of rows containing breed, age, weight, gender and months spent in the shelter.
We also remove every class (number of months) that has only a single sample.
We then use a stratified split to create the train and test datasets.

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split


# Encode every database row as
# [breed id, age, weight, gender flag, months spent in the shelter].
results_preprocessed = []

for breed, age, weight, gender, date_in, date_out in results:
    days_in_shelter = (date_out - date_in).days

    # A negative stay means date_out precedes date_in; skip such rows
    # before building anything for them.
    if days_in_shelter < 0:
        continue

    results_preprocessed.append([
        breeds[breed],
        age,
        weight,
        0 if gender == 'm' else 1,  # 'm' -> 0, any other value -> 1
        days_in_shelter // 30,      # completed 30-day periods ("months")
    ])

results_preprocessed = np.array(results_preprocessed)

# The last column (months spent in the shelter) is the class label.
labels = results_preprocessed[:, -1]

# classes[0] holds the distinct labels, classes[1] how often each occurs.
classes = np.unique(labels, return_counts=True)

# A stratified split needs at least two samples per class, so keep only the
# rows whose label occurs more than once. The boolean mask preserves row order.
repeated_labels = classes[0][classes[1] != 1]
results_preprocessed = results_preprocessed[np.isin(labels, repeated_labels)]

# split results into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    results_preprocessed[:, :-1], results_preprocessed[:, -1], test_size=0.2, stratify=results_preprocessed[:, -1])

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)



(22998, 4)
(5750, 4)
(22998,)
(5750,)


We train a k-NN classifier for different numbers of neighbours, assess the accuracy of each classifier on the test set, and show the results as a graph.

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib as mpl
mpl.use('tkagg')
import matplotlib.pyplot as plt
import numpy as np

# Candidate neighbour counts, k = 1 .. 29; shared by the loop and the plot
# so the x values always match the scores.
k_values = np.arange(1, 30)

# Test-set accuracy for each k, in the same order as k_values.
acc_score = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)

    acc_score.append(accuracy_score(y_test, predictions))

plt.plot(k_values, acc_score, 'o')
plt.xlabel("Number of neighbors")
plt.ylabel("Accuracy score")
plt.show()

The results are shown in this image.
![diagram](./Figure_1.png)  
The predictions are not very accurate. Let's try to understand why.

In [16]:
# Column slices of results_preprocessed; index 4 is the last column, the
# months-in-shelter value used as the label.
only_age_and_weight = results_preprocessed[:, [1, 2, 4]]
only_weight = results_preprocessed[:, [2, 4]]
only_age = results_preprocessed[:, [1, 4]]

# Scatter column 1 (x) against column 2 (y), colouring each point by its label.
x_values, y_values, point_colors = only_age_and_weight.T
plt.scatter(x_values, y_values, c=point_colors)
plt.show()

We get this graph, where the x axis is the age of the animal, the y axis is its weight, and the color indicates the time spent in the shelter. One image shows the whole graph and the other is a close-up.  
![figure_2](./Figure_2.png)  
![figure_3](./Figure_3.png)  
No clear correlation is visible. This suggests that k-NN may not be suitable for this estimate.