In [3]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [6]:
# Load the raw survey export. Judging by the preview below, the leading columns
# describe the cat and the trailing ones hold the owner's self-description items.
df = pd.read_excel("../pet-set.xlsx")

In [264]:
# Peek at the first three rows to check that the sheet was parsed as expected.
df.head(3)

Unnamed: 0,How old is your cat (years)?,How long have you owned your cat (years)?,Is your cat male or female?,Where did you get your cat from?,Other (please specify),Is your cat neutered?,What breed is your cat?,Other (please specify).1,Does your cat have access to the outdoors?,"If yes, does your cat have constant access, or is this restricted in some way (i.e. the cat is kept in at night or is only let out when someone is home)",...,i see myself as someone who...Prefers work that is routine,"i see myself as someone who...Is outgoing, sociable",i see myself as someone who...Is sometimes rude to others,i see myself as someone who...Makes plans and follows through with them,i see myself as someone who...Gets easily nervous,i see myself as someone who...Likes to reflect and play with ideas,i see myself as someone who...Has few artistic interests,i see myself as someone who...Likes to cooperate with others,i see myself as someone who...Is easily distracted,"i see myself as someone who...Is sophisticated in art, music or literature"
0,6,6,Male,Other (please specify),Knew his mum,Yes,Domestic short/long hair,,Yes,Constant access,...,Disagree a little,Agree a little,Disagree strongly,Disagree a little,Agree a little,Disagree a little,Disagree a little,Agree a little,Agree strongly,Agree a little
1,2,1,Female,Rehoming centre,,Yes,Domestic short/long hair,,Yes,Restricted access,...,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,Agree strongly,Agree strongly,Agree a little,Neither agree nor disagree,Neither agree nor disagree,Disagree strongly,Agree strongly
2,5,5,Male,Friend/neighbour,,Yes,Domestic short/long hair,,No,,...,,,,,,,,,,


In [352]:
# Answer distribution for each of the four statements that express satisfaction.
satisfaction_items = [
    'My cat has met all my expectations',
    'I am happy with my cat',
    'I feel my cat is happy living with me',
    'I have considered relinquishing or rehoming this cat to someone else',
]
for item in satisfaction_items:
    print(df[item].value_counts())

Strongly agree                2156
Agree                          722
Neither agree nor disagree     272
Disagree                        86
Strongly disagree               12
Name: My cat has met all my expectations, dtype: int64
Strongly agree                2794
Agree                          395
Neither agree nor disagree      39
Disagree                        16
Strongly disagree                4
Name: I am happy with my cat, dtype: int64
Strongly agree                2555
Agree                          627
Neither agree nor disagree      56
Disagree                         7
Strongly disagree                3
Name: I feel my cat is happy living with me, dtype: int64
Strongly disagree             2982
Disagree                       169
Agree                           57
Neither agree nor disagree      33
Strongly agree                   7
Name: I have considered relinquishing or rehoming this cat to someone else, dtype: int64


In [353]:
# Encode the Likert answers as 1-5 scores in which 5 is always the most satisfied end.
# Series.map is used instead of Series.replace: it always yields a numeric column
# (missing or unexpected answers become NaN) and does not rely on the object-dtype
# downcasting that newer pandas versions warn about in replace().
likert_scores = {
    'Strongly agree': 5,
    'Agree': 4,
    'Neither agree nor disagree': 3,
    'Disagree': 2,
    'Strongly disagree': 1,
}
# The rehoming statement is negatively worded, so its scale is mirrored (1 <-> 5, 2 <-> 4).
likert_scores_reversed = {answer: 6 - score for answer, score in likert_scores.items()}

df['expectations'] = df['My cat has met all my expectations'].map(likert_scores)
df['happy_owner'] = df['I am happy with my cat'].map(likert_scores)
df['happy_cat'] = df['I feel my cat is happy living with me'].map(likert_scores)
df['rehome'] = df['I have considered relinquishing or rehoming this cat to someone else'].map(likert_scores_reversed)

### Variable describing the satisfaction level; it could be rescaled to a percentage, but that is unnecessary for now

In [356]:
# Overall satisfaction: the four 1-5 scores added together, so totals range from 4 to 20.
# A missing component propagates, leaving the total NaN for that row.
df['satisf_level'] = (
    df['expectations']
    .add(df['happy_owner'])
    .add(df['happy_cat'])
    .add(df['rehome'])
)
df['satisf_level'].describe()

In [357]:
# How often each total score occurs, most frequent first.
df['satisf_level'].value_counts()

20.0    1940
19.0     491
18.0     286
17.0     250
16.0     133
15.0      62
14.0      38
13.0      13
12.0      12
11.0       8
10.0       5
8.0        5
9.0        2
7.0        2
5.0        1
Name: satisf_level, dtype: int64

In [359]:
#df['rehome'].value_counts().sort_index()
#df['happy_owner'].value_counts().sort_index()
#df['happy_cat'].value_counts().sort_index()
#df['expectations'].value_counts().sort_index()

# Owners

In [281]:
# Slice with the owners' data: every column from position 58 to the end of df.
# NOTE(review): the slice is open-ended, so it also picks up any column appended to
# df before this cell runs (the owners preview further down contains expectations,
# happy_owner, happy_cat, rehome and satisf_level) - the result depends on
# execution order; confirm that this is intended.
owners = df.iloc[:, 58:]

In [282]:
# Rows and columns before incomplete rows are removed.
owners.shape

(3331, 52)

In [283]:
# Keep only the rows that have no missing value in any owner column.
owners = owners.dropna()

In [360]:
# Size after dropping incomplete rows.
# NOTE(review): the saved output reports 53 columns against 52 before dropna(), which
# dropna() cannot cause - presumably this cell was last executed after owner_id had
# been added. Re-run the notebook top to bottom to confirm the numbers.
owners.shape

(3164, 53)

In [325]:
# Create the owner ids from the row index; the 30000 offset is only there to make
# them distinguishable from the cat ids.
# The ids are computed in one vectorised step on the index, so no temporary 'id'
# column, row-wise apply or in-place drop is needed; assign() returns a new frame
# and leaves the cell safe to re-run.
owners = owners.assign(owner_id=owners.index + 30000)

In [326]:
# Check that owner_id now appears as the last column.
owners.head()

Unnamed: 0,Gender,Age,How many other cats live in the household (not including the cat you have described),I see myself as someone who.....,I see myself as someone who.....Tends to find fault with others,I see myself as someone who.....Does a thorough job,"I see myself as someone who.....Is depressed, blue","I see myself as someone who.....Is original, comes up with new ideas",I see myself as someone who.....Is reserved,I see myself as someone who.....Is helpful and unselfish with others,...,i see myself as someone who...Has few artistic interests,i see myself as someone who...Likes to cooperate with others,i see myself as someone who...Is easily distracted,"i see myself as someone who...Is sophisticated in art, music or literature",expectations,happy_owner,happy_cat,rehome,satisf_level,owner_id
0,Female,25-34,2.0,Agree a little,Agree a little,Agree a little,Agree a little,Disagree a little,Agree a little,Agree strongly,...,Disagree a little,Agree a little,Agree strongly,Agree a little,5.0,5.0,5.0,5.0,20.0,30000
1,Female,25-34,0.0,Agree a little,Agree a little,Agree strongly,Disagree a little,Agree a little,Agree strongly,Agree a little,...,Neither agree nor disagree,Neither agree nor disagree,Disagree strongly,Agree strongly,5.0,5.0,5.0,5.0,20.0,30001
3,Female,18-24,1.0,Agree a little,Agree a little,Neither agree nor disagree,Agree strongly,Agree a little,Agree a little,Agree a little,...,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree strongly,5.0,5.0,5.0,5.0,20.0,30003
4,Female,25-34,3.0,Disagree a little,Agree a little,Agree a little,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,...,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,5.0,5.0,5.0,5.0,20.0,30004
5,Female,45-54,2.0,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Agree a little,Agree a little,Neither agree nor disagree,Agree a little,...,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,5.0,5.0,5.0,5.0,20.0,30005


# Cats

The cat columns get almost the same treatment as the owner columns above.

In [327]:
# Slice with the cats' data: the first 53 columns of the survey.
# NOTE(review): the owners slice starts at column 58, so columns 53-57 end up in
# neither frame - presumably intentional; confirm.
cats = df.iloc[:, 0:53]

In [328]:
# Rows and columns before any cleaning.
cats.shape

(3331, 53)

In [329]:
# Inspect the non-null count of every column to identify the variables with many
# NaN values; those are dropped in the next step.
cats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3331 entries, 0 to 3330
Data columns (total 53 columns):
 #   Column                                                                                                                                                                                                                 Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                 --------------  ----- 
 0   How old is your cat (years)?                                                                                                                                                                                           3331 non-null   int64 
 1   How long have you owned your cat (years)?                                                                                                                      

In [330]:
# Drop the variables with too many NaN values (positions 4, 7, 9, 12, 13, 15 and 16
# of the original survey layout).
# The labels are looked up on df rather than on cats: cats was built from df's first
# 53 columns, so the positions agree on the first run, and unlike cats.columns they
# do not shift once the columns are gone. Together with errors='ignore' this makes
# the cell safe to re-run instead of silently dropping seven further columns.
sparse_columns = df.columns[[4, 7, 9, 12, 13, 15, 16]]
cats = cats.drop(columns=sparse_columns, errors='ignore')

In [361]:
# Discard every row that still has at least one missing value, then report the new size.
cats = cats.dropna(axis=0, how='any')
cats.shape

(3259, 47)

In [362]:
# IDs for cats, derived from the row index; the 20000 offset keeps them in a
# different range from the owner ids (which start at 30000).
# Computed in one vectorised step on the index, so no temporary 'id' column,
# row-wise apply or in-place drop is needed.
cats = cats.assign(cat_id=cats.index + 20000)
cats.head()

Unnamed: 0,How old is your cat (years)?,How long have you owned your cat (years)?,Is your cat male or female?,Where did you get your cat from?,Is your cat neutered?,What breed is your cat?,Does your cat have access to the outdoors?,Does your cat have a litter tray?,Does your cat have any existing medical conditions?,Does your cat display any behavioural problems,...,"My cat would prefer to be left alone, rather than be with people",My cat likes being stroked,My cat is very tolerant to being handled,"If my cat could choose, it would prefer to have a bowl of food rather than interaction with me","My cat has negatively changed the way he/she interacts with me since I first acquired him/her (e.g. has become more fearful, behaves aggressively, is less friendly)","My cat has positively changed in the way he/she interacts with me since I first acquired him/her (e.g. has become less fearful, behaves less aggressively, is more friendly)",My cat behaves differently with strangers than he/she does with me,My cat behaves differently with me than he/she does with other (human) members of the household,My cat is friendly,cat_id
0,6,6,Male,Other (please specify),Yes,Domestic short/long hair,Yes,No,Yes,No,...,Disagree,Agree,Neither agree nor disagree,Agree,Strongly agree,Neither agree nor disagree,Agree,Disagree,Agree,20000
1,2,1,Female,Rehoming centre,Yes,Domestic short/long hair,Yes,Yes,No,No,...,Neither agree nor disagree,Agree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Neither agree nor disagree,Agree,Neither agree nor disagree,Strongly agree,20001
2,5,5,Male,Friend/neighbour,Yes,Domestic short/long hair,No,Yes,No,No,...,Neither agree nor disagree,Neither agree nor disagree,Disagree,Agree,Disagree,Agree,Agree,Neither agree nor disagree,Neither agree nor disagree,20002
3,4,4,Female,Friend/neighbour,Yes,Domestic short/long hair,Yes,Yes,No,No,...,Disagree,Strongly agree,Agree,Neither agree nor disagree,Strongly disagree,Strongly agree,Agree,Agree,Agree,20003
4,3,3,Female,Other (please specify),Yes,Domestic short/long hair,Yes,Yes,No,No,...,Neither agree nor disagree,Agree,Neither agree nor disagree,Agree,Strongly disagree,Strongly agree,Agree,Agree,Agree,20004


In [334]:
# Combined dataset, aligned on the row index, carrying both cat_id and owner_id.
# This is a left join: every cat row is kept, and a cat whose owner row was removed
# for containing NaN values gets NaN in all owner columns (owner_id included).
df_i = cats.join(owners, how='left')
df_i.head()

Unnamed: 0,How old is your cat (years)?,How long have you owned your cat (years)?,Is your cat male or female?,Where did you get your cat from?,Is your cat neutered?,What breed is your cat?,Does your cat have access to the outdoors?,Does your cat have a litter tray?,Does your cat have any existing medical conditions?,Does your cat display any behavioural problems,...,i see myself as someone who...Has few artistic interests,i see myself as someone who...Likes to cooperate with others,i see myself as someone who...Is easily distracted,"i see myself as someone who...Is sophisticated in art, music or literature",expectations,happy_owner,happy_cat,rehome,satisf_level,owner_id
0,6,6,Male,Other (please specify),Yes,Domestic short/long hair,Yes,No,Yes,No,...,Disagree a little,Agree a little,Agree strongly,Agree a little,5.0,5.0,5.0,5.0,20.0,30000.0
1,2,1,Female,Rehoming centre,Yes,Domestic short/long hair,Yes,Yes,No,No,...,Neither agree nor disagree,Neither agree nor disagree,Disagree strongly,Agree strongly,5.0,5.0,5.0,5.0,20.0,30001.0
2,5,5,Male,Friend/neighbour,Yes,Domestic short/long hair,No,Yes,No,No,...,,,,,,,,,,
3,4,4,Female,Friend/neighbour,Yes,Domestic short/long hair,Yes,Yes,No,No,...,Neither agree nor disagree,Neither agree nor disagree,Agree a little,Disagree strongly,5.0,5.0,5.0,5.0,20.0,30003.0
4,3,3,Female,Other (please specify),Yes,Domestic short/long hair,Yes,Yes,No,No,...,Disagree a little,Agree a little,Agree a little,Neither agree nor disagree,5.0,5.0,5.0,5.0,20.0,30004.0


In [365]:
# Drop the rows with NaN values, i.e. the cats left without owner data by the join.
# The left join upcast owner_id to float in order to hold those NaN values (it shows
# as 30000.0 in the preview); once they are gone, restore the integer dtype so the
# ids are plain integers again, like cat_id.
df_i = df_i.dropna().astype({'owner_id': 'int64'})
df_i.shape

(3164, 100)

# Split

In [366]:
# Hold out 20% of the joined rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split_options = {'test_size': 0.20, 'random_state': 42}
train_df, test_df = train_test_split(df_i, **split_options)
print('# Train set: %d' % train_df.shape[0])
print('# Test set: %d' % test_df.shape[0])

# Train set: 2531
# Test set: 633


# Trash Bin

This was prepared for collaborative filtering, but that approach does not seem to fit our case.

In [368]:
#satisfaction = df_i.iloc[:, [46,98,99]]
#satisfaction
#satisfaction['satisf_level'].value_counts()