In [1]:
# Importing the neccessary libraries
import numpy as np
import pandas as pd
from sodapy import Socrata
from sklearn import preprocessing

## Reading the dataset

In [2]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.sonomacounty.ca.gov", None)

# First 30000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("924a-vesw", limit=30000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)



In [3]:
results_df.head()

Unnamed: 0,name,type,breed,color,sex,size,date_of_birth,impound_number,kennel_number,id,...,outcome_type,outcome_subtype,intake_condition,outcome_condition,intake_jurisdiction,outcome_jurisdiction,zip_code,location,intake_total,:@computed_region_dig5_f3vy
0,CUPCAKE,DOG,MALTESE/POODLE TOY,WHITE,Spayed,TOY,2014-10-06T00:00:00.000,K23-045263,DA04,A328255,...,ADOPTION,SCAS WEB,UNKNOWN,PENDING,SANTA ROSA,COUNTY,95441.0,"{'latitude': '38.70854', 'longitude': '-122.90...",1,23750.0
1,,CAT,DOMESTIC SH,BLACK,Female,SMALL,2017-08-08T00:00:00.000,K23-045663,CS05,A419545,...,,,UNKNOWN,,COUNTY,,,,1,
2,LOLI,DOG,CHIHUAHUA SH,WHITE/BROWN,Spayed,TOY,2010-08-31T00:00:00.000,K23-045646,CS04,A267380,...,,,UNKNOWN,,SANTA ROSA,SANTA ROSA,95401.0,"{'latitude': '38.44366000000008', 'longitude':...",1,26075.0
3,LUCY,DOG,CHIHUAHUA SH/MIX,BLACK/BROWN,Spayed,SMALL,2020-08-04T00:00:00.000,K23-045635,DS57,A419489,...,,,UNKNOWN,,COUNTY,COUNTY,95436.0,"{'latitude': '38.47872000000007', 'longitude':...",1,23747.0
4,,CAT,SIAMESE,SEAL PT,Female,SMALL,2019-08-08T00:00:00.000,K23-045662,CS05,A419544,...,,,UNKNOWN,,COUNTY,,,,1,


In [11]:
results_df[results_df['type']=='OTHER']['breed'].unique()

array(['POTBELLY PIG', 'CHICKEN', 'RABBIT SH', 'BAT', 'PEAFOWL',
       'PARAKEET', 'MOUSE', 'GOAT', 'KOI', 'GUINEA PIG', 'PIG', 'DUCK',
       'HAMSTER', 'REX', 'MULE', 'SHEEP', 'LOP-MINI', 'PIGEON', 'RAT',
       'BAT/MEX FREE-TAIL', 'DUCK/MUSCOVY', 'AMERICAN', 'HARLEQUIN/MIX',
       'LOP-MINI/MIX', 'RABBIT LH', 'LOP-ENGLISH', 'PALOMINO', 'TURKEY',
       'LOP-HOLLAND', 'HORSE', 'COCKATIEL', 'RACCOON', 'PERUVIAN PASO',
       'TURTLE', 'GOOSE', 'ANGORA-SATIN/MIX', 'AMERICAN/REX', 'FINCH',
       'CHINCHILLA', 'PARROT', 'DUTCH', 'FLEMISH GIANT', 'SQUIRREL',
       'LOP-FRENCH', 'CALIFORNIAN', 'SKUNK', 'QUARTER HORSE', 'LILAC',
       'LOP-HOLLAND/MIX', 'OPOSSUM', 'BANTAM', 'GOAT/NUBIAN',
       'AMERICAN SABLE/MIX', 'DOVE', 'PONY', 'LOP-AMER FUZZY',
       'ENGLISH SPOT/MIX', 'CHECKERED GIANT', 'HIMALAYAN/MIX', 'REX/MIX',
       'ANGORA-SATIN', 'DUCK/CAYUGA', 'GOAT/BOER', 'NETHERLND DWARF',
       'NEW ZEALAND WHT', 'BOER', 'RHINELANDER/MIX', 'TORTOISE', 'ANGUS',
       'CANARY', 'QU

In [4]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25866 entries, 0 to 25865
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   name                         19077 non-null  object
 1   type                         25866 non-null  object
 2   breed                        25866 non-null  object
 3   color                        25866 non-null  object
 4   sex                          25866 non-null  object
 5   size                         25831 non-null  object
 6   date_of_birth                19447 non-null  object
 7   impound_number               25866 non-null  object
 8   kennel_number                25840 non-null  object
 9   id                           25866 non-null  object
 10  intake_date                  25866 non-null  object
 11  outcome_date                 25643 non-null  object
 12  days_in_shelter              25866 non-null  object
 13  intake_type                  25

In [5]:
# Checking missing values
results_df.isna().sum()

name                           6789
type                              0
breed                             0
color                             0
sex                               0
size                             35
date_of_birth                  6419
impound_number                    0
kennel_number                    26
id                                0
intake_date                       0
outcome_date                    223
days_in_shelter                   0
intake_type                       0
intake_subtype                    0
outcome_type                    229
outcome_subtype                 549
intake_condition                  0
outcome_condition               576
intake_jurisdiction               0
outcome_jurisdiction           3722
zip_code                       3770
location                       3770
intake_total                      0
:@computed_region_dig5_f3vy    3785
dtype: int64

## Creating extra features 

In [6]:
# Creating "has_name" boolean column from "name" column
results_df["has_name"] = results_df["name"].apply(lambda x: 0 if pd.isnull(x) else 1)
results_df.drop("name", axis=1, inplace=True)

In [7]:
# Calculating the "age" column

# Converting all the date columns into datetime format
results_df["date_of_birth"] = pd.to_datetime(results_df['date_of_birth'])
results_df["intake_date"] = pd.to_datetime(results_df['intake_date'])

# Calculate age by subtracting date_of_birth from intake_date
results_df['age'] = (results_df['intake_date'] - results_df['date_of_birth']).dt.days // 365

# Convert age column to bins and set NaN values to 'UNK'
age_bins = [0, 3, 6, 9, float('inf')]
age_labels = ['0-3', '3-6', '6-9', '9+']

# Create Categorical column with specified categories
results_df['age'] = pd.cut(results_df['age'], bins=age_bins, labels=age_labels, right=False)

# Set NaN values in age_bin to 'UNK'
results_df['age'] = results_df['age'].cat.add_categories(['UNK']).fillna('UNK')

results_df.head()

Unnamed: 0,type,breed,color,sex,size,date_of_birth,impound_number,kennel_number,id,intake_date,...,intake_condition,outcome_condition,intake_jurisdiction,outcome_jurisdiction,zip_code,location,intake_total,:@computed_region_dig5_f3vy,has_name,age
0,DOG,MALTESE/POODLE TOY,WHITE,Spayed,TOY,2014-10-06,K23-045263,DA04,A328255,2023-07-05,...,UNKNOWN,PENDING,SANTA ROSA,COUNTY,95441.0,"{'latitude': '38.70854', 'longitude': '-122.90...",1,23750.0,1,6-9
1,CAT,DOMESTIC SH,BLACK,Female,SMALL,2017-08-08,K23-045663,CS05,A419545,2023-08-08,...,UNKNOWN,,COUNTY,,,,1,,0,6-9
2,DOG,CHIHUAHUA SH,WHITE/BROWN,Spayed,TOY,2010-08-31,K23-045646,CS04,A267380,2023-08-07,...,UNKNOWN,,SANTA ROSA,SANTA ROSA,95401.0,"{'latitude': '38.44366000000008', 'longitude':...",1,26075.0,1,9+
3,DOG,CHIHUAHUA SH/MIX,BLACK/BROWN,Spayed,SMALL,2020-08-04,K23-045635,DS57,A419489,2023-08-04,...,UNKNOWN,,COUNTY,COUNTY,95436.0,"{'latitude': '38.47872000000007', 'longitude':...",1,23747.0,1,3-6
4,CAT,SIAMESE,SEAL PT,Female,SMALL,2019-08-08,K23-045662,CS05,A419544,2023-08-08,...,UNKNOWN,,COUNTY,,,,1,,0,3-6


## Dropping unncessary records and columns

In [8]:
# Dropping rows with missing values in "size" and "outcome_type" columns
results_df.dropna(subset=['size'], inplace=True)
results_df.dropna(subset=['outcome_type'], inplace=True)

In [9]:
# Dropping the columns which are not required
results_df.drop(["date_of_birth", "impound_number", "kennel_number", "id", "outcome_subtype", "outcome_condition", 
                 "outcome_jurisdiction", "outcome_date", "zip_code", "location", ":@computed_region_dig5_f3vy"],
                axis=1, inplace=True)

In [10]:
# Coverting the column type for "days_in_shelter" into int format
results_df["days_in_shelter"] = results_df["days_in_shelter"].astype(int)

In [11]:
# Convert the "intake__date" column type to epoch time ---- WHY
results_df['intake_date'] = results_df['intake_date'].astype('int64') // 10**9

In [12]:
results_df.head()

Unnamed: 0,type,breed,color,sex,size,intake_date,days_in_shelter,intake_type,intake_subtype,outcome_type,intake_condition,intake_jurisdiction,intake_total,has_name,age
0,DOG,MALTESE/POODLE TOY,WHITE,Spayed,TOY,1688515200,34,STRAY,FIELD,ADOPTION,UNKNOWN,SANTA ROSA,1,1,6-9
5,CAT,DOMESTIC SH,ORG TABBY/WHITE,Spayed,KITTN,1685404800,70,STRAY,OVER THE COUNTER,ADOPTION,UNKNOWN,COUNTY,1,1,0-3
6,CAT,DOMESTIC SH,BRN TABBY/WHITE,Female,SMALL,1689379200,24,STRAY,OVER THE COUNTER,TRANSFER,UNKNOWN,SANTA ROSA,1,1,9+
7,DOG,POODLE MIN,WHITE,Neutered,SMALL,1682380800,85,STRAY,FIELD,ADOPTION,UNKNOWN,SANTA ROSA,1,1,UNK
9,DOG,PUG,FAWN,Male,SMALL,1691452800,0,STRAY,OVER THE COUNTER,RETURN TO OWNER,HEALTHY,COUNTY,1,1,0-3


## Splitting data into predictor and target variables

In [13]:
X = results_df.drop("outcome_type", axis=1)
y = results_df["outcome_type"]

In [14]:
# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
numerical_columns = X.select_dtypes(include=['int64', 'int32']).columns

print("Categorical Columns:\n", categorical_columns)
print("\nNuerical Columns:\n", numerical_columns)

Categorical Columns:
 Index(['type', 'breed', 'color', 'sex', 'size', 'intake_type',
       'intake_subtype', 'intake_condition', 'intake_jurisdiction',
       'intake_total', 'age'],
      dtype='object')

Nuerical Columns:
 Index(['intake_date', 'days_in_shelter', 'has_name'], dtype='object')


## Applying label encoding to all categorical columns 

In [15]:
# label_encoder object knows 
# how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels
X_label = X[categorical_columns].apply(label_encoder.fit_transform)
# y_label = label_encoder.fit_transform(y)

In [16]:
X_label.head()

Unnamed: 0,type,breed,color,sex,size,intake_type,intake_subtype,intake_condition,intake_jurisdiction,intake_total,age
0,1,650,293,3,5,6,3,3,11,1,2
5,0,409,218,3,0,6,20,3,9,1,0
6,0,409,94,0,4,6,20,3,11,1,3
7,1,805,293,2,4,6,3,3,11,1,4
9,1,829,153,1,4,6,20,0,9,1,0


In [17]:
processed_data = X_label.join(X[numerical_columns])
processed_data = processed_data.join(y)
processed_data = processed_data.reset_index(drop = True)

processed_data.head()

Unnamed: 0,type,breed,color,sex,size,intake_type,intake_subtype,intake_condition,intake_jurisdiction,intake_total,age,intake_date,days_in_shelter,has_name,outcome_type
0,1,650,293,3,5,6,3,3,11,1,2,1688515200,34,1,ADOPTION
1,0,409,218,3,0,6,20,3,9,1,0,1685404800,70,1,ADOPTION
2,0,409,94,0,4,6,20,3,11,1,3,1689379200,24,1,TRANSFER
3,1,805,293,2,4,6,3,3,11,1,4,1682380800,85,1,ADOPTION
4,1,829,153,1,4,6,20,0,9,1,0,1691452800,0,1,RETURN TO OWNER


## Saving the final processed data

In [18]:
# processed_data.to_csv("processed_data.csv", index=False)