In [1]:
from sklearn.model_selection import train_test_split
from config import *
from utils import *

## Read and Process Data

#### (a) Original Training Data with All Textual Features Removed

The name, description, rescuerID and petID columns have been removed.

In [2]:
# Load the original training file. get_data() returns a pair, unpacked here
# into the feature table X and the labels y.
X, y = get_data(TRAIN_PATH)
# Preview the first two rows of the feature table.
X.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt
0,2,3.0,299,0,1,1,7,0,1,1,2,2,2,1,1.0,100.0,41326,0.0,1.0
1,2,1.0,265,0,1,1,2,0,2,2,3,3,3,1,1.0,0.0,41401,0.0,2.0


In [3]:
# Hold out 20% of the rows for validation; RANDOM_STATE pins the shuffle so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Normalize the numerical columns of the training part only. The call hands back
# the transformed frame together with a scaler object, which is kept so the
# validation part can be transformed with it afterwards.
X_train, train_scaler = normalization(X_train, numerical_features)

# Preview the first two normalized training rows.
X_train.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt
14215,1,-0.419811,141,307,2,2,0,0,2,1,1,1,2,1,-0.370284,-0.276471,41326,-0.169331,0.286534
14019,1,-0.098076,307,0,2,1,2,0,2,1,3,3,3,3,-0.370284,-0.276471,41326,-0.169331,0.843164


In [4]:
# Normalize the validation rows with the scaler obtained from the training
# split (presumably transform-only when a scaler is passed — confirm in utils).
#
# The raw rows are looked up in X by index label on every run instead of
# feeding X_val back into normalization(). The previous form rebound X_val to
# its own normalized version, so running this cell a second time pushed already
# normalized values through the scaler again.
#
# NOTE(review): relies on X having unique index labels and on normalization()
# preserving them; both match the displayed outputs, but confirm.
X_val = normalization(X.loc[X_val.index].copy(), numerical_features, train_scaler)

# Preview the first two normalized validation rows.
X_val.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt
4924,2,-0.366189,281,295,2,1,6,0,2,1,1,1,1,1,-0.370284,-0.276471,41326,-0.169331,-0.270097
10850,1,0.384525,307,0,2,4,5,0,2,1,1,1,1,1,-0.370284,-0.276471,41326,-0.169331,-0.548412


#### (b) Data with Sentiment Scores Added

A sentiment score is generated for each sentence in a pet's description by using Google's Natural Language API. For each sentence, the API provides both a sentiment score and a sentiment magnitude.

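To calculate the overall sentiment of an entire description, the product of the sentiment score and the sentiment magnitude is computed for each sentence, and these products are summed. Positive and negative products are summed separately, giving the `PositiveSentimentScore` and `NegativeSentimentScore` columns.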

In [5]:
# Load the training file that also carries the sentiment columns. get_data()
# returns a pair, unpacked here into the feature table and the labels.
X_senti, y_senti = get_data(SENTIMENT_INCLUDED_TRAIN_PATH)
# Preview the first two rows of the feature table.
X_senti.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,PositiveSentimentScore,NegativeSentimentScore
0,2,3.0,299,0,1,1,7,0,1,1,...,2,2,1,1.0,100.0,41326,0.0,1.0,2.0,-0.2
1,2,1.0,265,0,1,1,2,0,2,2,...,3,3,1,1.0,0.0,41401,0.0,2.0,0.1,-0.6


In [6]:
# Same 80/20 hold-out as for the original data, applied to the
# sentiment-augmented table with the same RANDOM_STATE.
X_train_senti, X_val_senti, y_train_senti, y_val_senti = train_test_split(
    X_senti,
    y_senti,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Normalize the numerical columns plus the sentiment columns of the training
# part, keeping the returned scaler for the validation part.
X_train_senti, train_scaler_senti = normalization(
    X_train_senti,
    numerical_features + senti_features,
)

# Preview the first two normalized training rows.
X_train_senti.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,PositiveSentimentScore,NegativeSentimentScore
3665,2,-0.421537,266,266,3,1,4,6,1,2,...,2,2,1,3.28322,-0.271369,41326,-0.169777,0.289677,-1.001695,0.538373
11787,1,-0.367009,307,0,2,2,7,0,2,1,...,1,2,1,-0.364227,0.969286,41326,-0.169777,-0.270278,-0.419623,0.538373


In [7]:
# Normalize the sentiment-augmented validation rows with the scaler obtained
# from the matching training split (presumably transform-only when a scaler is
# passed — confirm in utils).
#
# The raw rows are looked up in X_senti by index label on every run instead of
# feeding X_val_senti back into normalization(). The previous form rebound
# X_val_senti to its own normalized version, so running this cell a second time
# pushed already normalized values through the scaler again.
#
# NOTE(review): relies on X_senti having unique index labels and on
# normalization() preserving them; both match the displayed outputs, but confirm.
X_val_senti = normalization(
    X_senti.loc[X_val_senti.index].copy(),
    numerical_features + senti_features,
    train_scaler_senti,
)

# Preview the first two normalized validation rows.
X_val_senti.head(2)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,PositiveSentimentScore,NegativeSentimentScore
9507,1,-0.530594,70,307,1,1,2,0,2,1,...,2,2,1,-0.364227,-0.271369,41326,-0.169777,1.129609,0.615172,-4.06949
14759,1,1.377895,307,0,1,1,0,0,2,2,...,1,1,1,-0.364227,-0.271369,41401,-0.169777,-0.270278,-0.484298,0.538373
