In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
train = pd.read_csv('train.csv')
train.dropna(inplace = True)
dogs = train[train['Type'] == 1]

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

def get_compound(s):
    sia = SIA()
    pol_score = sia.polarity_scores(s)
    return pol_score['compound']

In [18]:


dogs['DescLength'] = dogs['Description'].apply(lambda x: len(x))
dogs['DescNumWords'] = dogs['Description'].apply(lambda x: x.count(' ') + 1)
dogs['DescPolite'] = dogs['Description'].apply(lambda x: x.count('please') + x.count('Please') + x.count('thank') + x.count('Thank'))
dogs['DescNumAdj'] = dogs['Description'].apply(lambda x: len([token for token, pos in nltk.pos_tag(nltk.word_tokenize(x)) if pos.startswith('JJ')]))
dogs['DescNumVerb'] = dogs['Description'].apply(lambda x: len([token for token, pos in nltk.pos_tag(nltk.word_tokenize(x)) if pos.startswith('VB')]))
dogs['DescCompound'] = dogs['Description'].apply(lambda x: get_compound(x))
dogs['AdoptedOrNot'] = dogs['AdoptionSpeed'].apply(lambda x: 1 if x < 4 else 0)

dogs = dogs.sample(frac=1).reset_index(drop=True)

In [19]:
train_final = dogs.head(4488)
eval_final = dogs.tail(2992)

In [22]:
train_final.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'DescLength', 'DescNumWords', 'DescPolite', 'DescNumAdj', 'DescNumVerb',
       'DescCompound', 'AdoptedOrNot'],
      dtype='object')

In [37]:
features = ['DescPolite', 'DescNumWords', 'DescLength', 'DescNumAdj', 'DescNumVerb', 'DescCompound']
X = train_final.loc[:, features]
y = train_final['AdoptedOrNot']

In [38]:
X_eval = eval_final.loc[:, features]
y_eval = eval_final['AdoptedOrNot']

In [39]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X, y)
print('train score is %f      eval score is %f' %(clf.score(X, y), clf.score(X_eval, y_eval)))

train score is 0.995321      eval score is 0.640709


In [40]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 1)
neigh.fit(X, y)
print('train score is %f      eval score is %f' %(neigh.score(X, y), neigh.score(X_eval, y_eval)))

train score is 0.993761      eval score is 0.632687


In [41]:
from sklearn.svm import LinearSVC
linsvc = LinearSVC()
linsvc.fit(X, y)
print('train score is %f      eval score is %f' %(linsvc.score(X, y), linsvc.score(X_eval, y_eval)))

train score is 0.706105      eval score is 0.719586


In [42]:
from sklearn.svm import SVC
clf_svc = SVC(kernel = 'rbf')
clf_svc.fit(X, y)
print('train score is %f      eval score is %f' %(clf_svc.score(X, y), clf_svc.score(X_eval, y_eval)))

train score is 0.902184      eval score is 0.728275


In [43]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(C = 0.1)
log_reg.fit(X, y)
print('train score is %f      eval score is %f' %(log_reg.score(X, y), log_reg.score(X_eval, y_eval)))

train score is 0.706774      eval score is 0.718917


In [44]:
dogs['AdoptedOrNot'].sum()

5323

In [45]:
dogs.shape

(7480, 31)

In [46]:
dogs['AdoptionSpeed'].value_counts()

4    2157
2    2010
3    1848
1    1310
0     155
Name: AdoptionSpeed, dtype: int64

In [47]:
2157 / 7480

0.28836898395721927

In [7]:
dogs[dogs['AdoptionSpeed'] == 4]['Age'].max()

255

In [8]:
dogs[dogs['AdoptionSpeed'] < 4]['Age'].max()

168