In [41]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
import pandas as pd

In [203]:
years = [pd.read_csv('%d.csv' % year) for year in range(2007, 2018)]

df = pd.concat(years)

df.DogName = df.DogName.apply(str.upper)

In [204]:
def no_mix(breed):
    return breed.rstrip(' MIX')

df['breed_no_mix'] = df.Breed.apply(no_mix)

In [205]:
breed_counts = df.breed_no_mix.value_counts()

In [206]:
name_counts = df.DogName.value_counts()

In [309]:
main_breeds = df[(df.breed_no_mix.map(breed_counts.to_dict()) > 500) 
                 & (df.Breed != 'MIXED')
                & (df.DogName.map(name_counts.to_dict()) > 100) ]

Unnamed: 0,LicenseType,Breed,Color,DogName,OwnerZip,ExpYear,ValidDate,breed_no_mix
1,Dog Individual Female,AM PIT BULL TERRIER,BROWN,SABLE,15001,2007,5/1/2007 15:15,AM PIT BULL TERRIER
8,Dog Senior Citizen or Disability Spayed Female,POMERANIAN,TAN,TAFFY,15003,2007,3/12/2007 15:57,POMERANIAN
9,Dog Individual Spayed Female,BEAGLE,SPOTTED,BELLE,15003,2007,1/26/2007 9:24,BEAGLE
10,Dog Individual License Duplicate,BEAGLE,SPOTTED,BELLE,15003,2007,1/26/2007 9:47,BEAGLE
11,Dog Individual Female,AM ESKIMO DOG,WHITE,SASHA,15003,2007,5/25/2007 11:31,AM ESKIMO DOG


In [104]:
cv = CountVectorizer(analyzer='char_wb', ngram_range=(3,3), min_df = 100)

features = cv.fit_transform(main_breeds.DogName)

In [105]:
pipeline_optimizer = TPOTClassifier(verbosity=3,
                                    config_dict='TPOT light')

X_train, X_test, y_train, y_test = train_test_split(features, main_breeds.breed_no_mix) 

19 operators have been imported by TPOT.


In [110]:
pipeline_optimizer.fit(X_train.toarray(), y_train)

                                                                                   



TPOT closed prematurely. Will use the current best pipeline.




RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [None]:
pipeline_optimizer.score()

In [91]:
from sklearn.linear_model import LogisticRegression

In [106]:
bob_log = LogisticRegression(class_weight='balanced')

bob_log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [107]:
bob_log.score(X_test, y_test)

0.10246734397677794

In [108]:
bob_log.predict(X_test)

array(['SIB HUSKY', 'TAG', 'CHIHUAHUA', ..., 'CHIHUAHUA', 'CHIHUAHUA',
       'DACHSHUND'], dtype=object)

In [109]:
features

<27559x376 sparse matrix of type '<type 'numpy.int64'>'
	with 95151 stored elements in Compressed Sparse Row format>

In [236]:
name_prop = main_breeds.DogName.value_counts()/len(main_breeds)

In [237]:
count_filter = main_breeds.groupby(['breed_no_mix', 'DogName'])['LicenseType'].count() >= 10

In [238]:
breed_proportions = main_breeds.groupby(['breed_no_mix', 'DogName'])['LicenseType'].count()/main_breeds.groupby('breed_no_mix')['LicenseType'].count()

In [239]:
breed_proportions = breed_proportions[count_filter]

In [240]:
pd.DataFrame(breed_proportions.subtract(name_prop, level='DogName').divide(breed_proportions)).reset_index().sort_values(by=0)


Unnamed: 0,breed_no_mix,DogName,0
142,AM PIT BULL TERRIER,MOLLY,-7.092801
131,AM PIT BULL TERRIER,MAGGIE,-6.107242
204,AM PIT BULL TERRIER,TOBY,-3.742046
103,AM PIT BULL TERRIER,JACK,-3.451040
3728,SHIH TZU,JAKE,-3.410901
778,BOXER,ANGEL,-3.193605
3707,SHIH TZU,DUKE,-3.040194
3669,SHIH TZU,BEAR,-2.976986
2574,LABRADOR RETRIEVER,BABY,-2.878976
1068,CHIHUAHUA,DUKE,-2.601129


In [182]:
from scipy.stats import chisquare

In [194]:
breed = 'CHIHUAHUA'

name = 'COOPER'

filtered = df[df['breed_no_mix'] == breed]
  
f_obs = [(filtered.DogName == name).sum(), (filtered.DogName != name).sum()]

f_exp = [(df.DogName == name).sum(), (df.DogName != name).sum()]

chisquare(f_obs, f_exp)

Power_divergenceResult(statistic=98686.285279397533, pvalue=0.0)

In [256]:
f_obs = (in_breed / len(in_breed)).values

f_exp = (all_breeds / len(in_breed)).values

In [257]:
all_breeds = main_breeds.groupby('DogName')['LicenseType'].count()[in_breed.index]

chisquare(f_obs, f_exp)

Power_divergenceResult(statistic=275.81147120854649, pvalue=0.013180549431962316)

In [339]:
names_by_breed = main_breeds.groupby(['breed_no_mix', 'DogName'])['LicenseType'].count()
names = main_breeds.groupby('DogName')['DogName'].count()
breeds = main_breeds.groupby('breed_no_mix')['breed_no_mix'].count()
tabulated = pd.DataFrame(names_by_breed).join(names).join(breeds)
tabulated.columns = ['name_in_breed', 'name_total', 'breed_total']
tabulated['total_dogs'] = len(main_breeds)
tabulated['name_not_breed'] = tabulated.name_total - tabulated.name_in_breed
tabulated['breed_not_name'] = tabulated.breed_total - tabulated.name_in_breed
tabulated['not_name_not_breed'] = tabulated.total_dogs - tabulated.name_not_breed
tabulated['breed_odds'] = tabulated.name_in_breed/tabulated.breed_not_name
tabulated['not_breed_odds'] = tabulated.name_not_breed/tabulated.not_name_not_breed
tabulated['odds_ratio'] = tabulated.breed_odds/tabulated.not_breed_odds

In [344]:
tabulated.sample(10).apply(
                lambda row: fisher_exact(
                                [[row.name_in_breed, row.name_not_breed],
                                [row.breed_not_name, row.not_name_not_breed]]),
                            axis=1
                            )

ValueError: Shape of passed values is (10, 2), indices imply (10, 10)

In [311]:
from scipy.stats import fisher_exact

breed_name_pairs = []

counter = 0
for breed in main_breeds.breed_no_mix.unique():
    for name in main_breeds.DogName.unique():
        name_breed = main_breeds[(main_breeds.DogName == name) & (main_breeds.breed_no_mix == breed)]['LicenseType'].count()
        name_not_breed = main_breeds[(main_breeds.DogName == name) & (main_breeds.breed_no_mix != breed)]['LicenseType'].count()
        not_name_breed = main_breeds[(main_breeds.DogName != name) & (main_breeds.breed_no_mix == breed)]['LicenseType'].count()
        not_name_not_breed = main_breeds[(main_breeds.DogName != name) & (main_breeds.breed_no_mix != breed)]['LicenseType'].count()

        table = [[name_breed, not_name_breed], [name_not_breed, not_name_not_breed]]

        ratio, p_value = fisher_exact(table, alternative='two-sided')
        breed_name_pairs.append({'breed': breed,
                                'name': name,
                                 'ratio': ratio,
                                 'p_value': p_value
                                })
        

KeyboardInterrupt: 

In [None]:
odds_ratios = pd.DataFrame(breed_name_pairs)

In [None]:
odds_ratios[(odds_ratios.p_value < 0.1) & (odds_ratios.breed == 'BEAGLE')].sort_values(by='ratio')