# Baseline Tests

Here we test XGBoost with TF-IDF features.

In [2]:
import os

import imageio
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, cohen_kappa_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

## With Kaggle submission

In [151]:
# Directory holding train.csv / test.csv, relative to this notebook.
data_dir = "../data"

# Columns of the raw CSVs that are fed into the feature pipeline.
selected_columns = [
    "Type", "Age", "Breed1", "Breed2", "Gender",
    "Color1", "Color2", "Color3",
    "MaturitySize", "FurLength",
    "Vaccinated", "Dewormed", "Sterilized", "Health",
    "Quantity", "Fee", "State",
    "VideoAmt", "PhotoAmt",
    "Description",
]

# Name of the target column.
label_column = "AdoptionSpeed"

In [152]:
# Load the full training CSV from data_dir and keep its PetID column aside.
train = pd.read_csv(os.path.join(data_dir, "train.csv"), sep=',')
pet_ids = train["PetID"]

In [211]:

# Load the test CSV from data_dir; its PetID column is needed later for the submission file.
test = pd.read_csv(os.path.join(data_dir, "test.csv"), sep=',')
test_pet_ids = test["PetID"]

In [212]:
# Sanity check: number of rows read from test.csv.
len(test_pet_ids)

3972

In [213]:
# Restrict both frames to the same modelling columns, in the same order.
X = train.loc[:, selected_columns]
X_test = test.loc[:, selected_columns]

In [189]:
# Sanity check: number of training rows.
len(X)

14993

In [214]:
def normalize(df_train, df_test, columns):
    """Min-max scale ``columns`` using statistics taken from the training frame only.

    Each listed column is transformed as ``(value - train_min) / (train_max - train_min)``
    in both frames, so test values outside the training range fall outside [0, 1].

    Args:
        df_train: DataFrame the min/max are computed from.
        df_test: DataFrame scaled with the training min/max.
        columns: Iterable of column names to scale; other columns are left as they are.

    Returns:
        Tuple ``(scaled_train, scaled_test)`` of new DataFrames; the inputs are not modified.
    """
    result = df_train.copy()
    result_2 = df_test.copy()
    for feature_name in columns:
        min_value = df_train[feature_name].min()
        value_range = df_train[feature_name].max() - min_value
        # A constant training column has a zero range; dividing by it would fill the
        # column with NaN/inf. Fall back to a range of 1 so such a column maps to 0 in
        # train and to (value - min) in test.
        if value_range == 0:
            value_range = 1
        result[feature_name] = (df_train[feature_name] - min_value) / value_range
        result_2[feature_name] = (df_test[feature_name] - min_value) / value_range
    return result, result_2

In [215]:
# Scale only Age and Fee; Quantity, VideoAmt and PhotoAmt were commented out of this
# list and are therefore passed through unscaled.
X_train_norm, X_test_norm = normalize(X, X_test, ["Age", "Fee"])


In [217]:
# Stack train on top of test so the one-hot encoding below is computed over both at once;
# len_train records where the training rows end so the frames can be split apart again.
concated_df = pd.concat([X_train_norm, X_test_norm])
len_train = X_train_norm.shape[0]

In [218]:
# One-hot encode the categorical columns; all remaining columns are passed through unchanged.
X_dummies = pd.get_dummies(
    concated_df,
    columns=[
        "Type", "Breed1", "Breed2", "Gender",
        "Color1", "Color2", "Color3",
        "MaturitySize", "FurLength",
        "Vaccinated", "Dewormed", "Sterilized", "Health",
        "State",
    ],
)

In [219]:
# Undo the earlier stacking: the first len_train rows are train, the rest are test.
X_train_dummies = X_dummies.iloc[:len_train]
X_test_dummies = X_dummies.iloc[len_train:]

In [138]:
from xgboost import XGBClassifier


In [221]:
import numbers
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer


max_features = 500

# Vectorizer from https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments?fbclid=IwAR1dRU97Bn8Ldj6KMwMNn6Rm6HQJqMV9aTbQ0jtoqgywZWd_W7fIioDrpVc

def _clean_descriptions(descriptions):
    """Turn raw description values into normalised lowercase text.

    Values that are real numbers (presumably the NaN pandas stores for a missing
    description — TODO confirm) become the empty string. For everything else the text is
    lowercased, ASCII punctuation and digits are removed, and runs of whitespace are
    collapsed to single spaces.

    Args:
        descriptions: Iterable of raw description values.

    Returns:
        List of cleaned strings, one per input value, in the same order.
    """
    # A single translation table deletes punctuation and digits in one pass.
    drop_chars = str.maketrans("", "", string.punctuation + "0123456789")
    cleaned = []
    for raw in descriptions:
        text = "" if isinstance(raw, numbers.Real) else raw.lower()
        cleaned.append(" ".join(text.translate(drop_chars).split()))
    return cleaned


# Train and test go through the same helper so their preprocessing cannot drift apart.
texts_train = _clean_descriptions(X_train_dummies.Description)
test_texts = _clean_descriptions(X_test_dummies.Description)



def tokenizer(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    return nltk.word_tokenize(text)

# Fit the TF-IDF vocabulary on the training descriptions only, then reuse the fitted
# vectorizer to transform the test descriptions.
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_features=max_features, ngram_range=(1,3),
                       max_df=0.95, min_df=0.01)
sparse_tfidf_texts = tfidf.fit_transform(texts_train)
test_sparse_tfidf_texts = tfidf.transform(test_texts)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use the new method when it exists and fall back to the old one on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf.get_feature_names_out())
else:
    tfidf_feature_names = list(tfidf.get_feature_names())

# Dense TF-IDF frames that share the row index of the tabular features, so the
# column-wise concat below aligns row by row.
train_df = pd.DataFrame(sparse_tfidf_texts.toarray(), columns=tfidf_feature_names, index=X_train_dummies.index)
test_df = pd.DataFrame(test_sparse_tfidf_texts.toarray(), columns=tfidf_feature_names, index=X_test_dummies.index)

# Append the TF-IDF columns to the tabular features and drop the raw text column.
X_train_new = pd.concat([X_train_dummies, train_df], axis=1).drop("Description", axis=1)
X_test_new = pd.concat([X_test_dummies, test_df], axis=1).drop("Description", axis=1)

## Code for train-test split

```
best_test_score = -1
best_n = None

model = XGBClassifier(verbosity=2, max_depth=6)
print("fitting")
model.fit(X_train_new, y_train)

print("predicting")
#rf = RandomForestClassifier(n_estimators=n).fit(X_train_new, y_train)
pred_test = model.predict(X_test_new)
pred_train = model.predict(X_train_new)
cohen_kappa = cohen_kappa_score(y_test, pred_test, weights="quadratic")
train_score = accuracy_score(y_train, pred_train)
test_score = accuracy_score(y_test, pred_test)
#print(n)
print("cohen kappa =", cohen_kappa)
print("train score =", train_score)
print("test score =", test_score)


fitting
predicting
cohen kappa = 0.3554781435697453
train score = 0.691090995712244
test score = 0.41640729212983546
```


In [203]:
# Inspect the dimensions (rows, feature columns) of the final training matrix.
X_train_new.shape

(14993, 900)

In [223]:
# NOTE(review): best_test_score / best_n are not read anywhere in the visible notebook;
# they look like leftovers from an earlier hyperparameter search.
best_test_score = -1
best_n = None

# Target labels come from the training CSV.
y_train = train["AdoptionSpeed"]


# max_depth was chosen as the value giving the best Kappa without much overfitting.
model = XGBClassifier(verbosity=2, max_depth=6)
print("fitting")
model.fit(X_train_new, y_train)

print("predicting")
# Predict on the test features (used for the submission) and on the training features.
pred_test = model.predict(X_test_new)
pred_train = model.predict(X_train_new)
# Only training accuracy is reported here: the quadratic-weighted Cohen's kappa and test
# accuracy used with the train/test split need a y_test, which is not defined in this run.
train_score = accuracy_score(y_train, pred_train)
print("train score =", train_score)


fitting
predicting
train score = 0.6407656906556393


In [224]:
# Sanity check: number of test PetIDs that will go into the submission.
len(test["PetID"])

3972

In [225]:
# Build the submission table: one row per test pet with its predicted AdoptionSpeed.
submission_df = pd.DataFrame({"PetID" : test["PetID"], "AdoptionSpeed": pred_test})
# index=False keeps the pandas row index out of the file, so the CSV contains exactly the
# two columns PetID and AdoptionSpeed instead of an extra unnamed leading column.
submission_df.to_csv("baseline.csv", index=False)