In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump,load
import reverse_geocoder as rg


In [2]:
# reading question with text file for wrangling
data=pd.read_csv("questions_with_text.csv")
data.drop(['city','ZIP','lat','Long'], axis=1, inplace=True)
target_label=data.state
features=data
features.drop(['state','user_id'], axis=1, inplace=True)
#features.head()


# Training:
   . encode labels and save encoder.
   . train a classifier to take text and return lat+Long
# At inference time:
   . take user input and return lat+Long    
   . convert lat and Long to closest city by reverse_geocoder[for python3]
      

In [3]:
# removing points outside US
#coordinate = (data['lat'][0], data['Long'][0])
#rg.search(coordinate)

In [4]:
"""def cc_from_coordinates(data,i):
    location_data=data.iloc[i,:]
    lat= location_data.lat
    lng= location_data.Long
    return (rg.search((lat,lng))[0].get('cc'))
"""    




"def cc_from_coordinates(data,i):\n    location_data=data.iloc[i,:]\n    lat= location_data.lat\n    lng= location_data.Long\n    return (rg.search((lat,lng))[0].get('cc'))\n"

In [5]:
#cc_from_coordinates(data,0)

In [6]:
# i tried filtering points based on whether or not 
# they're in US and find out search is a time botteneck
# so i am using all points now
"""country_info=[]
for i,row in data.iterrows():
    country_info.append(cc_from_coordinates(row))
"""    

'country_info=[]\nfor i,row in data.iterrows():\n    country_info.append(cc_from_coordinates(row))\n'

## one hot encoding

In [7]:
encoded_features=pd.get_dummies(features)
# in order to match with this our user responses 
# we will use align() function with a left join which requires dataset to
# join to ,so we create a dataset of single row and delete it after align+join
encoded_features.columns = encoded_features.columns.str.strip().str.lower().str.replace(' ', '_').str.replace(',', '')
data_structure=encoded_features.head(1).copy()
for col in data_structure.columns:
    data_structure[col].values[:]=0
data_structure.to_csv("empty_data_structure.csv" ,index=False)

In [8]:
# train test split
X=encoded_features
y=target_label
X_train, X_test, y_train_state, y_test_state = train_test_split(X, y, test_size=0.05, random_state=42 ,shuffle=True)


In [9]:
state_5nn = KNeighborsClassifier(n_neighbors=5)
state_5nn.fit(X_train, y_train_state)
print(state_5nn.predict(X_test))

['Vermont' 'Connecticut' 'Massachusetts' ... 'Massachusetts' 'Connecticut'
 'Connecticut']


In [10]:
dump(state_5nn ,"state_level_knn.joblib.dat")

['state_level_knn.joblib.dat']

In [11]:
def testmodel(model,x_test,y_test,n):
    eval=model.predict(x_test[:n])==y_test[:n]
    return (sum(eval)/n)

In [12]:
testmodel(state_5nn,X_test,y_test_state,400)

0.1625

In [13]:
pred_prob=state_5nn.predict_proba(X_test)
# indexes of three highest probabilty classes
top_3 = np.argsort(pred_prob, axis=1)[:,-3:]
# top 3 errors
y_test_copy=y_test_state.reset_index(drop=True)
error_counts=0
for index,value in y_test_copy.items():
    current_results=[state_5nn.classes_[i] for i in top_3[index]]
    error_counts+=y_test_state.iloc[index] in current_results
print("top 3 error" ,error_counts / y_test_state.shape[0])     


top 3 error 0.4409340659340659


In [14]:
# functions to get top3 answers from models
def get_top_3(model,data):
    pred_prob=state_5nn.predict_proba(data)
    # indexes of three highest probabilty classes
    top_3 = np.argsort(pred_prob, axis=1)[:,-3:]
    results=[model.classes_[i] for i in top_3]
    return results


In [15]:
get_top_3(model=state_5nn,data=X_test[:1])

[array(['New Hampshire', 'Puerto Rico', 'Vermont'], dtype=object)]

In [16]:
encoding_sample= pd.read_csv("empty_data_structure.csv")


In [17]:
sample_data=features.head(1).copy()
sample_data.to_csv("sample_case_data.csv" ,index=False)

In [18]:
input_data= pd.read_csv("sample_case_data.csv")


In [19]:
input_data.head()

Unnamed: 0,Q01,Q02,Q03,Q04,Q05,Q06,Q07,Q08,Q09,Q10,...,Q15,Q16,Q17,Q18,Q19,Q20,Q21,Q22,Q23,Q24
0,y’all,I have no idea what this creature is,"a freeway has limited access (no stop lights, ...",I have no word for this,painter,I have no general word for this,patio sale,"Mary and marry are pronounced the same, but me...",neither,"a freeway has limited access (no stop lights, ...",...,other,other,other,other,other,other,other,other,other,other


In [20]:
# encode it
encoded_data = encoding_sample.align(pd.get_dummies(input_data),join = "left", axis = 1)

In [21]:
encoded_data = encoded_data[1].fillna(0)
# convert na's to 0 (since we're one hot encoding)
# encoded_data = encoded_data[1].fillna(0)

In [22]:
encoded_data.describe()

Unnamed: 0,q01_other,q01_yinz,q01_you,q01_you_all,q01_you_guys,q01_you_lot,q01_you_’uns,q01_yous,q01_youse,q01_y’all,...,q23_drinking_fountain,q23_other,q23_water_bubbler,q23_water_fountain,q24_i_have_no_word_for_this,q24_i_use_lightning_bug_and_firefly_interchangeably,q24_firefly,q24_lightning_bug,q24_other,q24_peenie_wallie
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,,,,,,,,,,,...,,,,,,,,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def encode_answers(input_data):
    '''Reads in the sample encoded data w/ correct columns and 
    converts input data to the same format'''
    # read in empty dataframe with correct columns
    encoding_sample = pd.read_csv("empty_data_structure.csv")

    # encode it
    encoded_data = encoding_sample.align(pd.get_dummies(input_data),
    join = "left", axis = 1)

    # convert na's to 0 (since we're one hot encoding)
    encoded_data = encoded_data[1].fillna(0)
    
    return(encoded_data)

In [24]:
def get_top_3_knn(data):
    '''Read in the knn model and apply it to correctly formatted sample data'''
    # read in model
    state_knn = load("state_level_knn.joblib.dat")

    # encode input data
    encoded_data = encode_answers(data)

    pred = state_knn.predict_proba(encoded_data)
    top_3 = np.argsort(pred, axis=1)[ : ,-3 : ]
    results = [state_knn.classes_[i] for i in top_3]

    return(results[0].tolist())

In [35]:
from typing import Dict, Text, Any, List, Union, Optional

class ClassifierPipeline_knn():
    """Load in calssifier & encoders"""

    def name(self) -> Text:
        """Unique identifier of the classfier """

        return "5knn_state"

    def encode_answers(self, input_data):
        '''Reads in the sample encoded data w/ correct columns and 
        converts input data to the same format'''
        # read in empty dataframe with correct columns
        encoding_sample = pd.read_csv("empty_data_structure.csv")

        # encode it
        encoded_data = encoding_sample.align(pd.get_dummies(input_data),
        join = "left", axis = 1)

        # convert na's to 0 (since we're one hot encoding)
        encoded_data = encoded_data[1].fillna(0)
        
        return(encoded_data)

    def get_top_3_knn(self, data):
        '''Read in the knn model and apply it to correctly formatted sample data'''
        # read in model
        state_knn = load("state_level_knn.joblib.dat")

        # encode input data
        encoded_data = self.encode_answers(data)

        pred = state_knn.predict_proba(encoded_data)
        top_3 = np.argsort(pred, axis=1)[ : ,-3 : ]
        results = [state_knn.classes_[i] for i in top_3]

        return(results[0].tolist())

In [32]:
encode_answers(encoding_sample)

Unnamed: 0,q01_other,q01_yinz,q01_you,q01_you_all,q01_you_guys,q01_you_lot,q01_you_’uns,q01_yous,q01_youse,q01_y’all,...,q23_drinking_fountain,q23_other,q23_water_bubbler,q23_water_fountain,q24_i_have_no_word_for_this,q24_i_use_lightning_bug_and_firefly_interchangeably,q24_firefly,q24_lightning_bug,q24_other,q24_peenie_wallie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
encoding_sample = pd.read_csv("empty_data_structure.csv")

In [33]:
get_top_3_knn(encoding_sample)

['Maine', 'Massachusetts', 'Vermont']

In [38]:
ClassifierPipeline_knn().get_top_3_knn(input_data)

['Maine', 'Massachusetts', 'Vermont']

In [49]:
sample_data=features.head(2).copy()

In [50]:
sample_data=sample_data.iloc[:,2]

In [51]:
ClassifierPipeline_knn().get_top_3_knn(sample_data)

['Maine', 'Massachusetts', 'Vermont']