# Rodent Inspection

## Cargamos las librerías

In [1]:
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
import pickle
from sklearn.preprocessing import LabelEncoder 

## Cargamos los datos

In [2]:
#data = pd.read_csv("/src/utils/Rodent_Inspection.csv")
data = pd.read_csv("/content/drive/MyDrive/Rodent_inspection/data/Rodent_Inspection.csv")

In [3]:
data.head()

Unnamed: 0,job_ticket_or_work_order_id,job_id,inspection_type,boro_code,zip_code,latitude,longitude,inspection_date,result
0,12702949,PC6905724,Initial,2,10464,40.849413,-73.783634,12/26/2019 03:07:52 PM,Passed
1,12813800,PC6674598,Compliance,2,10458,40.867578,-73.893214,03/22/2012 11:56:22 AM,Passed
2,11477545,PC6511011,Initial,1,10026,40.804475,-73.957622,11/07/2019 10:52:01 AM,Rat Activity
3,11208055,PC7355300,Initial,4,11356,40.786391,-73.832821,11/03/2010 11:20:15 AM,Failed for Other R
4,13037125,PC7222942,Compliance,3,11207,40.656963,-73.893995,09/07/2018 09:45:09 AM,Passed


In [4]:
data.columns

Index(['job_ticket_or_work_order_id', 'job_id', 'inspection_type', 'boro_code',
       'zip_code', 'latitude', 'longitude', 'inspection_date', 'result'],
      dtype='object')

## Entrenamiento

Dividimos en train y test

In [5]:
  X = data.drop(['result'], axis=1)
  y = data['result']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=12345)

## Función para obtener los pkl de los mejores modelos

In [6]:
def complete_model(data):
  columns_to_drop = ["job_ticket_or_work_order_id","inspection_date"]
  data = data.drop(columns_to_drop,axis=1)
  data = data.drop(data[data.result == "Bait applied"].index)
  data.drop_duplicates()
  def conditions(s):
    if (s['result'] == "Passed") or (s['result'] == "Monitoring visit"):
      return 0
    else:
      return 1
  data['result'] = data.apply(conditions, axis=1)
  Insp = pd.get_dummies(data['inspection_type'])
  Insp=Insp.join(data.job_id)
  data = pd.merge(data.drop(['inspection_type'], axis = 1),Insp, on="job_id")
  le = LabelEncoder()
  data['boro_code'] = le.fit_transform(data['boro_code'])
  data['result'] = le.fit_transform(data['result'])
  data['job_id'] = le.fit_transform(data['job_id'].astype(str))

  X = data.drop(['result'], axis=1)
  y = data['result']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12345)
      
  # xgboost
  model_xgb = xgb.XGBClassifier(
      n_estimators=10,
      max_depth=7,
      learning_rate=0.4,
      colsample_bytree=0.6,
      missing=-999,
      random_state=66)
  model_xgb.fit(X_train, y_train)
  pickle.dump(model_xgb, open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_xgb.pkl', 'wb'))
 
  # logistic regression
  model_lr = LogisticRegression(
      penalty = 'l2',
      C= 1,
      solver='lbfgs')
  model_lr.fit(X_train, y_train)
  pickle.dump(model_lr, open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_lr.pkl', 'wb'))

  # KNN
  model_knn=KNeighborsClassifier(
      n_neighbors=10,
      weights='uniform',
      algorithm='auto')
  model_knn.fit(X_train, y_train)
  pickle.dump(model_knn, open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_knn.pkl', 'wb'))
        
  acc_xgb = accuracy_score(y_test, pickle.load(open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_xgb.pkl', 'rb')).predict(X_test))
  acc_lr = accuracy_score(y_test, pickle.load(open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_lr.pkl', 'rb')).predict(X_test))
  acc_knn = accuracy_score(y_test, pickle.load(open('/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_knn.pkl', 'rb')).predict(X_test))
        
        
  acc_diccionario = {"XGB": acc_xgb, "LR": acc_lr, "KNN": acc_knn}
  print("####### Las precisiones de los modelos son: ", acc_diccionario)
                
  mejor_modelo=max(acc_diccionario, key=acc_diccionario.get)
  acc_mejor_modelo=max(acc_diccionario.values())
                
  print("####### Mejor modelo: ", mejor_modelo)
  print("####### El accuracy del mejor modelo es: ", acc_mejor_modelo)

  

## Desempeño

In [7]:
complete_model(pd.concat([X_train, y_train], axis=1))

####### Las precisiones de los modelos son:  {'XGB': 0.740441027715962, 'LR': 0.7037983006271495, 'KNN': 0.7198816508193405}
####### Mejor modelo:  XGB
####### El accuracy del mejor modelo es:  0.740441027715962


## Re-entrenamiento

Con el 10% de la muestra restante

In [8]:
complete_model(pd.concat([X_test, y_test], axis=1))

####### Las precisiones de los modelos son:  {'XGB': 0.7315564017134698, 'LR': 0.710851975249881, 'KNN': 0.7072822465492623}
####### Mejor modelo:  XGB
####### El accuracy del mejor modelo es:  0.7315564017134698


Muy parecido

In [9]:
# Aquí se muestra el uso del pkl con la base completa
# Load from file
pkl_filename = "/content/drive/MyDrive/Rodent_inspection/data/entrenamiento_lr.pkl"
with open(pkl_filename, 'rb') as file:
  pickle_model = pickle.load(file)

#Limpiamos
columns_to_drop = ["job_ticket_or_work_order_id","inspection_date"]
data1 = data.drop(columns_to_drop,axis=1)
data1 = data1.drop(data1[data1.result == "Bait applied"].index)
data1.drop_duplicates()
def conditions(s):
  if (s['result'] == "Passed") or (s['result'] == "Monitoring visit"):
    return 0
  else:
    return 1
data1['result'] = data1.apply(conditions, axis=1)
Insp = pd.get_dummies(data1['inspection_type'])
Insp=Insp.join(data1.job_id)
data1 = pd.merge(data1.drop(['inspection_type'], axis = 1),Insp, on="job_id")
le = LabelEncoder()
data1['boro_code'] = le.fit_transform(data1['boro_code'])
data1['result'] = le.fit_transform(data1['result'])
data1['job_id'] = le.fit_transform(data1['job_id'].astype(str))

X = data1.drop(['result'], axis=1)
y = data1['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_test)
      

Test score: 42.78 %


In [10]:
Ypredict

array([1, 1, 1, ..., 1, 1, 1])

## Ejemplo con un dato nuevo

In [11]:
d = {'job_id': [55874], 'boro_code': [3], 'zip_code': [12345], 'latitude': [40.825241], 'longitude': [-73.988733], 'BAIT': [1], 'CLEAN_UPS': [0], 'Compliance': [0], 'Initial': [0], 'STOPPAGE': [0]}
df = pd.DataFrame(d)

In [12]:
df

Unnamed: 0,job_id,boro_code,zip_code,latitude,longitude,BAIT,CLEAN_UPS,Compliance,Initial,STOPPAGE
0,55874,3,12345,40.825241,-73.988733,1,0,0,0,0


In [13]:
pickle_model.predict(df)

array([1])