In [137]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Load the raw inspection records (path is relative to the notebook's directory).
nyc_df = pd.read_csv("../data/raw/nyc_restaurants.csv")

# Work only with rows that have no missing values in any column.
nyc_drop_na_df = nyc_df.dropna()

# Keep inspections graded A, B or C. The explicit .copy() makes this an
# independent frame, so the label rewrite below is guaranteed to land on it
# and does not raise pandas' SettingWithCopyWarning.
nyc_mod_target_df = nyc_drop_na_df.query("grade == ['A', 'B', 'C']").copy()

# Binary target: 'A' stays as is, every other remaining grade (B, C) becomes 'F'.
nyc_mod_target_df.loc[nyc_mod_target_df['grade'] != 'A', 'grade'] = 'F'

# Invariant relied on by the later 0/1 encoding of the target.
assert set(nyc_mod_target_df['grade'].unique()) <= {'A', 'F'}

# Zip codes that keep their own category; every other code is pooled below.
top_20_zipcode = [
    '10019', '10003', '10036', '10013', '10001',
    '10002', '10016', '10022', '10011', '11201',
    '11354', '10012', '11220', '10014', '11372',
    '10017', '10018', '11215', '11211', '10009',
]

nyc_mod_zipcode_df = nyc_mod_target_df.copy()
# Render each zip code as a string of its integer value (e.g. 11220.0 -> '11220').
nyc_mod_zipcode_df['zipcode'] = nyc_mod_target_df['zipcode'].map(lambda code: str(int(code)))

# Collapse every zip code that is not in the list into one 'other_zipcode' bucket.
is_listed_zipcode = nyc_mod_zipcode_df['zipcode'].isin(top_20_zipcode)
nyc_mod_zipcode_df.loc[~is_listed_zipcode, 'zipcode'] = 'other_zipcode'


def lump_rare_categories(df, column, min_count, other_label):
    """Return a copy of ``df`` with infrequent values of ``column`` pooled together.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the categorical column to process.
    min_count : int
        Values occurring in fewer than ``min_count`` rows of ``df`` are replaced.
    other_label : str
        Replacement written in place of every infrequent value.

    Returns
    -------
    pandas.DataFrame
        A new frame identical to ``df`` except for the replaced values.
    """
    lumped = df.copy()
    # Per-row frequency of that row's own category value.
    occurrences = lumped[column].map(lumped[column].value_counts())
    lumped.loc[occurrences < min_count, column] = other_label
    return lumped


# Cuisines seen in fewer than 600 rows are pooled into 'Other_cuisine'.
nyc_mod_cuisine_df = lump_rare_categories(
    nyc_mod_zipcode_df, 'cuisine_description', 600, 'Other_cuisine'
)

# Violation descriptions seen in fewer than 500 rows are pooled into 'Other_violation_des'.
nyc_mod_violation_des_df = lump_rare_categories(
    nyc_mod_cuisine_df, 'violation_description', 500, 'Other_violation_des'
)

# nyc_final_df is another name for the same frame (no copy is made).
nyc_final_df = nyc_mod_violation_des_df
nyc_final_df.head()



Unnamed: 0,camis,dba,boro,zipcode,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,inspection_type
0,50039667,WIDDI HALL,BROOKLYN,11220,Middle Eastern,06/02/2016,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,9.0,A,Pre-permit (Operational) / Re-inspection
3,50032889,SHUN YUEN CHINESE RESTAURANT,BRONX,other_zipcode,Chinese,03/29/2018,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or...,Critical,12.0,A,Cycle Inspection / Initial Inspection
6,50058588,SWEETGREEN 55TH AND PARK,MANHATTAN,10022,Other_cuisine,10/30/2018,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,12.0,A,Cycle Inspection / Initial Inspection
9,41591198,PEARL'S SOCIAL & BILLY CLUB,BROOKLYN,other_zipcode,American,03/06/2018,Violations were cited in the following area(s).,10B,Plumbing not properly installed or maintained;...,Not Critical,6.0,A,Cycle Inspection / Initial Inspection
10,50007219,GREENPOINT BEER & ALE,BROOKLYN,other_zipcode,American,10/04/2016,Violations were cited in the following area(s).,09C,Food contact surface not properly maintained.,Not Critical,7.0,A,Cycle Inspection / Initial Inspection


In [138]:
# Relative frequency of each target value in the test split.
# NOTE(review): test_df is only created by the train/test split cell further
# down in this notebook, so on a fresh kernel (Restart & Run All) this cell
# raises NameError. It should be moved below that cell.
test_df["grade"].value_counts(normalize=True)

0    0.82808
1    0.17192
Name: grade, dtype: float64

In [139]:
# Sanity check: print every violation description whose type is not exactly str.
# No printed output means every entry in the column is a plain string.
non_str_descriptions = [
    description
    for description in nyc_final_df["violation_description"]
    if type(description) is not str
]
for description in non_str_descriptions:
    print(description)

In [140]:
# Train & test split (25% held out, fixed seed for reproducibility)
train_df, test_df = train_test_split(nyc_final_df, test_size=0.25, random_state=123)

# Encode the target as integers: 0 = grade 'A', 1 = grade 'F'.
# Building new frames with .assign avoids writing into the objects returned by
# train_test_split (which can raise SettingWithCopyWarning) and gives an int64
# column instead of an object column mixing strings and ints.
# .astype("int64") fails loudly if a grade outside the mapping slipped through.
grade_to_label = {"A": 0, "F": 1}
train_df = train_df.assign(grade=train_df["grade"].map(grade_to_label).astype("int64"))
test_df = test_df.assign(grade=test_df["grade"].map(grade_to_label).astype("int64"))

# Separate features from the target for both splits.
X_train = train_df.drop(columns=["grade"])
y_train = train_df["grade"]
X_test = test_df.drop(columns=["grade"])
y_test = test_df["grade"]

In [142]:
# Sanity check: print every restaurant name ('dba') in the training features
# whose type is not exactly str. No printed output means all are plain strings.
non_str_names = [name for name in X_train["dba"] if type(name) is not str]
for name in non_str_names:
    print(name)

In [143]:
# Make sure both target vectors carry a numeric dtype before modelling.
y_train, y_test = (pd.to_numeric(labels) for labels in (y_train, y_test))

In [144]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, cross_validate

# Collects one summary table (mean/std of each cross-validation metric) per model.
cross_val_results = {}

# Baseline: a DummyClassifier with default settings, cross-validated on the
# training split and scored with F1.
dc = DummyClassifier()
dummy_cv_scores = cross_validate(
    dc,
    X_train,
    y_train,
    return_train_score=True,
    scoring="f1",
)
dummy_cv_summary = pd.DataFrame(dummy_cv_scores).agg(['mean', 'std']).round(3).T
cross_val_results['dummy'] = dummy_cv_summary
cross_val_results['dummy']

Unnamed: 0,mean,std
fit_time,0.012,0.004
score_time,0.005,0.003
test_score,0.0,0.0
train_score,0.0,0.0


In [146]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# features
# `target` is not referenced in this cell; kept in case other cells use it.
target = ['grade']
categorical_features = ['boro', 'zipcode', 'cuisine_description', 'action', 'violation_code', 'critical_flag', 'inspection_type']
# NOTE(review): 'score' is passed through unchanged; if the grade is derived
# from the inspection score this leaks the target and would explain the very
# high F1 reported for this model. Confirm before trusting the result.
passthrough_features = ['score']
drop_features = ['camis', 'inspection_date']
# Text columns are given as plain strings (not lists) so that each
# CountVectorizer receives a 1-D column of documents.
text_features = 'dba'
text_features_2 = 'violation_description'

# column transformer
preprocessor = make_column_transformer(
    ("passthrough", passthrough_features),
    # `sparse=False` was dropped: that keyword was renamed to `sparse_output`
    # and removed in scikit-learn 1.4, so passing it raises TypeError there.
    # The encoder's default sparse output is stacked with the CountVectorizer
    # matrices by the ColumnTransformer, giving the same feature values.
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    (CountVectorizer(max_features=2000, stop_words="english"), text_features),
    (CountVectorizer(max_features=2000, stop_words="english"), text_features_2),
    ("drop", drop_features),
)

# Preprocessing + logistic regression, cross-validated on the training split with F1.
pipe_lr = make_pipeline(preprocessor, LogisticRegression(random_state=123, max_iter=1000))
logreg_cv_scores = cross_validate(
    pipe_lr,
    X_train,
    y_train,
    return_train_score=True,
    scoring="f1",
)
cross_val_results['logreg'] = pd.DataFrame(logreg_cv_scores).agg(['mean', 'std']).round(3).T
cross_val_results


{'dummy':               mean    std
 fit_time     0.012  0.004
 score_time   0.005  0.003
 test_score   0.000  0.000
 train_score  0.000  0.000,
 'logreg':               mean    std
 fit_time     7.118  1.493
 score_time   0.463  0.269
 test_score   0.989  0.002
 train_score  0.993  0.001}

In [None]:
# Display the cross-validation summary tables collected so far, one per model.
cross_val_results

{'dummy':               mean    std
 fit_time     0.046  0.003
 score_time   0.023  0.002
 test_score   0.825  0.000
 train_score  0.825  0.000,
 'logreg':               mean    std
 fit_time     8.687  1.068
 score_time   0.485  0.028
 test_score   0.996  0.001
 train_score  0.998  0.000}

In [None]:
# Fit the whole pipeline (preprocessing + logistic regression) on the full training split.
pipe_lr.fit(X_train, y_train)

In [None]:
# make_column_transformer de-duplicates transformer names by suffixing them, so
# the two CountVectorizers are registered as "countvectorizer-1" (for 'dba')
# and "countvectorizer-2" (for 'violation_description'). The bare
# "countvectorizer" key does not exist and raises KeyError.
# NOTE(review): assuming the 'dba' vocabulary is the one wanted here — confirm.
dba_vectorizer = pipe_lr.named_steps["columntransformer"].named_transformers_["countvectorizer-1"]
len_vocab = len(dba_vectorizer.get_feature_names_out())