# Libraries

In [27]:
from numpy import mean, std
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler,PowerTransformer

## Load data and resample

In [38]:
# load the dataset
def load_dataset(file_path):
    # load the dataset as a dataframe
    data_df = pd.read_pickle(file_path)
    # correct data types
    fix_cols = data_df.columns[data_df.dtypes == 'object']
    data_df.loc[:, fix_cols] = data_df.loc[:, fix_cols].astype('bool')
    # remove redundant features
    data_df.drop(['total_review_count', 'halal_review_count', 'halal_negation_count'], inplace=True, axis=1)
    # split into features and target
    X, y = data_df.drop('halal', axis=1), data_df.halal
    return X, y

# define resampling method
def split_and_resample(X, y, test_size=0, resampling=None):
    if test_size > 0:
        # setting up testing and training sets
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    else:
        x_train, x_test, y_train, y_test = X, pd.DataFrame(columns=X.columns, dtype=X.dtypes.values),\
                                            y, pd.Series(name=y.name, dtype=y.dtypes)
    # concatenate our training data back together
    X = pd.concat([x_train, y_train], axis=1)
    # separate minority and majority classes
    min_class = X[X.halal==X.halal.value_counts().sort_values().index[0]]
    max_class = X[X.halal==X.halal.value_counts().sort_values().index[1]]
    if not resampling:
        return x_train, y_train, x_test, y_test
    # oversampling minority
    if resampling == 'oversample':
        new_min_class = resample(min_class, replace=True, n_samples=(len(max_class)))
        upsampled = pd.concat([max_class, new_min_class])
        return upsampled.drop('halal', axis=1), upsampled.halal, x_test, y_test
    elif resampling == 'undersample':
        new_max_class = resample(max_class, replace=True, n_samples=(len(min_class)))
        downsampled = pd.concat([new_max_class, min_class])
        return downsampled.drop('halal', axis=1), downsampled.halal, x_test, y_test
    elif resampling == 'SMOTE':
        x_train, y_train = SMOTE().fit_sample(x_train, y_train)
        return x_train, y_train, x_test, y_test

# evaluate a model
def evaluate_model(X, y, model, n_splits=10):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=3)
    # define scoring metrics
    metrics = {'acc': 'accuracy', 'pre': 'precision', 'rec': 'recall', 'f1': 'f1_macro'}
    # evaluate model
    scores = cross_validate(model, X, y, scoring=metrics, cv=cv)
    return scores

# calculate precision-recall area under curve
def pr_auc(y_true, probas_pred):
   # calculate precision-recall curve
   p, r, _ = precision_recall_curve(y_true, probas_pred)
   # calculate area under curve
   return auc(r, p)

# evaluate a model
def pr_auc_score(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    # define the model evaluation the metric
    metric = make_scorer(pr_auc, needs_proba=True)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, error_score='raise')
    return scores

# evaluate a model
def cross_validate_f1_macroe(X, y, model, n_splits=10):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=3)
    # evaluate model
    scores = cross_validate(model, X, y, scoring='f1_macro', cv=cv)
    return scores

## Logistic Regression Classifier

- The average of percent of halal reviews, count of relevant halal reviews, and count of mentions of halal bacon is higher for halal resaurants than non-halal restaurants as expected.
- The average of non-halal relevant reviews and percent of reviews that include halal-negation is higher in non-halal restaurants as expected.
- The count of halal-truck mentions doesn't seem significantly different between the two groups. This feature randked 7th in RF so there could be information that's not reflected by the groups averages in there.

In [3]:
def start_logistic_reg():
    # define the location of the dataset
    file_path = '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/restaurant_cat_and_num_v3.pkl'
    # load the dataset
    X, y = load_dataset(file_path)
    # transform bool target to 1/0
    y = y*1
    # select numerical columns
    X_keep = X.columns[X.dtypes == 'float64']
    X_num = X.loc[:, X_keep]
    # class observations
    data = pd.concat([X_num, y], axis=1)
    # create dummy features from categorial features
    cat_cols = X.columns[X.dtypes == 'bool']
    dummy_X = pd.concat([pd.get_dummies(X[col], prefix=col) for col in cat_cols], axis=1)
    # drop categorial features and replace with dummy features
    X = pd.concat([X, dummy_X], axis=1)
    X.drop(cat_cols, axis=1, inplace=True)
    # split to train and test
    X_train, y_train, X_test, y_test = split_and_resample(X, y, test_size=0.2)
    
    return X_train, y_train, X_test, y_test

In [18]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define default logistic regressor
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# evaluate default logistic regressor
cv_scores = evaluate_model(X_train, y_train, logreg)
cv_scores
print('Accuracy: {:.3f} +/- {:.3f}'.format(cv_scores['test_acc'].mean(), cv_scores['test_acc'].std()))
print('Precision: {:.3f} +/- {:.3f}'.format(cv_scores['test_pre'].mean(), cv_scores['test_pre'].std()))
print('Recall: {:.3f} +/- {:.3f}'.format(cv_scores['test_rec'].mean(), cv_scores['test_rec'].std()))
print('f1: {:.3f} +/- {:.3f}'.format(cv_scores['test_f1'].mean(), cv_scores['test_f1'].std()))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))

Accuracy: 0.763 +/- 0.039
Precision: 0.799 +/- 0.032
Recall: 0.904 +/- 0.042
f1: 0.652 +/- 0.068
Confusion matrix:
 [[ 20  24]
 [  7 117]]
Classification report:
               precision    recall  f1-score   support

           0       0.74      0.45      0.56        44
           1       0.83      0.94      0.88       124

    accuracy                           0.82       168
   macro avg       0.79      0.70      0.72       168
weighted avg       0.81      0.82      0.80       168



# Add imbalance term (class_weight)

In [20]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define class weights
w = {0:26, 1:74}

# define custom logistic regression
logreg2 = LogisticRegression(class_weight=w, max_iter=1000)
logreg2.fit(X_train, y_train)


# predict and print performance summary
y_pred = logreg2.predict(X_test)
PR_auc = pr_auc_score(X_test, y_test, logreg2)
print('Train set PR-AUC : {:.3f} +/- {:.3f}'.format(PR_auc.mean(), PR_auc.std()))
print('Accuracy : {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Precision : {:.3f}'.format(precision_score(y_test, y_pred)))
print('Recall : {:.3f}'.format(recall_score(y_test, y_pred)))
print('f1 : {:.3f}'.format(f1_score(y_test, y_pred, average='macro')))
print('Cofusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))

Train set PR-AUC : 0.873 +/- 0.084
Accuracy : 0.744
Precision : 0.737
Recall : 0.983
f1 : 0.580
Cofusion matrix:
 [[ 10  41]
 [  2 115]]
Classification report:
               precision    recall  f1-score   support

           0       0.83      0.20      0.32        51
           1       0.74      0.98      0.84       117

    accuracy                           0.74       168
   macro avg       0.79      0.59      0.58       168
weighted avg       0.77      0.74      0.68       168



# Test interaction terms

In [47]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define default logistic regressor
logreg = LogisticRegression(max_iter=1000)

# baseline f1_macro score for comparison
baseline_scores = cross_validate_f1_macroe(X_train, y_train, logreg, n_splits=3)['test_score']
baseline = np.mean(baseline_scores)

interactions = []
c=0
for feature_A in X_train.columns:
    for feature_B in X_train.columns:
        if feature_A > feature_B:
            X_train['interaction'] = X_train[feature_A] * X_train[feature_B]
            score = np.mean(cross_validate_f1_macroe(X_train, y_train, logreg, n_splits=3)['test_score'])
            if score > baseline:
                interactions.append((feature_A, feature_B, round(score,3)))
    print('[{0}/{1}]'.format(c, X_train.shape[1]), end='\r', flush=True)
    c+=1
print('Baseline f1: {:.3f} +/- ({:.3f})'.format(baseline, np.std(baseline_scores)))
print('Top 10 interactions: %s' % sorted(interactions, key=lambda x: x[2], reverse=True)[:10])

Baseline f1: 0.651 +/- (0.036)
Top 10 interactions: [('halal_negation_percent', 'halal_burger_False', 0.672), ('halal_relevant_count', 'halal_lamb_True', 0.671), ('is_halal_count', 'halal_relevant_True', 0.67), ('halal_lamb_count', 'halal_in_name_False', 0.669), ('non_halal_relevant_False', 'halal_beef_False', 0.668), ('halal_lamb_False', 'halal_chicken_True', 0.668), ('halal_lamb_True', 'halal_burger_True', 0.668), ('is_halal_False', 'halal_meat_True', 0.668), ('is_halal_True', 'halal_goat_True', 0.668), ('partial_halal_False', 'is_halal_False', 0.668)]


### There doesn't seem to be a significant improvement from adding interaction terms