# Libraries

In [1]:
from numpy import mean, std
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler,PowerTransformer

## Load data and resample

In [16]:
# load the dataset
def load_dataset(file_path):
    # load the dataset as a dataframe
    data_df = pd.read_pickle(file_path)
    # correct data types
    fix_cols = data_df.columns[data_df.dtypes == 'object']
    data_df.loc[:, fix_cols] = data_df.loc[:, fix_cols].astype('bool')
    # remove redundant features
    data_df.drop(['total_review_count', 'halal_review_count', 'halal_negation_count'], inplace=True, axis=1)
    # split into features and target
    X, y = data_df.drop('halal', axis=1), data_df.halal
    return X, y

# define resampling method
def split_and_resample(X, y, test_size=0, resampling=None):
    if test_size > 0:
        # setting up testing and training sets
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    else:
        x_train, x_test, y_train, y_test = X, pd.DataFrame(columns=X.columns, dtype=X.dtypes.values),\
                                            y, pd.Series(name=y.name, dtype=y.dtypes)
    # concatenate our training data back together
    X = pd.concat([x_train, y_train], axis=1)
    # separate minority and majority classes
    min_class = X[X.halal==X.halal.value_counts().sort_values().index[0]]
    max_class = X[X.halal==X.halal.value_counts().sort_values().index[1]]
    if not resampling:
        return x_train, y_train, x_test, y_test
    # oversampling minority
    if resampling == 'oversample':
        new_min_class = resample(min_class, replace=True, n_samples=(len(max_class)))
        upsampled = pd.concat([max_class, new_min_class])
        return upsampled.drop('halal', axis=1), upsampled.halal, x_test, y_test
    elif resampling == 'undersample':
        new_max_class = resample(max_class, replace=True, n_samples=(len(min_class)))
        downsampled = pd.concat([new_max_class, min_class])
        return downsampled.drop('halal', axis=1), downsampled.halal, x_test, y_test
    elif resampling == 'SMOTE':
        x_train, y_train = SMOTE().fit_sample(x_train, y_train)
        return x_train, y_train, x_test, y_test

# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    # define scoring metrics
    metrics = {'roc_auc': 'roc_auc', 'acc': 'accuracy', 'rec': 'recall'}
    # evaluate model
    scores = cross_validate(model, X, y, scoring=metrics, cv=cv)
    return scores

# calculate precision-recall area under curve
def pr_auc(y_true, probas_pred):
   # calculate precision-recall curve
   p, r, _ = precision_recall_curve(y_true, probas_pred)
   # calculate area under curve
   return auc(r, p)

# evaluate a model
def pr_auc_score(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    # define the model evaluation the metric
    metric = make_scorer(pr_auc, needs_proba=True)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, error_score='raise')
    return scores

## Logistic Regression Classifier

In [4]:
def start_logistic_reg():
    # define the location of the dataset
    file_path = '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/restaurant_cat_and_num.pkl'

    # load the dataset
    X, y = load_dataset(file_path)

    # transform bool target to 1/0
    y = y*1

    # select numerical columns
    X_keep = X.columns[X.dtypes == 'float64']
    X_num = X.loc[:, X_keep]

    # class observations
    data = pd.concat([X_num, y], axis=1)
    print(data.groupby('halal').mean())

    # create dummy features from categorial features
    cat_cols = X.columns[X.dtypes == 'bool']
    dummy_X = pd.concat([pd.get_dummies(X[col], prefix=col) for col in cat_cols], axis=1)

    # drop categorial features and replace with dummy features
    X = pd.concat([X, dummy_X], axis=1)
    X.drop(cat_cols, axis=1, inplace=True)

- The average of percent of halal reviews, count of relevant halal reviews, and count of mentions of halal bacon is higher for halal resaurants than non-halal restaurants as expected.
- The average of non-halal relevant reviews and percent of reviews that include halal-negation is higher in non-halal restaurants as expected.
- The count of halal-truck mentions doesn't seem significantly different between the two groups. This feature randked 7th in RF so there could be information that's not reflected by the groups averages in there.

In [13]:
def start_logistic_reg():
    # define the location of the dataset
    file_path = '/Users/wesamazaizeh/Desktop/Projects/halal_o_meter/src/features/feature_engineering/restaurant_cat_and_num.pkl'
    # load the dataset
    X, y = load_dataset(file_path)
    # transform bool target to 1/0
    y = y*1
    # select numerical columns
    X_keep = X.columns[X.dtypes == 'float64']
    X_num = X.loc[:, X_keep]
    # class observations
    data = pd.concat([X_num, y], axis=1)
    # create dummy features from categorial features
    cat_cols = X.columns[X.dtypes == 'bool']
    dummy_X = pd.concat([pd.get_dummies(X[col], prefix=col) for col in cat_cols], axis=1)
    # drop categorial features and replace with dummy features
    X = pd.concat([X, dummy_X], axis=1)
    X.drop(cat_cols, axis=1, inplace=True)
    # split to train and test
    X_train, y_train, X_test, y_test = split_and_resample(X, y, test_size=0.2)
    
    return X_train, y_train, X_test, y_test

In [17]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define default logistic regressor
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# evaluate default logistic regressor
cv_scores = evaluate_model(X_train, y_train, logreg)
print('AUC-ROC: {:.3f} +/- {:.3f}'.format(cv_scores['test_roc_auc'].mean(), cv_scores['test_roc_auc'].std()))
print('Accuracy: {:.3f} +/- {:.3f}'.format(cv_scores['test_acc'].mean(), cv_scores['test_acc'].std()))
print('Recall: {:.3f} +/- {:.3f}'.format(cv_scores['test_rec'].mean(), cv_scores['test_rec'].std()))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


AUC-ROC: 0.813 +/- 0.059
Accuracy: 0.791 +/- 0.038
Recall: 0.928 +/- 0.037
Confusion matrix:
 [[ 15  36]
 [ 15 102]]


### Add class_weight to account for class imbalance

In [19]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define balanced logistic regressor
logreg_bal = LogisticRegression(class_weight='balanced')
logreg_bal.fit(X_train, y_train)
y_pred = logreg_bal.predict(X_test)

# evaluate balanced model
cv_scores = evaluate_model(X_train, y_train, logreg_bal)
print('AUC-ROC: {:.3f} +/- {:.3f}'.format(cv_scores['test_roc_auc'].mean(), cv_scores['test_roc_auc'].std()))
print('Accuracy: {:.3f} +/- {:.3f}'.format(cv_scores['test_acc'].mean(), cv_scores['test_acc'].std()))
print('Recall: {:.3f} +/- {:.3f}'.format(cv_scores['test_rec'].mean(), cv_scores['test_rec'].std()))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

AUC-ROC: 0.807 +/- 0.049
Accuracy: 0.688 +/- 0.045
Recall: 0.654 +/- 0.072
Confusion matrix:
 [[41  7]
 [43 77]]


In [23]:
# load data, convert to proper format and split to train and test
X_train, y_train, X_test, y_test = start_logistic_reg()

# define class weights
w = {0:26, 1:74}

# define custom logistic regression
logreg2 = LogisticRegression(class_weight=w, max_iter=1000)
logreg2.fit(X_train, y_train)


# predict and print performance summary
y_pred = logreg2.predict(X_test)
PR_auc = pr_auc_score(X_test, y_test, logreg2)
print('Train set PR-AUC : {:.3f} +/- {:.3f}'.format(PR_auc.mean(), PR_auc.std()))
print('Accuracy : {:.3f}'.format(accuracy_score(y_test, y_pred)))
print('Recall : {:.3f}'.format(recall_score(y_test, y_pred)))
print('f1 : {:.3f}'.format(f1_score(y_test, y_pred)))
print('Cofusion matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification report:\n', classification_report(y_test, y_pred))

Train set PR-AUC : 0.934 +/- 0.036
Accuracy : 0.750
Recall : 0.976
f1 : 0.852
Cofusion matrix:
 [[  5  39]
 [  3 121]]
Classification report:
               precision    recall  f1-score   support

           0       0.62      0.11      0.19        44
           1       0.76      0.98      0.85       124

    accuracy                           0.75       168
   macro avg       0.69      0.54      0.52       168
weighted avg       0.72      0.75      0.68       168



In [10]:
# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the training data and transform
X_train_std = sc.fit_transform(X_train)

# Apply the scaler to the test data
X_test_std = sc.transform(X_test)

# lasso logistic regression
C = [10, 1, .1, .001]

for c in C:
    model = LogisticRegression(penalty='l1', C=c, solver='liblinear')
    model.fit(X_train, y_train)
    print('C:', c)
#     print('Coefficient of each feature:', clf.coef_)
    print('Training accuracy:', model.score(X_train_std, y_train))
    print('Test accuracy:', model.score(X_test_std, y_test))
    print('')

C: 10
Training accuracy: 0.4962630792227205
Test accuracy: 0.47023809523809523

C: 1
Training accuracy: 0.5052316890881914
Test accuracy: 0.48214285714285715

C: 0.1
Training accuracy: 0.6098654708520179
Test accuracy: 0.6488095238095238

C: 0.001
Training accuracy: 0.2750373692077728
Test accuracy: 0.24404761904761904

