# Logistic Regression

The following code trains a logistic regression model.

In [None]:
# Config
from pi_pact_sort import categorize, bin_categorize

# Model inputs: any combination of the strings 'RSSI', 'HUMIDITY', and/or 'PRESSURE'.
FEATURES: list = ['RSSI', 'HUMIDITY', 'PRESSURE']
# Distance-binning function: one of the two imported from pi_pact_sort.py.
CATEGORIZE_FUNC = bin_categorize

# Derive the feature tag from the configuration above:
#   0-1 features -> '2var', exactly 2 -> '3varH' / '3varP', otherwise -> '4var'.
if len(FEATURES) <= 1:
    feature_str = '2var'
elif len(FEATURES) != 2:
    feature_str = '4var'
elif 'HUMIDITY' in FEATURES:
    feature_str = '3varH'
else:
    feature_str = '3varP'

# Derive the label tag from the chosen binning function.
label_str = '3b' if CATEGORIZE_FUNC == categorize else 'binary'

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# Columns removed from every CSV (when present) before training.
DROP_COLUMNS = ['ADDRESS', 'TIMESTAMP', 'UUID', 'MAJOR', 'MINOR', 'TX POWER', 'TEMPERATURE',
                'PITCH', 'ROLL', 'YAW', 'SCAN']
# Also drop whichever optional features were not selected in FEATURES.
DROP_COLUMNS += [feature for feature in ['RSSI', 'HUMIDITY', 'PRESSURE']
                 if feature not in FEATURES]
SAMPLE_SIZE = 30000  # Number of rows drawn from each distance category
np.random.seed(0)


# Trains a Logistic Regression classifier to predict a distance range given
# RSSI values and other variables.

# Load every matching CSV, stripping the unwanted columns.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and calling it in a loop copied the accumulated
# data on every iteration.
base_columns = ['DISTANCE', ] + FEATURES
frames = []
csv_file: Path
for csv_file in Path('.').glob('indoor-noObstruct-SenseHat*/*.csv'):
    datapart: pd.DataFrame = pd.read_csv(csv_file)
    # errors='ignore' skips columns that this particular file does not have.
    frames.append(datapart.drop(columns=DROP_COLUMNS, errors='ignore'))

data_copy: pd.DataFrame
if frames:
    data_copy = pd.concat(frames)
    # Keep DISTANCE and FEATURES first, in that order, followed by any other
    # surviving column, so the column order of X does not depend on the CSV
    # layout. reindex also creates (as NaN) any base column absent from all files.
    extra_columns = [col for col in data_copy.columns if col not in base_columns]
    data_copy = data_copy.reindex(columns=base_columns + extra_columns)
else:
    # No CSV matched the glob: fall back to an empty frame with the expected columns.
    data_copy = pd.DataFrame(columns=base_columns)

# Categorize distance
data_copy['DISTANCE'] = data_copy['DISTANCE'].map(CATEGORIZE_FUNC)

# Sample data from each distance category.
# sample() draws without replacement, so it raises ValueError if a category
# holds fewer than SAMPLE_SIZE rows.
samples = [
    data_copy[data_copy.DISTANCE == value].sample(SAMPLE_SIZE, random_state=1)
    for value in data_copy['DISTANCE'].unique()
]
data: pd.DataFrame = pd.concat(samples) if samples else data_copy.iloc[0:0]

# Assign features and labels
X: np.ndarray = data.drop(columns=['DISTANCE']).to_numpy(dtype='float32')
y: np.ndarray = data['DISTANCE'].to_numpy(dtype=int)

# Preprocessing: scale to [0, 1], add pairwise interaction terms, and (only
# when more than one feature is used) reduce dimensionality with PCA.
steps = [('min_max', MinMaxScaler()),
         ('interactions', PolynomialFeatures(interaction_only=True,
                                             include_bias=False))]
if len(FEATURES) > 1:
    steps.append(('dim_reduce', PCA()))

In [None]:
# Tune dimensionality reduction if necessary
# Candidate C values handed to LogisticRegressionCV (0.0001 ... 0.0099).
C_list = [i/10000. for i in range(1, 100)]
if len(FEATURES) > 1:
    best_clf = None
    best_score = 0
    for i in range(2, 2 ** len(FEATURES) - 1):
        # Build a fresh step list for every candidate. Pipeline keeps a
        # reference to the list it is given, so overwriting steps[2] in place
        # would also replace the PCA inside every previously built pipeline,
        # leaving best_clf with a PCA that no longer matches its classifier.
        # The scaler and interaction steps are shared objects, but they are
        # refit on the same X each time, so their fitted state is identical.
        candidate_steps = steps[:2] + [('dim_reduce', PCA(n_components=i))]
        preprocessing = Pipeline(steps=candidate_steps)
        clf = make_pipeline(preprocessing, LogisticRegressionCV(Cs=C_list,
                           multi_class='multinomial',
                           n_jobs=2))
        clf.fit(X, y)
        # Score once per candidate. NOTE: this is accuracy on the training
        # data itself, not a held-out estimate.
        score = clf.score(X, y)
        # 'best_clf is None' guarantees a model is selected even if every
        # candidate scores 0.
        if best_clf is None or score > best_score:
            best_score = score
            best_clf = clf
else:
    # Single feature: the preprocessing has no PCA step, so nothing to tune.
    preprocessing = Pipeline(steps=steps)
    best_clf = make_pipeline(preprocessing, LogisticRegressionCV(Cs=C_list,
                           multi_class='multinomial',
                           n_jobs=2))
    best_clf.fit(X, y)


In [None]:
# Report the PCA dimensionality of the selected model.
# The preprocessing only contains a 'dim_reduce' step when several features are used.
if len(FEATURES) > 1:
    selected_pca = best_clf.named_steps['pipeline'].named_steps['dim_reduce']
    print('n_components=', selected_pca.n_components)

In [None]:
# Report the C value chosen by LogisticRegressionCV and the accuracy on (X, y)
final_estimator = best_clf.named_steps['logisticregressioncv']
print('C=', final_estimator.C_)
print('accuracy =', best_clf.score(X, y))

In [None]:
# Pickle model
import pickle

# NOTE: only the fitted LogisticRegressionCV step is saved; the preprocessing
# steps of best_clf are not part of the pickle.
model_path = Path(f"linear-models/{feature_str}-{label_str}-polyFeatures-dimReduce-linear-model.pickle")
# Create the output directory on first use so that open() does not fail with
# FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(best_clf.named_steps['logisticregressioncv'], f)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Draw and save the ROC curve; only done for the binary binning function.
if CATEGORIZE_FUNC == bin_categorize:
    roc_path = Path(f'linear-models/{feature_str}-{label_str}-polyFeatures-dimReduce-linear-model-roc-curve.png')

    # decision_function returns the scores that roc_curve thresholds.
    scores = best_clf.decision_function(X)
    fpr, tpr, _ = roc_curve(y, scores)

    plt.plot(fpr, tpr)
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.title('ROC curve')
    # Pin both axes to start at 0; the upper limits stay automatic.
    plt.xlim(left=0)
    plt.ylim(bottom=0)
    plt.savefig(str(roc_path))