# Analysis: Logistic Regression

Workflow: 8 

Goal: Load features and target, and run classification.

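Result: The best mean 5-fold CV accuracy was 63.8%, reached by liblinear-L1, saga-L1 and saga-L2 at the largest lambda values; liblinear-L2 peaked at 55.3%.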

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP
import sklearn.decomposition as DC
import imblearn.pipeline as PL

import sklearn.linear_model as LM
import sklearn.model_selection as MS

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# echo the value of every bare expression in a cell, not only the final one
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Bootstrap Django so the project's settings and ORM models can be imported below.
# PWD is presumably the backend project root the notebook server was started from
# (TODO confirm); fall back to the kernel's current directory when it is not set,
# because os.chdir(None) raises TypeError.
project_root = os.getenv('PWD') or os.getcwd()
os.chdir(project_root)
sys.path.insert(0, project_root)
# DJANGO_SETTINGS_MODULE must be a dotted module path, not a file name; the project's
# settings module is app_proj.settings. setdefault leaves an already-exported value alone.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app_proj.settings')
import django
django.setup()

'app_proj.settings'

In [4]:
# sanity check: print the working directory the kernel ended up in
! pwd

/Users/Phil/Documents/Websites/Movies_Proj/Filmophile/backend


In [5]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [6]:
# Build the feature CSV path from the Django settings' BASE_DIR and the FEATURE_PATH
# declared in the analysis module, load it, and summarise the resulting frame.
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_full_df = PD.read_csv(FEATURE_FILE)
feature_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 625 entries, Movie_ID to Western
dtypes: float64(5), int64(619), object(1)
memory usage: 84.2+ MB


In [7]:
# Read the UserVotes table through the Django ORM and turn it into a DataFrame,
# dropping the database's 'id' column.
# NOTE(review): no order_by() is applied here, so the row order is whatever the
# database returns (unless the model defines a default ordering) — confirm.
target_ls = MD.UserVotes.objects.values()
target_full_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 3 columns):
Movie_ID    600 non-null int64
User        600 non-null object
Vote        600 non-null int64
dtypes: int64(2), object(1)
memory usage: 14.2+ KB


In [8]:
# Attach each vote's movie features. A left join keeps one output row per vote, in the
# votes' own order; votes whose movie has no feature row get NaN features.
# validate='many_to_one' makes pandas raise if Movie_ID is duplicated in the feature table.
# Duplicates would silently add rows and break the positional alignment with the target
# vector, which is taken from target_full_df rather than from this merged frame.
full_df = PD.merge(target_full_df, feature_full_df, how='left', on='Movie_ID',
                   validate='many_to_one')
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Columns: 627 entries, Movie_ID to Western
dtypes: float64(5), int64(620), object(2)
memory usage: 2.9+ MB


In [9]:
# strip the identifier, title and target columns so only model inputs remain
# (the merge above already produced one row per recorded vote)

NON_FEATURE_COLS = ['Movie_ID', 'Title', 'User', 'Vote']
feature_df = full_df.drop(columns=NON_FEATURE_COLS)
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Columns: 623 entries, Year to Western
dtypes: float64(5), int64(618)
memory usage: 2.9 MB


In [10]:
# The label frame: the raw votes table with its Movie_ID and User columns removed.
# NOTE(review): this comes from target_full_df, not the merged full_df, so it lines up with
# feature_df only while the merge keeps one row per vote in the original order — confirm.
target_df = target_full_df.drop(columns=['Movie_ID', 'User'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 1 columns):
Vote    600 non-null int64
dtypes: int64(1)
memory usage: 4.8 KB


## Run Logistic Regression

In [11]:
# Solver iteration cap shared by every LogisticRegression fitted below.
MAX_ITER = 2000
# Regularisation strengths (lambda) to search: one per decade from 1e-3 to 1e4.
ALPHA = NP.array([0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0])
# Collects one {'algorithm', 'test_scores'} record per solver/penalty combination.
algorithms = list()

In [12]:
# Convert to plain numpy arrays for scikit-learn: the feature matrix and the
# vote labels as a 1-D vector, then echo both shapes as a sanity check.
X_np, y_np = NP.array(feature_df), NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(600, 623)

(600,)

In [13]:
# from here on, echo only the last statement of each cell
InteractiveShell.ast_node_interactivity = 'last'

In [14]:
# setup the grid search: liblinear with L1
#
# Pipeline: mean-impute missing feature values -> standardise -> logistic regression.
# Keeping the preprocessing inside the pipeline means it is re-fit on each CV training fold.

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
# random_state pins the solver's internal data shuffling so re-runs reproduce the same scores
logreg = LM.LogisticRegression(solver='liblinear', penalty='l1', multi_class='auto',
                               max_iter=MAX_ITER, random_state=0)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', logreg)])

# scikit-learn's C is the inverse of the regularisation strength, so search over 1 / ALPHA
param_dx = {
    'estimator__C': 1 / ALPHA,
    }

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)

In [15]:
# run the grid search and report how long the fit took (wall-clock minutes)

t0 = time.time()
searcher.fit(X_np, y_np);
t1 = time.time()

elapsed_mins = (t1 - t0) / 60
print('time: {:.2f} mins'.format(elapsed_mins))

time: 0.55 mins


In [16]:
# report the winning hyper-parameters and their mean cross-validated accuracy

best_parameters = searcher.best_params_
best_score = searcher.best_score_

print("Best Parameters: ")
# keys are unique, so sorting the (name, value) pairs orders them by name alone
for param_name, param_value in sorted(best_parameters.items()):
    print(f"{param_name}: {param_value!r}")

print()
print(f"Best CV-Test Accuracy: {best_score * 100:.1f}%")

Best Parameters: 
estimator__C: 0.01

Best CV-Test Accuracy: 63.8%


In [17]:
# Record the mean CV accuracy obtained for each grid point by the search that was just fitted.
new_dx = {
    'algorithm': 'liblinear-L1',
    'test_scores': list(searcher.cv_results_['mean_test_score']),
}
# Overwrite an existing record for this algorithm instead of appending a duplicate,
# so re-running the cell leaves `algorithms` with one entry per algorithm.
for idx, entry in enumerate(algorithms):
    if entry['algorithm'] == new_dx['algorithm']:
        algorithms[idx] = new_dx
        break
else:
    algorithms.append(new_dx)
new_dx

{'algorithm': 'liblinear-L1',
 'test_scores': [0.545,
  0.5466666666666666,
  0.545,
  0.5633333333333334,
  0.6266666666666667,
  0.6383333333333333,
  0.6383333333333333,
  0.6383333333333333]}

In [18]:
# liblinear with L2
#
# Same pipeline as before (mean-impute -> standardise -> logistic regression); only the
# penalty changes.

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
# random_state pins the solver's internal data shuffling so re-runs reproduce the same scores
logreg = LM.LogisticRegression(solver='liblinear', penalty='l2', multi_class='auto',
                               max_iter=MAX_ITER, random_state=0)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', logreg)])

# scikit-learn's C is the inverse of the regularisation strength, so search over 1 / ALPHA
param_dx = {
    'estimator__C': 1 / ALPHA,
    }

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)

# fit and report the wall-clock time in minutes
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

time: 0.08 mins


In [19]:
# Record the mean CV accuracy obtained for each grid point by the search that was just fitted.
new_dx = {
    'algorithm': 'liblinear-L2',
    'test_scores': list(searcher.cv_results_['mean_test_score']),
}
# Overwrite an existing record for this algorithm instead of appending a duplicate,
# so re-running the cell leaves `algorithms` with one entry per algorithm.
for idx, entry in enumerate(algorithms):
    if entry['algorithm'] == new_dx['algorithm']:
        algorithms[idx] = new_dx
        break
else:
    algorithms.append(new_dx)
new_dx

{'algorithm': 'liblinear-L2',
 'test_scores': [0.5433333333333333,
  0.545,
  0.54,
  0.5416666666666666,
  0.5433333333333333,
  0.5533333333333333,
  0.545,
  0.5333333333333333]}

In [20]:
# saga with L1
#
# Same pipeline as before (mean-impute -> standardise -> logistic regression); only the
# solver and penalty change.

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
# saga is a stochastic solver: fix random_state so the CV scores are reproducible
logreg = LM.LogisticRegression(solver='saga', penalty='l1', multi_class='auto',
                               max_iter=MAX_ITER, random_state=0)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', logreg)])

# scikit-learn's C is the inverse of the regularisation strength, so search over 1 / ALPHA
param_dx = {
    'estimator__C': 1 / ALPHA,
    }

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)

# fit and report the wall-clock time in minutes
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')



time: 8.41 mins


In [21]:
# Record the mean CV accuracy obtained for each grid point by the search that was just fitted.
new_dx = {
    'algorithm': 'saga-L1',
    'test_scores': list(searcher.cv_results_['mean_test_score']),
}
# Overwrite an existing record for this algorithm instead of appending a duplicate,
# so re-running the cell leaves `algorithms` with one entry per algorithm.
for idx, entry in enumerate(algorithms):
    if entry['algorithm'] == new_dx['algorithm']:
        algorithms[idx] = new_dx
        break
else:
    algorithms.append(new_dx)
new_dx

{'algorithm': 'saga-L1',
 'test_scores': [0.5333333333333333,
  0.545,
  0.55,
  0.56,
  0.6283333333333333,
  0.6383333333333333,
  0.6383333333333333,
  0.6383333333333333]}

In [22]:
# saga with L2
#
# Same pipeline as before (mean-impute -> standardise -> logistic regression); only the
# solver and penalty change.

imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
scaler = PP.StandardScaler()
# saga is a stochastic solver: fix random_state so the CV scores are reproducible
logreg = LM.LogisticRegression(solver='saga', penalty='l2', multi_class='auto',
                               max_iter=MAX_ITER, random_state=0)
pipeline = PL.Pipeline([('imputer', imputer), ('scaler', scaler), ('estimator', logreg)])

# scikit-learn's C is the inverse of the regularisation strength, so search over 1 / ALPHA
param_dx = {
    'estimator__C': 1 / ALPHA,
    }

searcher = MS.GridSearchCV(pipeline, param_dx, scoring='accuracy', cv=5)

# fit and report the wall-clock time in minutes
t0 = time.time()
searcher.fit(X_np, y_np);

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')



time: 5.50 mins


In [23]:
# Record the mean CV accuracy obtained for each grid point by the search that was just fitted.
new_dx = {
    'algorithm': 'saga-L2',
    'test_scores': list(searcher.cv_results_['mean_test_score']),
}
# Overwrite an existing record for this algorithm instead of appending a duplicate,
# so re-running the cell leaves `algorithms` with one entry per algorithm.
for idx, entry in enumerate(algorithms):
    if entry['algorithm'] == new_dx['algorithm']:
        algorithms[idx] = new_dx
        break
else:
    algorithms.append(new_dx)
new_dx

{'algorithm': 'saga-L2',
 'test_scores': [0.5333333333333333,
  0.535,
  0.54,
  0.5466666666666666,
  0.55,
  0.6083333333333333,
  0.6383333333333333,
  0.6383333333333333]}

In [24]:
# every recorded score series, in the order the records were appended
algorithms

[{'algorithm': 'liblinear-L1',
  'test_scores': [0.545,
   0.5466666666666666,
   0.545,
   0.5633333333333334,
   0.6266666666666667,
   0.6383333333333333,
   0.6383333333333333,
   0.6383333333333333]},
 {'algorithm': 'liblinear-L2',
  'test_scores': [0.5433333333333333,
   0.545,
   0.54,
   0.5416666666666666,
   0.5433333333333333,
   0.5533333333333333,
   0.545,
   0.5333333333333333]},
 {'algorithm': 'saga-L1',
  'test_scores': [0.5333333333333333,
   0.545,
   0.55,
   0.56,
   0.6283333333333333,
   0.6383333333333333,
   0.6383333333333333,
   0.6383333333333333]},
 {'algorithm': 'saga-L2',
  'test_scores': [0.5333333333333333,
   0.535,
   0.54,
   0.5466666666666666,
   0.55,
   0.6083333333333333,
   0.6383333333333333,
   0.6383333333333333]}]

In [34]:
import plotly.graph_objects as GO

# Plot mean CV accuracy against regularisation strength, one line per solver/penalty pair.
fig = GO.Figure()
colors = ['orange', 'red', 'darkcyan', 'green']

# Each test_scores list was collected from a grid over C = 1 / ALPHA, so it is plotted
# against ALPHA itself (labelled lambda on the x axis).
for idx, alg in enumerate(algorithms):
    # cycle through the palette so more than four records cannot raise IndexError
    trace_color = colors[idx % len(colors)]
    fig.add_trace(
        GO.Scatter(x = ALPHA, y = alg['test_scores'],
            name=alg['algorithm'], marker={'color': trace_color}, mode='lines+markers'))

fig.update_layout(
    title="Logistic Regression CV-Test Scores",
    xaxis_title="lambda",
    yaxis_title="Accuracy",
    width=600,
    height=400,
    margin=GO.layout.Margin(t=50, r=10, b=50, l=70, pad=0),
    paper_bgcolor="LightSteelBlue",
)
# log-scale x axis with a tick at every searched lambda
fig.update_xaxes(tickvals=ALPHA, type="log")
# fixed y range; this call stays last so the cell displays the figure it returns
fig.update_yaxes(range=[0.52, 0.66])