# Restricted Classifiers

Workflow: 10

Goal: Many classifiers are predicting only majority class results. Investigate which algorithms have to be excluded.

In [1]:
import os, sys, time
import pandas as PD
import numpy as NP

import sklearn.impute as IM
import sklearn.preprocessing as PP

import sklearn.linear_model as LM
import sklearn.naive_bayes as NB
import sklearn.neighbors as NN
import sklearn.ensemble as ES
import xgboost as XG

import plotly.graph_objects as GO
import plotly.subplots as SB

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
os.chdir(os.getenv('PWD'))
sys.path.insert(0, os.getenv('PWD'))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings.py')
import django
django.setup()

'app_proj.settings'

In [4]:
import app_proj.settings as ST
import movies.models.models as MD
import movies.models.analysis as NL

## Target & Features

In [5]:
FEATURE_FILE = os.path.join(ST.BASE_DIR, NL.FEATURE_PATH)
feature_all_df = PD.read_csv(FEATURE_FILE)
feature_all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 118.5+ MB


In [6]:
target_ls = MD.UserVotes.objects.values()
target_df = PD.DataFrame(target_ls).drop(columns=['id'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 3 columns):
Movie_ID    881 non-null int64
User        881 non-null object
Vote        881 non-null int64
dtypes: int64(2), object(1)
memory usage: 20.8+ KB


In [7]:
feature_wtarget_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==True]
feature_wtarget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 27 to 17612
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 5.9+ MB


In [8]:
feature_topred_df = feature_all_df.loc[feature_all_df['Movie_ID'].isin(target_df['Movie_ID'])==False]
feature_topred_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16787 entries, 0 to 17667
Columns: 879 entries, Movie_ID to Western
dtypes: float64(5), int64(873), object(1)
memory usage: 112.7+ MB


In [9]:
# get numeric features

X_np = NP.array(feature_wtarget_df.drop(columns=['Movie_ID', 'Title']))
y_np = NP.array(target_df['Vote'])

X_np.shape
y_np.shape

(881, 877)

(881,)

In [10]:
imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
X_impute = imputer.fit_transform(X_np)
X_impute.shape

(881, 877)

In [11]:
scaler = PP.StandardScaler()
X_scale = scaler.fit_transform(X_impute)
X_scale.shape

(881, 877)

In [12]:
X_pred = NP.array(feature_topred_df.drop(columns=['Movie_ID', 'Title']))
X_pred.shape

(16787, 877)

In [13]:
X_pred_impute = imputer.transform(X_pred)
X_pred_scale = scaler.transform(X_pred_impute)
X_pred_scale.shape

(16787, 877)

## Train Algorithms

In [14]:
# logistic regression

logreg = LM.LogisticRegression(solver='liblinear', penalty='l1', C=(1/1e3), multi_class='auto', max_iter=2000)
logreg.fit(X_scale, y_np)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
predict_logreg_np = logreg.predict(X_pred_scale)
PD.DataFrame(predict_logreg_np)[0].value_counts()

1    16787
Name: 0, dtype: int64

In [16]:
# naive bayes

nbayes = NB.GaussianNB()
nbayes.fit(X_scale, y_np)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
predict_nbayes_np = nbayes.predict(X_pred_scale)
PD.DataFrame(predict_nbayes_np)[0].value_counts()

3    11529
1     2762
2     2496
Name: 0, dtype: int64

In [18]:
# nearest neighbors

neighbor = NN.KNeighborsClassifier()
neighbor.fit(X_scale, y_np)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [19]:
predict_neighbor_np = neighbor.predict(X_pred_scale)
PD.DataFrame(predict_neighbor_np)[0].value_counts()

1    14617
2     1986
3      184
Name: 0, dtype: int64

In [20]:
# random forest

forest = ES.RandomForestClassifier(class_weight='balanced', n_estimators=50)
forest.fit(X_scale, y_np)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=50, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [21]:
predict_forest_np = forest.predict(X_pred_scale)
PD.DataFrame(predict_forest_np)[0].value_counts()

1    15993
2      753
3       41
Name: 0, dtype: int64

In [22]:
# support vector machine

svm = LM.SGDClassifier(class_weight='balanced')
svm.fit(X_scale, y_np)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [23]:
predict_svm_np = svm.predict(X_pred_scale)
PD.DataFrame(predict_svm_np)[0].value_counts()

1    10649
3     3076
2     3062
Name: 0, dtype: int64

In [24]:
# xgboost

xgb = XG.XGBClassifier(objective='multi:softmax', num_class=3, 
                       booster='gblinear', n_estimators=50, reg_alpha=0.05);
xgb.fit(X_scale, y_np)

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=50, n_jobs=1,
              nthread=None, num_class=3, objective='multi:softprob',
              random_state=0, reg_alpha=0.05, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [25]:
predict_xgb_np = xgb.predict(X_pred_scale)
PD.DataFrame(predict_xgb_np)[0].value_counts()

1    16787
Name: 0, dtype: int64

## Plot Results

In [26]:
group_logreg = PD.DataFrame(predict_logreg_np)[0].value_counts().sort_index()
group_logreg

1    16787
Name: 0, dtype: int64

In [28]:
group_nbayes = PD.DataFrame(predict_nbayes_np)[0].value_counts().sort_index()
group_nbayes
group_nbayes.index
group_nbayes.values

1     2762
2     2496
3    11529
Name: 0, dtype: int64

Int64Index([1, 2, 3], dtype='int64')

array([ 2762,  2496, 11529])

In [29]:
group_neighbor = PD.DataFrame(predict_neighbor_np)[0].value_counts().sort_index()
group_neighbor

1    14617
2     1986
3      184
Name: 0, dtype: int64

In [30]:
group_forest = PD.DataFrame(predict_forest_np)[0].value_counts().sort_index()
group_forest

1    15993
2      753
3       41
Name: 0, dtype: int64

In [31]:
group_svm = PD.DataFrame(predict_svm_np)[0].value_counts().sort_index()
group_svm 

1    10649
2     3062
3     3076
Name: 0, dtype: int64

In [32]:
group_xgb = PD.DataFrame(predict_xgb_np)[0].value_counts().sort_index()
group_xgb 

1    16787
Name: 0, dtype: int64

In [33]:
InteractiveShell.ast_node_interactivity = 'last'

In [34]:
vote_classes = [1, 2, 3]
color_ls=['crimson', 'seagreen', 'gold']

In [44]:
fig = GO.Figure()

fig.add_trace(
    GO.Bar(x=vote_classes, y=group_logreg.values, marker_color=color_ls))
fig.update_layout(
    title="User Movie Votes",
    xaxis_title="Number of Stars",
    yaxis_title="Movie Count",
    width=900,
    height=500,
    margin=GO.layout.Margin(t=50, r=20, b=50, l=70, pad=0),
    paper_bgcolor="LightSteelBlue",
)
fig.update_xaxes(tickvals=vote_classes, range=[0.4, 3.6])


In [119]:
fig = SB.make_subplots(rows=2, cols=3, vertical_spacing=0.1, 
                       subplot_titles=("Logistic Regression", "Naive Bayes", "Nearest Neighbor",
                                       "Random Forest", "SVM", "XGBoost"))

fig.add_trace(
    GO.Bar(x=vote_classes, y=group_logreg.values, marker_color=color_ls), 
    row=1, col=1)
fig.add_trace(
    GO.Bar(x=vote_classes, y=group_nbayes.values, marker_color=color_ls), 
    row=1, col=2)
fig.add_trace(
    GO.Bar(x=vote_classes, y=group_neighbor.values, marker_color=color_ls), 
    row=1, col=3)
fig.add_trace(
    GO.Bar(x=vote_classes, y=group_forest.values, marker_color=color_ls), 
    row=2, col=1)
fig.add_trace(
    GO.Bar(x=vote_classes, y=group_svm.values, marker_color=color_ls), 
    row=2, col=2)
fig.add_trace(
    GO.Bar(x=vote_classes, y=group_xgb.values, marker_color=color_ls), 
    row=2, col=3)
fig.update_layout(
    title={'text': "Predicted Movie Counts for Each Algorithm", 'x':0.5, 'y':0.99,  
        'xanchor': 'center', 'yanchor': 'top'},
    width=900,
    height=700,
    margin=GO.layout.Margin(t=50, r=20, b=50, l=70, pad=0),
    paper_bgcolor="LightSteelBlue",
    showlegend=False,
)

# extend annotations so they don't override the subplot titles

annotations = [a.to_plotly_json() for a in fig["layout"]["annotations"]]
#annotations.append(dict(x=0.5, y=-0.06, text="User Votes"))
annotations.append({'font': {'size': 16}, 'showarrow': False, 'text': 'User Votes', 
                    'x': 0.5, 'xanchor': 'center', 'xref': 'paper', 
                    'y': -0.06, 'yanchor': 'bottom', 'yref': 'paper'})
#annotations.append(dict(x=-0.05, y=0.4, text="Movies Count", textangle=-90))
annotations.append({'font': {'size': 16}, 'showarrow': False, 'text': 'Predicted Movies Count', 
                    'x': -0.05, 'xanchor': 'center', 'xref': 'paper', 
                    'y': 0.4, 'yanchor': 'bottom', 'yref': 'paper', 'textangle':-90})
fig["layout"]["annotations"] = annotations

fig.update_xaxes(tickvals=vote_classes, range=[0.4, 3.6])
fig.update_yaxes(range=[0, 17000])

In [120]:
MD.Reporter.ConvertFigureToJson(fig)    

'{"data": [{"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [16787], "type": "bar", "xaxis": "x", "yaxis": "y"}, {"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [2762, 2496, 11529], "type": "bar", "xaxis": "x2", "yaxis": "y2"}, {"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [14617, 1986, 184], "type": "bar", "xaxis": "x3", "yaxis": "y3"}, {"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [15993, 753, 41], "type": "bar", "xaxis": "x4", "yaxis": "y4"}, {"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [10649, 3062, 3076], "type": "bar", "xaxis": "x5", "yaxis": "y5"}, {"marker": {"color": ["crimson", "seagreen", "gold"]}, "x": [1, 2, 3], "y": [16787], "type": "bar", "xaxis": "x6", "yaxis": "y6"}], "layout": {"annotations": [{"font": {"size": 16}, "showarrow": false, "text": "Logistic Regression", "x": 0.14444444444444446, "xanchor": "center", "xref": "pap