In [47]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score, classification_report, log_loss, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [48]:
# Load grad.csv (resolved relative to the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='grad.csv')

In [49]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [50]:
# Separate the target from the predictors without mutating `df`.
# `df.pop` removes the column in place, so re-running this cell raised a
# KeyError and left `X` as an alias of the already-mutated frame.
y = df['admit']
X = df.drop(columns=['admit'])

In [51]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=21, test_size=0.33
)
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to the test rows.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)


  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


#### Logistic Regression

In [52]:
# Candidate settings for the L2-penalised logistic regression search.
params = dict(
    penalty=['l2'],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    fit_intercept=[True, False],
    C=np.logspace(-3, 3),
)
# 10-fold cross-validated grid search, ranked by negative log loss.
gs = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='neg_log_loss',
    cv=10,
    n_jobs=5,
)
gs.fit(X_train, y_train)
gs.best_params_




{'C': 0.0517947467923121,
 'fit_intercept': True,
 'penalty': 'l2',
 'solver': 'lbfgs'}

In [53]:
# Refit a logistic regression on the training split using the settings the
# grid search selected.
model = LogisticRegression(**gs.best_params_)
model.fit(X_train, y_train)
# Second column of predict_proba, followed by the hard class predictions.
p_hat = model.predict_proba(X_test)[:, 1]
y_hat = model.predict(X_test)

In [54]:
# Mean of the test labels (the share of 1s, assuming labels are 0/1).
admit_rate = y_test.sum() / y_test.size
# Accuracy of always predicting class 0, shown next to the model's accuracy.
baseline = 1 - admit_rate
accuracy = accuracy_score(y_test, y_hat)
baseline, accuracy

(0.7424242424242424, 0.75)

#### Random Forest

In [55]:
# Search space for the random forest: split criterion, maximum tree depth and
# the number of features considered per split, with a fixed forest size.
params = {
    'n_estimators': [200],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 15),
    'max_features': ['sqrt', 'log2'],
}
# Seed the forest so repeated runs select the same hyper-parameters: an
# unseeded forest bootstraps differently on every fit, which makes the
# cross-validated scores (and therefore best_params_) vary between runs.
# The seed value matches the one used for the train/test split.
gs = GridSearchCV(
    RandomForestClassifier(random_state=21),
    param_grid=params,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
gs.best_params_




{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 'log2',
 'n_estimators': 200}

In [56]:
# Refit the tuned forest on the training split. The seed is fixed because an
# unseeded forest yields different metrics on every run; it is merged into the
# kwargs dict so a `random_state` key in best_params_ would take precedence
# instead of raising a duplicate-keyword error.
rfc = RandomForestClassifier(**{'random_state': 21, **gs.best_params_}).fit(X_train, y_train)
yhat = rfc.predict(X_test)
phat = rfc.predict_proba(X_test)[:, 1]  # second column of the class-probability matrix
# Accuracy, precision, recall and ROC AUC on the held-out rows.
accuracy_score(y_test, yhat), precision_score(y_test, yhat), recall_score(y_test, yhat), roc_auc_score(y_test, phat)

(0.7651515151515151,
 0.6666666666666666,
 0.17647058823529413,
 0.6659663865546219)

#### Naive Bayes

In [57]:
# Fit a Gaussian naive Bayes classifier on the training split and report its
# score on the held-out rows.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb.score(X_test, y_test)

0.7954545454545454

#### Conclusion: Gaussian Naive Bayes achieves the highest test-set accuracy of the three models compared.