In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score


In [2]:
# Load the raw cheese table from the project-relative CSV and display its
# (rows, columns) shape as the cell output.
cheese = pd.read_csv('data/cheese_data.csv')
cheese.shape

(1042, 13)

In [3]:
# Keep only the single feature (MoisturePercent) and the target (FatLevel).
# .copy() makes this an independent frame, so modifying it in later cells does
# not trigger pandas' SettingWithCopyWarning.
cheese = cheese[['MoisturePercent', 'FatLevel']].copy()
cheese.head(15)

Unnamed: 0,MoisturePercent,FatLevel
0,47.0,lower fat
1,47.9,lower fat
2,54.0,lower fat
3,47.0,lower fat
4,49.4,lower fat
5,48.0,lower fat
6,52.0,lower fat
7,41.0,lower fat
8,50.0,lower fat
9,55.0,lower fat


In [4]:
# Encode the target labels as integers and drop rows with no moisture reading.
# A new frame is built with assign()/dropna() rather than mutating in place, so
# the cell is safe to re-run and does not write into a derived frame.
fat_level_codes = {'lower fat': 0, 'higher fat': 1}
cheese = (
    cheese
    .assign(
        # infer_objects() makes the object -> integer conversion explicit rather
        # than relying on replace()'s implicit downcasting, which is deprecated.
        FatLevel=cheese['FatLevel'].replace(fat_level_codes).infer_objects()
    )
    .dropna(subset=['MoisturePercent'])
)


### train_test_split method

In [5]:
# Features: every column except the target.
X = cheese.drop(columns='FatLevel')
# Target kept as a one-column DataFrame; it is flattened to 1-D before fitting.
y = cheese[['FatLevel']]
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore the reported scores) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)


In [6]:
# Default-configured logistic regression, trained on the training split.
# The one-column target is flattened to a 1-D array before fitting.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train.to_numpy().ravel())


In [7]:
# Mean accuracy of the fitted classifier on the data it was trained on.
log_reg.score(X_train, y_train)


0.8041362530413625

In [8]:
# Mean accuracy on the held-out split. score() runs predict() on X_test
# internally, so no separate predict() call is needed beforehand.
log_reg.score(X_test, y_test)


0.8106796116504854

### Manually Simulate Cross Validation

#### - Manually split cheese into five equal subsets

In [37]:
# Shuffle the row labels with a locally seeded generator so the folds are the
# same on every run, then cut them into five (nearly) equal groups, one per fold.
fold_rng = np.random.default_rng(4)
cheese_index = cheese.index.to_list()
fold_rng.shuffle(cheese_index)
df_split = np.array_split(cheese_index, 5)


#### - For each subset, treat subset as the test and the remaining data as the train
#### - Fit `LogisticRegression` and save the train and test scores

In [38]:
log_reg = LogisticRegression()

# Per-fold bookkeeping: fold label, training accuracy, held-out accuracy.
run = []
train = []
test = []
# Iterate over the folds themselves so the loop follows len(df_split).
for j, fold_labels in enumerate(df_split):
    run.append('run ' + str(j))
    # Rows whose index label is in this fold form the test set; all other rows
    # form the training set. The mask is computed once and reused.
    in_fold = cheese.index.isin(fold_labels)
    X_test = cheese.loc[in_fold].drop(columns='FatLevel')
    y_test = cheese.loc[in_fold, 'FatLevel']
    X_train = cheese.loc[~in_fold].drop(columns='FatLevel')
    y_train = cheese.loc[~in_fold, 'FatLevel']
    # Refit the same estimator object from scratch on this fold's training rows.
    log_reg.fit(X_train, y_train)
    train.append(log_reg.score(X_train, y_train))
    # score() predicts internally, so a separate predict() call is unnecessary.
    test.append(log_reg.score(X_test, y_test))


#### - Calculate the mean train and predicted test scores

In [39]:
# Gather the per-fold scores into one table, then average the two score columns.
results = dict(run=run, train_score=train, test_score=test)
results_df = pd.DataFrame(results)
results_df.loc[:, ['train_score', 'test_score']].mean()


train_score    0.797417
test_score     0.794753
dtype: float64

### Using sklearn cross_validate()

In [40]:
# Single-feature design matrix and target, each selected as a one-column DataFrame.
X, y = cheese[['MoisturePercent']], cheese[['FatLevel']]


In [41]:
# 5-fold cross-validation of the classifier, keeping the training-fold scores
# as well as the held-out scores. The target is flattened to 1-D first.
scores = cross_validate(
    estimator=log_reg,
    X=X,
    y=y.to_numpy().ravel(),
    cv=5,
    return_train_score=True,
)


In [42]:
# Tabulate the cross_validate result (a dict of per-fold arrays): one row per
# fold, one column per key.
scored_df = pd.DataFrame.from_dict(scores)
scored_df


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.007232,0.00187,0.771845,0.760341
1,0.005428,0.001175,0.805825,0.805353
2,0.00309,0.000774,0.781553,0.811436
3,0.003118,0.001085,0.829268,0.799514
4,0.003845,0.001072,0.780488,0.811665


In [43]:
# Column-wise mean over the folds (one row of scored_df per fold).
scored_df.mean()


fit_time       0.004543
score_time     0.001195
test_score     0.793796
train_score    0.797662
dtype: float64

#####
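**Note:** the `cross_validate` scores and the manual scores will not match exactly because the folds differ. The manual split shuffles the rows at random, whereas `cross_validate` with `cv = 5` on a classifier builds stratified folds without shuffling. To control how the folds are created, pass a splitter object (e.g. `KFold` or `StratifiedKFold`) as `cv`.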

###
### Example to illustrate cross_val_scores

#### `cross_val_score` returns the same values as the `test_score` column from `cross_validate`

In [16]:
# Same 5-fold evaluation as above, but returning only the held-out scores.
cross_val_scores = cross_val_score(
    estimator=log_reg,
    X=X,
    y=y.to_numpy().ravel(),
    cv=5,
)


In [17]:
# Held-out scores from cross_validate, shown as a plain list for comparison.
list(scored_df.loc[:, 'test_score'])


[0.7718446601941747,
 0.8058252427184466,
 0.7815533980582524,
 0.8292682926829268,
 0.7804878048780488]

In [18]:
# Scores from cross_val_score, shown as a plain list for comparison.
[fold_score for fold_score in cross_val_scores]


[0.7718446601941747,
 0.8058252427184466,
 0.7815533980582524,
 0.8292682926829268,
 0.7804878048780488]

In [19]:
# Element-by-element exact comparison of the two sets of held-out scores.
np.array_equal(scored_df['test_score'].to_numpy(), cross_val_scores)


True