# Logistic Regression

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, plot_confusion_matrix, f1_score, recall_score, precision_score, classification_report 

**Importing Dataset**

In [22]:
cookies = pd.read_csv('../Data/cookies_clean.csv')

In [23]:
cookies.head(3)

Unnamed: 0.1,Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,pH,grams baking soda,bake time,quality,butter type,weight,chocolate,raisins,oats,nuts,peanut butter
0,0,0.25,9.5,300,15.0,136.0,8.1,0.44,12.1,8,1,15.2,0,1,0,0,0
1,1,0.23,3.3,520,34.0,113.0,8.16,0.48,8.4,7,1,12.4,0,1,0,0,0
2,3,0.18,10.5,490,41.0,124.0,8.14,0.35,10.5,7,1,12.2,1,0,0,0,0


In [24]:
cookies.drop('Unnamed: 0', axis=1, inplace=True)

**Splitting data frame**

In [25]:
X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

In [26]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=0)

In [27]:
#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [28]:
log_model = LogisticRegression(max_iter=700, multi_class='multinomial')

In [29]:
log_model.fit(X_train_scale, y_train)

LogisticRegression(max_iter=700, multi_class='multinomial')

In [30]:
y_pred = log_model.predict(X_test_scale)

In [31]:
def model_scores(y_test, y_pred):
    print('R2:', round(r2_score(y_test, y_pred), 3))
    print('Accuracy Score:', round(accuracy_score(y_test, y_pred), 3))
    print('Cohen Kappa Score:', round(cohen_kappa_score(y_test, y_pred), 3))
    print('MSE:', round(mean_squared_error(y_test, y_pred), 3))

In [32]:
model_scores(y_test, y_pred)

R2: 0.317
Accuracy Score: 0.57
Cohen Kappa Score: 0.333
MSE: 0.517


# KN Regressor

In [33]:
#with Standard Scaler
from sklearn.neighbors import KNeighborsRegressor

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

for k in range(1, 20):
    Knn = KNeighborsRegressor(n_neighbors = k)
    Knn.fit(X_train_scale, y_train)
    y_pred = Knn.predict(X_test_scale)
    print('For k=', k)
    print('R2:', round(r2_score(y_test, y_pred), 3))
    print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
    print('')

For k= 1
R2: 0.369
MSE: 0.493

For k= 2
R2: 0.408
MSE: 0.463

For k= 3
R2: 0.436
MSE: 0.441

For k= 4
R2: 0.457
MSE: 0.424

For k= 5
R2: 0.46
MSE: 0.422

For k= 6
R2: 0.458
MSE: 0.423

For k= 7
R2: 0.455
MSE: 0.426

For k= 8
R2: 0.456
MSE: 0.425

For k= 9
R2: 0.46
MSE: 0.422

For k= 10
R2: 0.466
MSE: 0.417

For k= 11
R2: 0.474
MSE: 0.411

For k= 12
R2: 0.47
MSE: 0.414

For k= 13
R2: 0.467
MSE: 0.417

For k= 14
R2: 0.464
MSE: 0.419

For k= 15
R2: 0.469
MSE: 0.415

For k= 16
R2: 0.471
MSE: 0.413

For k= 17
R2: 0.468
MSE: 0.416

For k= 18
R2: 0.468
MSE: 0.416

For k= 19
R2: 0.468
MSE: 0.416



In [34]:
#with Standard Scaler
from sklearn.neighbors import KNeighborsRegressor

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)


Knn = KNeighborsRegressor(metric='wminkowski', p=2, 
                           metric_params={'w': np.random.random(X_train.shape[1])})
Knn.fit(X_train_scale, y_train)
y_pred = Knn.predict(X_test_scale)
print('R2:', round(r2_score(y_test, y_pred), 3))
print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
print('')

R2: 0.403
MSE: 0.466



In [35]:
# with MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

for k in range(1, 20):
    Knn = KNeighborsRegressor(n_neighbors = k)
    Knn.fit(X_train_scale, y_train)
    y_pred = Knn.predict(X_test_scale)
    print('For k=', k)
    print('R2:', round(r2_score(y_test, y_pred), 3))
    print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
    print('')

For k= 1
R2: 0.341
MSE: 0.515

For k= 2
R2: 0.422
MSE: 0.452

For k= 3
R2: 0.432
MSE: 0.444

For k= 4
R2: 0.437
MSE: 0.44

For k= 5
R2: 0.446
MSE: 0.433

For k= 6
R2: 0.459
MSE: 0.423

For k= 7
R2: 0.463
MSE: 0.42

For k= 8
R2: 0.467
MSE: 0.417

For k= 9
R2: 0.466
MSE: 0.418

For k= 10
R2: 0.465
MSE: 0.418

For k= 11
R2: 0.465
MSE: 0.418

For k= 12
R2: 0.471
MSE: 0.414

For k= 13
R2: 0.473
MSE: 0.412

For k= 14
R2: 0.472
MSE: 0.413

For k= 15
R2: 0.473
MSE: 0.412

For k= 16
R2: 0.472
MSE: 0.412

For k= 17
R2: 0.473
MSE: 0.412

For k= 18
R2: 0.473
MSE: 0.412

For k= 19
R2: 0.469
MSE: 0.415



In [36]:
#with Standard Scaler and dropping more columns
from sklearn.neighbors import KNeighborsRegressor

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

for k in range(1, 20):
    Knn = KNeighborsRegressor(n_neighbors = k)
    Knn.fit(X_train_scale, y_train)
    y_pred = Knn.predict(X_test_scale)
    print('For k=', k)
    print('R2:', round(r2_score(y_test, y_pred), 3))
    print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
    print('')

For k= 1
R2: 0.369
MSE: 0.493

For k= 2
R2: 0.408
MSE: 0.463

For k= 3
R2: 0.436
MSE: 0.441

For k= 4
R2: 0.457
MSE: 0.424

For k= 5
R2: 0.46
MSE: 0.422

For k= 6
R2: 0.458
MSE: 0.423

For k= 7
R2: 0.455
MSE: 0.426

For k= 8
R2: 0.456
MSE: 0.425

For k= 9
R2: 0.46
MSE: 0.422

For k= 10
R2: 0.466
MSE: 0.417

For k= 11
R2: 0.474
MSE: 0.411

For k= 12
R2: 0.47
MSE: 0.414

For k= 13
R2: 0.467
MSE: 0.417

For k= 14
R2: 0.464
MSE: 0.419

For k= 15
R2: 0.469
MSE: 0.415

For k= 16
R2: 0.471
MSE: 0.413

For k= 17
R2: 0.468
MSE: 0.416

For k= 18
R2: 0.468
MSE: 0.416

For k= 19
R2: 0.468
MSE: 0.416



**Applying best model to new data frame**

In [37]:
#best KNN model with k = 8:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

Knn = KNeighborsRegressor(n_neighbors = 8)
Knn.fit(X_train_scale, y_train)
y_pred = Knn.predict(X_test_scale)
print('R2:', round(r2_score(y_test, y_pred), 3))
print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
print('')

R2: 0.467
MSE: 0.417



In [41]:
test = pd.read_csv('../Data/cookies_validate.csv')

In [42]:
#dropping
test.drop('id', axis=1, inplace=True)
test.drop('quality', axis=1, inplace=True)
test.drop(['density', 'crunch factor', 'aesthetic appeal'], axis=1, inplace=True)

#making dummies
#butter types
test['butter type'] = test['butter type'].replace('melted', 1).replace('cubed', 0)

#mixins
mixins_list = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter']

for x in mixins_list:
    test[x] = 0
    test[x] = test['mixins'].str.contains(x).astype(int)

test.drop('mixins', axis=1, inplace=True)

# Linear Regression

In [44]:
from sklearn.linear_model import LinearRegression

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

reg = LinearRegression()
reg.fit(X_train_scale, y_train)

y_pred = reg.predict(X_test_scale)
y_test_predict= RFR.predict(X_test)

print('R2:', round(r2_score(y_test, y_pred), 3))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

R2: 0.463
RMSE: 0.6476841166543369


# Lasso Regression

In [47]:
from sklearn import linear_model

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

model_lasso = linear_model.Lasso(alpha=0.1)
model_lasso.fit(X_train_scale, y_train) 
pred_train_lasso= model_lasso.predict(X_train_scale)
print('MSE:', mean_squared_error(y_train,pred_train_lasso))
print('R2 Score:', r2_score(y_train, pred_train_lasso))
print('')
pred_test_lasso= model_lasso.predict(X_test_scale)
print('MSE:', np.sqrt(mean_squared_error(y_test,pred_test_lasso))) 
print('R2 Score:', r2_score(y_test, pred_test_lasso))

MSE: 0.48490618680269837
R2 Score: 0.36892675729320945

MSE: 0.6931989983754325
R2 Score: 0.38516132557588123


In [48]:
from sklearn import linear_model

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

model_lasso = linear_model.Lasso(alpha=0.1)
model_lasso.fit(X_train_scale, y_train) 
pred_train_lasso= model_lasso.predict(X_train_scale)
print('MSE:', mean_squared_error(y_train,pred_train_lasso))
print('R2 Score:', r2_score(y_train, pred_train_lasso))
print('')
pred_test_lasso= model_lasso.predict(X_test_scale)
print('MSE:', mean_squared_error(y_test,pred_test_lasso)) 
print('R2 Score:', r2_score(y_test, pred_test_lasso))

MSE: 0.690929161269085
R2 Score: 0.1008015196552281

MSE: 0.7000303851541965
R2 Score: 0.10430073937533246


# Ridge Regression

In [49]:
from sklearn.linear_model import Ridge

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

rr = Ridge(alpha=0.01)
rr.fit(X_train_scale, y_train) 
pred_train_rr= rr.predict(X_train_scale)
print('MSE:', mean_squared_error(y_train,pred_train_rr))
print('R2 Score:', r2_score(y_train, pred_train_rr))

print('')

pred_test_rr= rr.predict(X_test_scale)
print('MSE:', mean_squared_error(y_test,pred_test_rr)) 
print('R2 Score:', r2_score(y_test, pred_test_rr))

MSE: 0.42664162078572304
R2 Score: 0.44475422580558255

MSE: 0.4194948354177713
R2 Score: 0.46325013615401944


In [50]:
from sklearn.linear_model import Ridge

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

rr = Ridge(alpha=0.01)
rr.fit(X_train_scale, y_train) 
pred_train_rr= rr.predict(X_train_scale)
print('MSE:', mean_squared_error(y_train,pred_train_rr))
print('R2 Score:', r2_score(y_train, pred_train_rr))

print('')

pred_test_rr= rr.predict(X_test_scale)
print('MSE:', mean_squared_error(y_test,pred_test_rr)) 
print('R2 Score:', r2_score(y_test, pred_test_rr))

MSE: 0.4266416234706396
R2 Score: 0.44475422231134176

MSE: 0.41949718747848025
R2 Score: 0.4632471266574566


# ElasticNet Regression

In [51]:
from sklearn.linear_model import ElasticNet

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train_scale, y_train) 
pred_train_enet= model_enet.predict(X_train_scale)

print('MSE:', mean_squared_error(y_train,pred_train_enet))
print('R2 Score:', r2_score(y_train,pred_train_enet))

print('')

pred_test_rr= rr.predict(X_test_scale)
print('MSE:', mean_squared_error(y_train,pred_train_enet)) 
print('R2 Score:', r2_score(y_train,pred_train_enet))

MSE: 0.4273594870150078
R2 Score: 0.44381997051770605

MSE: 0.4273594870150078
R2 Score: 0.44381997051770605


In [52]:
from sklearn.linear_model import ElasticNet

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

#scaling the x values
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(X_train_scale, y_train) 
pred_train_enet= model_enet.predict(X_train_scale)

print('MSE:', mean_squared_error(y_train,pred_train_enet))
print('R2 Score:', r2_score(y_train,pred_train_enet))

print('')

pred_test_rr= rr.predict(X_test_scale)
print('MSE:', mean_squared_error(y_train,pred_train_enet)) 
print('R2 Score:', r2_score(y_train,pred_train_enet))

MSE: 0.43913806740455
R2 Score: 0.4284909293068224

MSE: 0.43913806740455
R2 Score: 0.4284909293068224


# Random Forrests

In [53]:
from sklearn.ensemble import RandomForestRegressor

X = cookies.drop(['quality'], axis=1)
y = cookies['quality']

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

RFR = RandomForestRegressor(max_depth=14.9)

RFR.fit(X_train, y_train)

y_predict = RFR.predict(X_train)
y_test_predict= RFR.predict(X_test)

print('R2 Score', r2_score(y_test, y_test_predict))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_test_predict)))

R2 Score 0.5886771993170178
RMSE: 0.5669812928048982


In [54]:
quality_pred = RFR.predict(test)
quality_pred

ValueError: Number of features of the model must match the input. Model n_features is 15 and input n_features is 16 

In [None]:
test['quality_pred'] = quality_pred

In [None]:
test

In [None]:
test.to_csv('../Data/firsttry_sarahlisa.csv')

In [None]:
cookies['quality'].value_counts()