## Import Libraries

In [28]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence/deprecation warnings
# raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, log_loss

## Import Data

In [29]:
# Load the breast-cancer dataset bundled with scikit-learn and show the
# distinct values that occur in its target vector.
data = load_breast_cancer()
set(data.target)

{0, 1}

## Handling the data 

In [30]:
# Features go into a DataFrame labelled with the dataset's feature names;
# the target is kept exactly as the loader returns it.
feature_names = data['feature_names']
x = pd.DataFrame(data['data'], columns=feature_names)
y = data['target']
print(x.shape, y.shape)

(569, 30) (569,)


In [31]:
# Class balance: number of samples per target label.
pd.Series(y, name=0).value_counts()

1    357
0    212
Name: 0, dtype: int64

## Split

In [32]:
from sklearn.model_selection import train_test_split

# Two successive splits that share the same settings (10% held out, seed 0):
#   1. Train - Test:       carve the test set out of the full data.
#   2. Train - Validation: carve the validation set out of what remains.
split_kwargs = dict(test_size=0.1, random_state=0)
X, x_test, Y, y_test = train_test_split(x, y, **split_kwargs)
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, **split_kwargs)

# Underfitting

#### Fit

In [33]:
from sklearn.linear_model import LogisticRegression

# Fit with the liblinear solver capped at a single iteration; this section
# uses that deliberately tiny budget as its underfitting example.
model1 = LogisticRegression(solver='liblinear', max_iter=1).fit(x_train, y_train)
model1

LogisticRegression(max_iter=1, solver='liblinear')

#### Prediction

In [34]:
# Training split: hard class labels, then per-class probabilities.
y_tr = model1.predict(x_train)
y_tr_ = model1.predict_proba(x_train)

# Validation split: same two outputs.
y_pred = model1.predict(x_valid)
y_pred_ = model1.predict_proba(x_valid)

#### Evaluation

In [35]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, as the
# log_loss call already does. The previous calls passed predictions first;
# these three metrics are symmetric, so the printed numbers do not change.
# With 0/1 labels and hard predictions, MSE and MAE both equal 1 - ACC.
print('Train - ACC:', accuracy_score(y_train, y_tr), '\tlog_loss:', log_loss(y_train, y_tr_))
print('Train - MSE:', mean_squared_error(y_train, y_tr), '\tMAE:', mean_absolute_error(y_train, y_tr))

Train - ACC: 0.8521739130434782 	log_loss: 0.5119210488714548
Train - MSE: 0.14782608695652175 	MAE: 0.14782608695652175


In [36]:
# scikit-learn metrics take (y_true, y_pred): ground truth first, as the
# log_loss call already does. The previous calls passed predictions first;
# these three metrics are symmetric, so the printed numbers do not change.
# With 0/1 labels and hard predictions, MSE and MAE both equal 1 - ACC.
print('Valid - ACC:', accuracy_score(y_valid, y_pred), '\tlog_loss:', log_loss(y_valid, y_pred_))
print('Valid - MSE:', mean_squared_error(y_valid, y_pred), '\tMAE:', mean_absolute_error(y_valid, y_pred))

Valid - ACC: 0.9038461538461539 	log_loss: 0.47902080161576205
Valid - MSE: 0.09615384615384616 	MAE: 0.09615384615384616


## How to fix underfitting!

**First Solution: Increasing Number of Iterations**

In [40]:
# Fit
# Changing the number of iterations from 1 to 100
model1 = LogisticRegression(max_iter=100, solver='liblinear')
model1.fit(x_train, y_train)

# Prediction: hard labels and class probabilities for each split
y_tr = model1.predict(x_train)
y_tr_ = model1.predict_proba(x_train)
y_pred = model1.predict(x_valid)
y_pred_ = model1.predict_proba(x_valid)

# Evaluation
# Metrics are called as (y_true, y_pred), the order scikit-learn documents.
# One loop replaces the copy-pasted Train/Valid print pairs; the printed text
# and its order (Train ACC, Train MSE, Valid ACC, Valid MSE) are unchanged.
for split_name, y_true, y_hat, y_prob in (
    ('Train', y_train, y_tr, y_tr_),
    ('Valid', y_valid, y_pred, y_pred_),
):
    print(f'{split_name} - ACC:', accuracy_score(y_true, y_hat), '\tlog_loss:', log_loss(y_true, y_prob))
    print(f'{split_name} - MSE:', mean_squared_error(y_true, y_hat), '\tMAE:', mean_absolute_error(y_true, y_hat))

Train - ACC: 0.9478260869565217 	log_loss: 0.10454861234457294
Train - MSE: 0.05217391304347826 	MAE: 0.05217391304347826
Valid - ACC: 0.9807692307692307 	log_loss: 0.05077851299447521
Valid - MSE: 0.019230769230769232 	MAE: 0.019230769230769232


**Second Solution: Reducing The Number of Features**

In [41]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

x = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']
print(x.shape, y.shape)

# Split BEFORE selecting features. Fitting SelectKBest on the full data lets
# the validation/test labels influence which features are kept (data leakage).
# Split sizes and seed are the same as in the earlier cells.
# Train - Test
X_all, x_test_all, Y, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
# Train - Validation
x_train_all, x_valid_all, y_train, y_valid = train_test_split(X_all, Y, test_size=0.1, random_state=0)

# reducing features by half: scores are computed from the training rows only,
# then the same fitted selector is applied to every split.
selector = SelectKBest(chi2, k=15).fit(x_train_all, y_train)
x_new = selector.transform(x)
print(x_new.shape)

X = selector.transform(X_all)
x_test = selector.transform(x_test_all)
x_train = selector.transform(x_train_all)
x_valid = selector.transform(x_valid_all)

# fit
model1 = LogisticRegression(max_iter=100, solver='liblinear')
model1.fit(x_train, y_train)

# Prediction: hard labels and class probabilities for each split
y_tr = model1.predict(x_train)
y_tr_ = model1.predict_proba(x_train)
y_pred = model1.predict(x_valid)
y_pred_ = model1.predict_proba(x_valid)

# Evaluation
# Metrics are called as (y_true, y_pred), the order scikit-learn documents.
for split_name, y_true, y_hat, y_prob in (
    ('Train', y_train, y_tr, y_tr_),
    ('Valid', y_valid, y_pred, y_pred_),
):
    print(f'{split_name} - ACC:', accuracy_score(y_true, y_hat), '\tlog_loss:', log_loss(y_true, y_prob))
    print(f'{split_name} - MSE:', mean_squared_error(y_true, y_hat), '\tMAE:', mean_absolute_error(y_true, y_hat))


(569, 30) (569,)
(569, 15)
Train - ACC: 0.9478260869565217 	log_loss: 0.10454861234457294
Train - MSE: 0.05217391304347826 	MAE: 0.05217391304347826
Valid - ACC: 0.9807692307692307 	log_loss: 0.05077851299447521
Valid - MSE: 0.019230769230769232 	MAE: 0.019230769230769232
