In [1]:
from sklearn.datasets import load_boston

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
df = load_boston()

In [19]:
len(df.data[0])

13

In [34]:
dataFrame = pd.DataFrame(data = df.data,columns = df.feature_names)
dataFrame["Price"] = df.target

In [35]:
dataFrame.sample(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
215,0.19802,0.0,10.59,0.0,0.489,6.182,42.4,3.9454,4.0,277.0,18.6,393.63,9.47,25.0
274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53,32.4
344,0.03049,55.0,3.78,0.0,0.484,6.874,28.1,6.4654,5.0,370.0,17.6,387.97,4.61,31.2
171,2.3139,0.0,19.58,0.0,0.605,5.88,97.3,2.3887,5.0,403.0,14.7,348.13,12.03,19.1
302,0.09266,34.0,6.09,0.0,0.433,6.495,18.4,5.4917,7.0,329.0,16.1,383.61,8.67,26.4


In [38]:
dataFrame.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'Price'],
      dtype='object')

In [41]:
# independent feature
X = dataFrame.iloc[:,:-1]
# dependent feature
Y = dataFrame.iloc[:,-1]

In [87]:
print(f"number of 1s :{y.value_counts().iloc[0]}")
print(f"number of 0s :{y.value_counts().iloc[1]}")

number of 1s :357
number of 0s :212


In [83]:
y.value_counts().iloc[0]

357

### Linear Regression

#### neg_mean_squared_error

###### Calculation performed
Suppose we have the following actual target values and corresponding predicted values:

Actual values: [5, 8, 12, 15]
Predicted values: [6, 9, 11, 14]

To calculate the mean squared error (MSE), we first compute the squared difference between each actual and predicted value:

Squared differences: [(5-6)^2, (8-9)^2, (12-11)^2, (15-14)^2]
[1, 1, 1, 1]

Next, we calculate the average of these squared differences:

MSE = (1 + 1 + 1 + 1) / 4 = 1

Finally, to obtain the negative mean squared error, we negate the MSE:

Negative MSE = -1

###### Explaination

The thing is that GridSearchCV, by convention, always tries to maximize its score so loss functions like MSE have to be negated.The unified scoring API always maximizes the score, so scores which need to be minimized are negated in order for the unified scoring API to work correctly. The score that is returned is therefore negated when it is a score that should be minimized and left positive if it is a score that should be maximized.
The Mean Square Error returned by sklearn.cross_validation.cross_val_score is always a negative. While being a designed decision so that the output of this function can be used for maximization given some hyperparameters, it's extremely confusing when using cross_val_score directly. At least I asked myself how a the mean of a square can possibly be negative and thought that cross_val_score was not working correctly or did not use the supplied metric. Only after digging in the sklearn source code I realized that the sign was flipped.

In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [60]:
LR = LinearRegression()

# Performing 5 crossvalidations
# default scoring metric is the coefficient of determination or R-squared (scoring="r2")
mse = cross_val_score(LR,X,Y,scoring = "neg_mean_squared_error",cv=5)

# Finding avg score of all the cross validations
mean_mse = np.mean(mse)

# -ve sign id because of the arguement neg_mean_squared_error
print(mean_mse)

-37.131807467699296


In [None]:
LR.fit()
LR.predict()

### Ridge Regression

In [48]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
# max_iter --> used for telling the max no of itterationd you can perform to 
# change the theta value

In [57]:
# Defining params to be changed

# Here alpha you should relate to Ridge regularisation / L2 regularisation
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20]}
ridge_regressor=GridSearchCV(ridge,params,scoring="neg_mean_squared_error",cv = 5)
ridge_regressor.fit(X,Y)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20]},
             scoring='neg_mean_squared_error')

In [58]:
print(ridge_regressor.best_params_)
# Because of 
print(ridge_regressor.best_score_)

{'alpha': 20}
-32.380250251825125


### Lasso Regression 

In [63]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
lasso = Lasso()
# max_iter --> used for telling the max no of itterationd you can perform to 
# change the theta value

# Defining params to be changed
# Here alpha you should relate to Lasso regularisation / L1 regularisation
params={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20]}

lasso_regressor = GridSearchCV(lasso,params,scoring="neg_mean_squared_error",cv = 5)
lasso_regressor.fit(X,Y)

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.001, 0.01, 1, 5, 10,
                                   20]},
             scoring='neg_mean_squared_error')

In [65]:
print(lasso_regressor.best_params_)
# Because of 
print(lasso_regressor.best_score_)

{'alpha': 1}
-35.53158022069485


### Difference between Ridge Lasso and Normal Regression
* Normal Regression (Ordinary Least Squares): Normal regression, also known as ordinary least squares (OLS) regression, aims to minimize the sum of squared residuals between the observed and predicted values. It assumes that all features are relevant and assigns equal importance to all of them. Normal regression can be sensitive to multicollinearity, meaning highly correlated predictor variables can impact the model's performance.

* Ridge Regression: Ridge regression is a technique that addresses multicollinearity by adding a penalty term to the loss function. This penalty term (L2 regularization) controls the complexity of the model by shrinking the coefficient values towards zero. Ridge regression can help reduce overfitting and stabilize the model by reducing the impact of less important predictors. It is particularly useful when dealing with a high number of correlated predictors.

* Lasso Regression: Lasso regression (Least Absolute Shrinkage and Selection Operator) also addresses multicollinearity but uses a different penalty term (L1 regularization). Lasso regression not only shrinks coefficient values but can also perform variable selection by driving some coefficients to exactly zero. This means that Lasso regression can automatically exclude irrelevant features from the model, providing a more interpretable and sparse solution. It is particularly effective when dealing with high-dimensional datasets with many irrelevant or redundant predictors.

### Logistic Regression
* class_weightdict or ‘balanced’, default=None {Main usage is in imbalanced datasets}

In [69]:
from sklearn.linear_model import LogisticRegression

In [66]:
from sklearn.datasets import load_breast_cancer
df=load_breast_cancer() 
### Independent features 
X=pd.DataFrame(df['data'], columns=df['feature_names']) 

In [67]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [70]:
## Dependent Feature 
y=pd.DataFrame(df[ 'target'],columns=["Target"])

In [89]:
# Checking for balanced / imbalanced dataset
y.value_counts()

Target
1         357
0         212
dtype: int64

In [90]:
## Train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
params=[{'C':[1,5,10]},{'max_iter' :[100,150]}]

In [94]:
model1=LogisticRegression(C=100,max_iter=100)
model=GridSearchCV(model1,param_grid=params,scoring='f1',cv=5)
model. fit(X_train,y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative 

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative 

GridSearchCV(cv=5, estimator=LogisticRegression(C=100),
             param_grid=[{'C': [1, 5, 10]}, {'max_iter': [100, 150]}],
             scoring='f1')

In [96]:
model.best_params_

{'max_iter': 150}

In [98]:
model.best_score_

0.9599326611113146

In [99]:
y_pred=model.predict(X_test)

In [100]:
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1])

In [101]:
from sklearn.metrics import confusion_matrix,classification_report

In [102]:
confusion_matrix(y_test,y_pred)

array([[ 64,   3],
       [  4, 117]], dtype=int64)

In [104]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9627659574468085

In [105]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95        67
           1       0.97      0.97      0.97       121

    accuracy                           0.96       188
   macro avg       0.96      0.96      0.96       188
weighted avg       0.96      0.96      0.96       188

