## Classification Algorithm
## Logistic Regression - Grid

In [1]:
import pandas as pd

### Data collection

In [3]:
dataset = pd.read_csv('Social_Network_Ads.csv')

In [4]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


### Data preprocessing

In [5]:
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)

In [6]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [7]:
dataset.shape

(400, 5)

In [8]:
dataset = dataset.drop('User ID', axis=1)

### Check the No of Unique classes and its count

In [9]:
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

### Input Output Split

In [10]:
indep = dataset[['Age','EstimatedSalary','Gender_Male']]
dep = dataset[['Purchased']]

### Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(indep,dep,test_size=0.20,random_state=0)

### Standardization

In [12]:
from sklearn.preprocessing import StandardScaler
SC = StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

In [13]:
X_train

array([[ 1.92295008e+00,  2.14601566e+00,  1.02532046e+00],
       [ 2.02016082e+00,  3.78719297e-01, -9.75304830e-01],
       [-1.38221530e+00, -4.32498705e-01, -9.75304830e-01],
       [-1.18779381e+00, -1.01194013e+00, -9.75304830e-01],
       [ 1.92295008e+00, -9.25023920e-01, -9.75304830e-01],
       [ 3.67578135e-01,  2.91803083e-01, -9.75304830e-01],
       [ 1.73156642e-01,  1.46942725e-01, -9.75304830e-01],
       [ 2.02016082e+00,  1.74040666e+00,  1.02532046e+00],
       [ 7.56421121e-01, -8.38107706e-01, -9.75304830e-01],
       [ 2.70367388e-01, -2.87638347e-01, -9.75304830e-01],
       [ 3.67578135e-01, -1.71750061e-01,  1.02532046e+00],
       [-1.18475597e-01,  2.20395980e+00, -9.75304830e-01],
       [-1.47942605e+00, -6.35303205e-01, -9.75304830e-01],
       [-1.28500455e+00, -1.06988428e+00,  1.02532046e+00],
       [-1.38221530e+00,  4.07691369e-01,  1.02532046e+00],
       [-1.09058306e+00,  7.55356227e-01, -9.75304830e-01],
       [-1.47942605e+00, -2.00722133e-01

### Train Set

### Model Creation in GridSearchCV

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# param_grid - This enables searching over any sequence of parameter settings.
# Scoring - Strategy to evaluate the performance of the cross-validated model on the test set.
# n_jobs - -1 means using all processors
# CV - Determines the cross-validation splitting strategy

param_grid = {'penalty' : ['l1', 'l2', 'elasticnet'],
              'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'multi_class': ['auto', 'ovr', 'multinomial']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit = True,verbose = 3, n_jobs=-1,scoring = 'f1_weighted')

grid.fit(X_train,Y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


165 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\Lib

### Test Set

### Evaluation Metrics 
### Confusion Matix
### Classification Report

In [15]:
# print the best parameter after tuning
results = grid.cv_results_

grid_predictions = grid.predict(X_test)

from sklearn.metrics import confusion_matrix
CM = confusion_matrix(Y_test,grid_predictions)

from sklearn.metrics import classification_report
Clf_report = classification_report(Y_test,grid_predictions)

In [16]:
print("Confusion Matrix:\n",CM)

Confusion Matrix:
 [[56  2]
 [ 4 18]]


In [17]:
print("Classification Report:\n",Clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95        58
           1       0.90      0.82      0.86        22

    accuracy                           0.93        80
   macro avg       0.92      0.89      0.90        80
weighted avg       0.92      0.93      0.92        80



### ADDITIONAL : Check for the best params & f1_score 

In [18]:
from sklearn.metrics import f1_score

# Parameters : average{‘micro’, ‘macro’, ‘samples’, ‘weighted’, ‘binary
f1_macro_ave = f1_score(Y_test,grid_predictions, average='macro')

print('f1_macro_value of best parameter{}:'.format(grid.best_params_),f1_macro_ave)

f1_macro_value of best parameter{'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear'}: 0.9031476997578692


### ADDITIONAL : Check roc_auc_score -Receiver Operating Characterstic (Area Under Curve)

In [19]:
# ROC_AUC : Is a performance metric used primarily for binary classification problems to evaluate the quality of a classification model.

from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test,grid.predict_proba(X_test)[:,1])

0.9780564263322884

### Tabulation of Parameters in GridSearchCV

In [20]:
Table=pd.DataFrame(results)
Table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_multi_class,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003055,0.000327,0.0,0.0,auto,l1,lbfgs,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",,,,,,,,22
1,0.012179,0.001408,0.024789,0.007302,auto,l1,liblinear,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
2,0.002506,0.001335,0.0,0.0,auto,l1,newton-cg,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",,,,,,,,22
3,0.002626,0.000751,0.0,0.0,auto,l1,newton-cholesky,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",,,,,,,,22
4,0.002648,0.000855,0.0,0.0,auto,l1,sag,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",,,,,,,,22
5,0.012227,0.002486,0.017669,0.002446,auto,l1,saga,"{'multi_class': 'auto', 'penalty': 'l1', 'solv...",0.828694,0.79104,0.726744,0.83804,0.920683,0.82104,0.063353,17
6,0.015899,0.003038,0.019908,0.002458,auto,l2,lbfgs,"{'multi_class': 'auto', 'penalty': 'l2', 'solv...",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,3
7,0.008898,0.002048,0.021245,0.004002,auto,l2,liblinear,"{'multi_class': 'auto', 'penalty': 'l2', 'solv...",0.828694,0.79104,0.726744,0.83804,0.920683,0.82104,0.063353,17
8,0.016624,0.004112,0.020508,0.003733,auto,l2,newton-cg,"{'multi_class': 'auto', 'penalty': 'l2', 'solv...",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,3
9,0.036825,0.007346,0.015311,0.002208,auto,l2,newton-cholesky,"{'multi_class': 'auto', 'penalty': 'l2', 'solv...",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,3


### Check the Model by getting User Input

In [21]:
Age_input = int(input("AGE: "))
Salary_input = int(input("Salary: "))
Gender_input = int(input("Gender Male 0 or 1: "))

AGE:  50
Salary:  20000
Gender Male 0 or 1:  1


### Prediction

In [22]:
Prediction=grid.predict([[Age_input,Salary_input,Gender_input]]) 
Prediction


array([1], dtype=int64)