In [1]:
#importing the Libraries
import pandas as pd

In [2]:
#Reading the Dataset
dataset=pd.read_csv("Social_Network_Ads.csv")

In [3]:
#Convert categorical(nominal or ordinal) data into numerical data and delete first column
dataset=pd.get_dummies(dataset,drop_first=True)

In [4]:
dataset.drop('User ID', axis=1)#without "axis", it throws error after execution
#axis=0 means row vice. axis=1 means column vice. We should delete column vice only

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [5]:
independent=dataset[['Age', 'EstimatedSalary', 'Gender_Male']]

In [6]:
dependent=dataset[['Purchased']]

In [7]:
#in Regression - split into training set and test set is not needed for Grid
#But in Classification - split into training set and test set should be needed for Grid
#Because, we can see, matrix as well as classificatio report
#and also, type 1 error, type 2 error, etc., needs to check
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(independent,dependent,test_size=0.30,random_state=0)

In [8]:
#Convert the data into standardized data
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test-sc.transform(X_test)

Unnamed: 0,Age,EstimatedSalary,Gender_Male
132,30.771013,86999.502799,0.007118
309,37.986695,50000.572804,1.007169
341,35.280814,74999.849828,0.007118
196,30.771013,78999.734151,1.007169
246,35.280814,50000.572804,1.007169
...,...,...,...
216,47.908257,65000.139018,0.007118
259,44.300416,130998.230361,1.007169
49,31.672973,88999.444961,1.007169
238,45.202376,81999.647394,1.007169


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = {'solver':['newton-cg', 'lbfgs', 'saga', 'liblinear'],'penalty':['l2']}
grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=3, n_jobs=-1, scoring='f1_weighted')
#Here, refit=True is used to store best model in 'grid' among all the combinations
#Here, scoring='f1_weighted' is used. Because, its suitable for balanced and imbalanced data as well as precision, recall etc.,
#fitting the model for grid search
grid.fit(X_train,Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  y = column_or_1d(y, warn=True)


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear']},
             scoring='f1_weighted', verbose=3)

In [10]:
# print best parameter after tuning. Format is print(grid.best_params_)
result=grid.cv_results_
print("The value for best parameter {}:".format(grid.best_params_))

The value for best parameter {'penalty': 'l2', 'solver': 'newton-cg'}:


In [11]:
#to view entire result table
table=pd.DataFrame.from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.199995,0.093436,0.028145,0.006261,l2,newton-cg,"{'penalty': 'l2', 'solver': 'newton-cg'}",0.835985,0.802399,0.644599,0.927778,0.890114,0.820175,0.097839,1
1,0.100017,0.042632,0.003125,0.00625,l2,lbfgs,"{'penalty': 'l2', 'solver': 'lbfgs'}",0.835985,0.802399,0.644599,0.927778,0.890114,0.820175,0.097839,1
2,0.012498,0.01169,0.00625,0.007655,l2,saga,"{'penalty': 'l2', 'solver': 'saga'}",0.835985,0.802399,0.644599,0.927778,0.890114,0.820175,0.097839,1
3,0.031248,0.017115,0.0,0.0,l2,liblinear,"{'penalty': 'l2', 'solver': 'liblinear'}",0.835985,0.802399,0.644599,0.927778,0.890114,0.820175,0.097839,1


In [12]:
y_pred=grid.predict(X_test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [13]:
from sklearn.metrics import confusion_matrix#instead of r2_score we use confusion_matrix here
cm=confusion_matrix(Y_test,y_pred)
cm

array([[ 0, 79],
       [ 0, 41]], dtype=int64)

In [14]:
from sklearn.metrics import classification_report
clf_report=classification_report(Y_test,y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
print(clf_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        79
           1       0.34      1.00      0.51        41

    accuracy                           0.34       120
   macro avg       0.17      0.50      0.25       120
weighted avg       0.12      0.34      0.17       120



In [16]:
#To know the f1_score value individually
from sklearn.metrics import f1_score
f1 = f1_score(Y_test,y_pred,average='weighted')
print("The f1_score value for best parameter {}:".format(grid.best_params_),f1)

The f1_score value for best parameter {'penalty': 'l2', 'solver': 'newton-cg'}: 0.17401656314699793


In [17]:
#Like f1_score, roc_auc_Score is also a value
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test,grid.predict_proba(X_test)[:,1])#[:,1] is the proportion value. This is important for this step
#[:,1] - : means consider all rows and 1 means consider first column

  f"X has feature names, but {self.__class__.__name__} was fitted without"


0.5

In [18]:
#https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [21]:
#we can get input from user also
age=float(input("Age:"))
salary=float(input("Salary:"))
sex=int(input("Sex Male 0 or 1:"))

Age:27
Salary:57000
Sex Male 0 or 1:0


In [22]:
result2=grid.predict([[age,salary,sex]])#result of get input from user
print("Future Predictions {}:".format(result2))

Future Predictions [1]:
