In [1]:
import numpy as np
import pandas as pd
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
df = pd.read_csv('/content/gdrive/MyDrive/machine learning/dataset/Social_Network_Ads.csv')
#df.isnull().sum()
#df.duplicated().sum()
df.drop(columns=['User ID'],inplace=True)
df['Gender'].replace(to_replace=['Male','Female'],value=[1,0],inplace=True)
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [3]:
X = df.iloc[:,:-1].values
y = df.iloc[:,3].values
print(y.shape)
print(X.shape)

(400,)
(400, 3)


In [4]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score 
from sklearn.metrics import accuracy_score
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [5]:

def normalize(X):
    '''
    function to normalize feature matrix, X
    '''
    mins = np.min(X, axis = 0)
    maxs = np.max(X, axis = 0)
    rng = maxs - mins
    norm_X = 1 - ((maxs - X)/rng)
    return norm_X
  
  
def logistic_func(beta, X):
    '''
    logistic(sigmoid) function
    '''
    return 1.0/(1 + np.exp(-np.dot(X, beta.T)))
  
  
def log_gradient(beta, X, y):
    '''
    logistic gradient function
    '''
    first_calc = logistic_func(beta, X) - y.reshape(X.shape[0], -1)
    final_calc = np.dot(first_calc.T, X)
    return final_calc
  
  
def cost_func(beta, X, y):
    '''
    cost function, J
    '''
    log_func_v = logistic_func(beta, X)
    y = np.squeeze(y)
    step1 = y * np.log(log_func_v)
    step2 = (1 - y) * np.log(1 - log_func_v)
    final = -step1 - step2
    return np.mean(final)
  
  
def grad_desc(X, y, beta, lr=.01, converge_change=.01):
    '''
    gradient descent function
    '''
    cost = cost_func(beta, X, y)
    change_cost = 1
    num_iter = 1
      
    while(change_cost > converge_change):
        old_cost = cost
        beta = beta - (lr * log_gradient(beta, X, y))
        cost = cost_func(beta, X, y)
        change_cost = old_cost - cost
        num_iter += 1
      
    return beta, num_iter 
  
  
def pred_values(beta, X):
    '''
    function to predict labels
    '''
    pred_prob = logistic_func(beta, X)
    pred_value = np.where(pred_prob >= .5, 1, 0)
    return np.squeeze(pred_value)
  
  
if __name__ == "__main__":

    X = X_train
    y = y_train  
    # normalizing feature matrix
    X = normalize(X)
      
    # stacking columns with all ones in feature matrix
    X = np.hstack((np.matrix(np.ones(X.shape[0])).T, X))

    # initial beta values
    beta = np.matrix(np.zeros(X.shape[1]))
  
    # beta values after running gradient descent
    beta, num_iter = grad_desc(X, y, beta)
  
    # estimated beta values and number of iterations
    print("Estimated regression coefficients:", beta)
    print("No. of iterations:", num_iter)
    
    # predicted labels
    X_test = normalize(X_test)
    X_test = np.hstack((np.matrix(np.ones(X_test.shape[0])).T, X_test))
    print(X_test.shape)
    Y_pred = pred_values(beta, X_test)
    acc = accuracy_score(y_test,Y_pred)
    # number of correctly predicted labels

    print("Correctly predicted labels: %s" %acc)

Estimated regression coefficients: [[-6.62190849  0.44584757  8.43569811  3.56265817]]
No. of iterations: 271
(80, 4)
Correctly predicted labels: 0.875


In [6]:
X = df.iloc[:,:-1].values
X = normalize(X)
y = df.iloc[:,3].values
print(y.shape)
print(X.shape)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test,y_pred)
print('accuracy: %s' %acc)

(400,)
(400, 3)
accuracy: 0.825
