In [1]:
import pandas as pd
import numpy as np
import codecs

I'm going to predict each user's sex (male, female, or undefined).
To do that I'll use the following features: age (taken from date_of_birth), report_weight, and current_height.

In [5]:
# Path of the input CSV report, given relative to the working directory.
file_basic = 'report_for_sergey_csv.csv'

In [6]:
# Load the raw report rows from the CSV and preview the first few of them.
df = pd.read_csv(file_basic)
df.head()

Unnamed: 0,user_id,Count,report_weight,Last_update_date,Date,date_of_birth,sex,current_height,current_weight,diet_type,activity_level
0,15,55,61,1452204000,1452204000,517953600.0,0,168,60.8,0.0,1
1,15,55,64,1452290400,1452290400,517953600.0,0,168,60.8,0.0,1
2,15,55,62,1452376800,1452376800,517953600.0,0,168,60.8,0.0,1
3,15,55,61,1452463200,1452463200,517953600.0,0,168,60.8,0.0,1
4,15,55,61,1455228000,1452549600,517953600.0,0,168,60.8,0.0,1


In [7]:
# Keep only the user key, the three candidate features and the target column.
cols_to_select = [
    'user_id',
    'date_of_birth',
    'report_weight',
    'current_height',
    'sex',
]
df = df.loc[:, cols_to_select]

In [8]:
# Collapse the per-report rows to a single row per user_id (user_id becomes the index).
# NOTE(review): GroupBy.first() presumably takes the first non-null value of each
# column rather than the first physical row — confirm against the pandas docs.
df_sorted = df.groupby(['user_id']).first()

In [10]:
# Preview the per-user frame produced by the groupby.
df_sorted.head()

Unnamed: 0_level_0,date_of_birth,report_weight,current_height,sex
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15,517953600.0,61,168,0
31,130791600.0,69,164,0
60,572551200.0,53,157,0
87,799275600.0,51,165,1
113,338158800.0,67,165,0


In [60]:
# Draw 20% of the users for fitting. A fixed random_state makes the draw
# repeatable, so the numbers computed from df_train survive a kernel restart.
# NOTE(review): fillna(0) is applied to every column, so a missing 'sex' becomes
# label 0 and a missing date_of_birth becomes epoch second 0 — confirm intended.
df_train = df_sorted.sample(n=int(len(df_sorted.index)*0.2), random_state=42).fillna(0)

In [76]:
# Replace the epoch-seconds birth timestamp with the birth year.
# The seconds are re-read from df_sorted (not modified anywhere in the visible
# notebook) instead of from df_train itself, so running this cell more than once
# gives the same result rather than re-parsing already-converted years as seconds.
birth_seconds = df_sorted.loc[df_train.index, 'date_of_birth'].fillna(0)
df_train['date_of_birth'] = pd.to_datetime(birth_seconds, unit='s').dt.year

In [77]:
# Feature matrix: the selected feature columns as a 2-D NumPy array.
feature_cols = ['date_of_birth', 'report_weight', 'current_height']
data = df_train[feature_cols].values

In [78]:
# Label vector: the 'sex' column as a 1-D array.
y = df_train['sex'].values

In [63]:
# Check the length of the label vector.
y.shape

(134,)

In [79]:
# Append a constant column of ones as the last column of the design matrix,
# so the last entry of theta plays the role of the intercept.
ones_column = np.ones(data.shape[0], dtype=int)
X = np.column_stack((data, ones_column))

In [80]:
# m = number of rows (instances), n = number of columns (features + ones column)
m, n = X.shape

In [81]:
# Show the design-matrix dimensions (rows, columns).
(m, n)

(134, 4)

In [82]:
# Start the optimisation from an all-zero parameter vector, one entry per column of X.
initial_theta = np.zeros(n)
initial_theta.shape

(4,)

In [83]:
# Linear score of every row of X under the initial parameters.
z = X @ initial_theta

In [84]:
# One score per row of X is expected here.
z.shape

(134,)

In [85]:
# sigmoid function
def sigmoid(z):
    """
    Compute the element-wise sigmoid function 1 / (1 + exp(-z)).

    The value is evaluated through exp(-|z|), which never exceeds 1, so
    inputs of large magnitude cannot overflow the exponential.

    Parameters
        z => array_like. This can be a scalar, a 1-D vector or a 2-D matrix
    Returns
        g => array_like, which has the same shape as z
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1]; a plain exp(-z) overflows for very negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)
    # Both branches share the denominator 1 + exp(-|z|).
    g = np.where(z >= 0, 1.0, decay) / (1.0 + decay)
    return g

In [86]:
# cost function
def costFunctionLogisticRegression(theta, X, y):
    """Compute cost and gradient for logistic regression.

    The cost is evaluated as mean(log(1 + exp(z)) - y * z) with z = theta @ X.T.
    This is algebraically the usual cross-entropy
    -y * log(p) - (1 - y) * log(1 - p) with p = sigmoid(z), but it never takes
    log(0) when the prediction p saturates at exactly 0 or 1.

    Parameters:
    theta => array_like, the parameters for logistic regression.
            Which is a vector of shape (n+1, ), where n is the number of features.
            n shall be increased by 1 due to using 'intercept' feature (column with ones)
    X => array_like, the input dataset of shape (m, n+1), where
            m is the total number of instances (data points)
            n is the number of features, the intercept has to be added beforehand
    y => array_like, labels for the input, which is a vector of shape (m, )
            (the cross-entropy cost assumes the labels are 0 or 1)
    Returns
    -------
    J : float
        The computed value for the cost function.
    grad : array_like
        A vector of shape (n+1, ) which is the gradient of the cost
        function with respect to theta, at the current values of theta.
    """
    m = y.size
    # linear score of every instance, shape (m, )
    z = theta @ X.T
    prediction = sigmoid(z)
    # np.logaddexp(0, z) is log(1 + exp(z)) computed without overflow
    J = (1 / m) * np.sum(np.logaddexp(0, z) - y * z)
    # partial derivatives of the cost with regards to each parameter in theta
    grad = (1 / m) * ((prediction - y) @ X)

    return J, grad

In [87]:
# Evaluate cost and gradient at the all-zero starting point and show the cost.
cost, grad = costFunctionLogisticRegression(initial_theta, X, y)
cost

0.69314718055994529

In [88]:
# Gradient of the cost at initial_theta, one entry per column of X.
grad

array([  3.27399254e+02,   7.74253731e+00,   2.68208955e+01,
         1.64179104e-01])

In [94]:
from scipy import optimize

In [96]:
# Fit theta by minimising the logistic cost with SciPy's truncated Newton (TNC) solver.
options = {'maxiter': 400}
res = optimize.minimize(
    fun=costFunctionLogisticRegression,
    x0=initial_theta,
    args=(X, y),
    method='TNC',
    jac=True,  # the objective returns (cost, gradient) together
    options=options,
)
# res.x holds the optimised parameters, res.fun the cost reached there
theta, cost = res.x, res.fun
print('theta', theta, '\n',
         'cost', cost)

theta [-0.00605901  0.02174216  0.05362581  0.85858352] 
 cost 0.584953133199
