In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics



In [3]:
def unique_observations(dataset, header, method=1):
    """
    Get the unique observations in one column of a loaded pandas DataFrame.

    Failures are deliberately best-effort: any exception raised while reading
    the column (for example an unknown header) is reported on stdout and
    signalled to the caller by a ``None`` return value instead of propagating.

    :param dataset: pandas DataFrame holding the data
    :param header: label of the column whose distinct values are wanted
    :param method: 1 (default) uses ``pandas.unique``; 0 uses ``numpy.unique``
    :return: array of the distinct values, or None when ``method`` is neither
             0 nor 1 or when the lookup failed
    """
    try:
        if method == 0:
            # With Numpy
            observations = np.unique(dataset[[header]])
        elif method == 1:
            # With Pandas
            observations = pd.unique(dataset[header].values.ravel())
        else:
            observations = None
            print("Wrong method type, Use 1 for pandas and 0 for numpy")
    except Exception as e:
        observations = None
        # Format the exception object itself: `e.message` is deprecated and
        # absent on Python 3, which would make this handler raise in turn.
        print("Error: {error_msg} \n Please check the inputs once..!".format(error_msg=e))
    return observations


def feature_target_frequency_relation(dataset, f_t_headers):

    """
    Count how often each target value occurs for every unique feature value.

    Works for any number of distinct target values (not only binary targets);
    for a two-class target the result is the same mapping as before.

    :param dataset: pandas DataFrame holding both columns
    :param f_t_headers: two-item sequence ``[feature_header, target_header]``
    :return: dictionary mapping each unique feature observation to a
             dictionary of ``{target value: row count}``
    :raises ValueError: if the unique values of either column could not be
                        determined (``unique_observations`` returned None)
    """

    feature_header, target_header = f_t_headers[0], f_t_headers[1]

    feature_unique_observations = unique_observations(dataset, feature_header)
    unique_targets = unique_observations(dataset, target_header)
    if feature_unique_observations is None or unique_targets is None:
        # The helper reports failures by returning None; fail here with a
        # clear message instead of an opaque "NoneType" error further down.
        raise ValueError(
            "Could not read unique observations for headers {0!r}".format(list(f_t_headers[:2])))

    # Build each target mask once instead of once per (feature, target) pair.
    feature_column = dataset[feature_header]
    target_masks = [(target_value, dataset[target_header] == target_value)
                    for target_value in unique_targets]

    frequencies = {}
    for feature in feature_unique_observations:
        feature_mask = feature_column == feature
        # int(...) keeps the counts plain Python integers, as len() produced.
        frequencies[feature] = dict(
            (target_value, int((feature_mask & target_mask).sum()))
            for target_value, target_mask in target_masks)
    return frequencies

In [8]:
# Location of the CSV file that main() loads with pd.read_csv; this is a
# relative path, so it is resolved against the current working directory.
DATA_SET_PATH="anes_dataset.csv"

def dataset_headers(dataset):
    """
    List the column labels of a DataFrame.
    :param dataset: pandas DataFrame
    :return: list of the column labels, in column order
    """
    column_labels = dataset.columns.values
    return [label for label in column_labels]

def train_logistic_regression(train_x,train_y):
    """
    Fit a LogisticRegression model (default settings) on the training data.
    :param train_x: training features
    :param train_y: training targets
    :return: the fitted LogisticRegression instance
    """
    # fit() hands back the estimator it was called on, so the freshly
    # constructed model can be fitted and returned in one expression.
    return LogisticRegression().fit(train_x, train_y)

def model_accuracy(trained_model, features, targets):
    """
    Score a fitted model on the given data.
    :param trained_model: fitted model exposing ``score(features, targets)``
    :param features: feature rows to evaluate on
    :param targets: expected targets for those rows
    :return: whatever ``trained_model.score`` returns for these inputs
    """
    return trained_model.score(features, targets)
 

def main(random_state=42):
    """
    Load the dataset, fit a logistic regression and report its accuracy.

    Steps: read the CSV at DATA_SET_PATH, split it 70/30 into train and test
    sets, print the split shapes and the target frequencies of the first
    training feature, fit the model on the training split, and print the
    accuracy on both splits.

    :param random_state: seed passed to train_test_split so the split, and
                         therefore the reported accuracies, are reproducible
                         from run to run; pass None for an unseeded split
    :return: None (results are printed)
    """
    dataset = pd.read_csv(DATA_SET_PATH)

    # Each print takes one pre-formatted string so the text is the same
    # whether print is the Python 2 statement or the Python 3 function.
    print("Number of Obs :: {0}".format(len(dataset)))

    training_features = ['TVnews', 'PID', 'age', 'educ', 'income']
    target = 'vote'

    train_x, test_x, train_y, test_y = train_test_split(
        dataset[training_features], dataset[target],
        train_size=0.7, random_state=random_state)
    print("train_x size ::  {0}".format(train_x.shape))
    print("train_y size ::  {0}".format(train_y.shape))

    print("test_x size ::  {0}".format(test_x.shape))
    print("test_y size ::  {0}".format(test_y.shape))

    print(training_features)

    # Label the frequency table with the feature actually analysed; the old
    # fixed "edu_" label was wrong because the first feature is 'TVnews'.
    frequency_feature = training_features[0]
    print("{0}_target_frequencies ::  {1}".format(
        frequency_feature,
        feature_target_frequency_relation(dataset, [frequency_feature, target])))

    trained_logistic_regression_model = train_logistic_regression(train_x, train_y)

    train_accuracy = model_accuracy(trained_logistic_regression_model, train_x, train_y)
    test_accuracy = model_accuracy(trained_logistic_regression_model, test_x, test_y)

    print("Train Accuracy ::  {0}".format(train_accuracy))
    print("Test Accuracy ::  {0}".format(test_accuracy))

# Run the whole workflow only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Number of Obs :: 944
train_x size ::  (660, 5)
train_y size ::  (660L,)
test_x size ::  (284, 5)
test_y size ::  (284L,)
['TVnews', 'PID', 'age', 'educ', 'income']
edu_target_frequencies ::  {0: {0: 94, 1: 67}, 1: {0: 54, 1: 46}, 2: {0: 59, 1: 53}, 3: {0: 70, 1: 31}, 4: {0: 37, 1: 29}, 5: {0: 52, 1: 32}, 6: {0: 13, 1: 19}, 7: {0: 172, 1: 116}}
Train Accuracy ::  0.895454545455
Test Accuracy ::  0.926056338028
