In [174]:
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sns
import plotly

In [175]:
contraceptive_data = pd.read_csv("contraceptive/contraceptive_for_students.csv")
contraceptive_data

Unnamed: 0,wife_age,wife_education,husband_education,num_child,wife_religion,wife_work,husband_occupation,standard_living,media_exposure,contraceptive
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1
...,...,...,...,...,...,...,...,...,...,...
1468,33,4,4,2,1,0,2,4,0,3
1469,33,4,4,3,1,1,1,4,0,3
1470,39,3,3,8,1,0,1,4,0,3
1471,33,3,3,4,1,0,2,2,0,3


In [176]:
# Data Cleaning
from sklearn.feature_extraction import DictVectorizer
valid_entries = {'wife_education':[1, 2, 3, 4],'husband_education':[1, 2, 3, 4], 'wife_religion':[0,1], 'wife_work':[0,1], 'husband_occupation':[1, 2, 3, 4], 'standard_living':[1, 2, 3, 4], 'media_exposure':[0,1], 'contraceptive':[1,2,3]}
dummy_data = pd.DataFrame.from_dict({'wife_age':[0, 0, 0, 0], 'num_child':[0, 0, 0, 0], 'wife_education':[1, 2, 3, 4],'husband_education':[1, 2, 3, 4], 'wife_religion':[0,1, 0, 1], 'wife_work':[0, 1, 0, 1], 'husband_occupation':[1, 2, 3, 4], 'standard_living':[1, 2, 3, 4], 'media_exposure':[0 , 1, 0, 1]}) # For DictVectorizer

def clean_data(dataset):
    dataset = dataset.copy()
    # Remove rows violating constraints
    dataset = dataset.fillna(-1)
    dataset = dataset.astype(int)
    dataset = dataset[(dataset['wife_age']>=0)&(dataset['num_child']>=0)&(dataset['wife_education'].isin(valid_entries['wife_education']))&(dataset['husband_education'].isin(valid_entries['husband_education']))&(dataset['wife_religion'].isin(valid_entries['wife_religion']))&(dataset['wife_work'].isin(valid_entries['wife_work']))&(dataset['husband_occupation'].isin(valid_entries['husband_occupation']))&(dataset['standard_living'].isin(valid_entries['standard_living']))&(dataset['media_exposure'].isin(valid_entries['media_exposure']))&(dataset['contraceptive'].isin(valid_entries['contraceptive']))]
    return dataset

def split_x_y(dataset, response):
    y = np.array(dataset[response])
    x = dataset.drop(response, axis=1)
    return (x, y)

def one_hot_encode_column(dataset, column_name):
    """
    One-hot-encoder
    """
    dataset = dataset.copy()
    vec_enc = DictVectorizer()
    vec_enc.fit(dataset[[column_name]].to_dict(orient='records'))
    ohe_data = vec_enc.transform(dataset[[column_name]].to_dict(orient='records')).toarray()
    ohe_cats = vec_enc.get_feature_names()
    ohe = pd.DataFrame(ohe_data, columns=ohe_cats)
    dataset = pd.concat([dataset, ohe], axis=1)
    dataset = dataset.drop(column_name, axis=1)
    return dataset

def one_hot_encode(dataset, columns, dummy=None):
    # One-hot encode categorical variables
    dataset = dataset.copy()
    if dummy is not None:
        dataset = dataset.append(dummy, ignore_index=True)
    dataset[columns] = dataset[columns].astype(str)
    for column in columns:
        dataset = one_hot_encode_column(dataset, column)
    if dummy is not None:
        dataset = dataset.iloc[:len(dummy)]
    return dataset

def standardize(dataset, columns):
    # Standardize numerical columns
    dataset = dataset.copy()
    for column in columns:
        if dataset[column].std() > 0:
            dataset[column]=(dataset[column]-dataset[column].mean())/dataset[column].std()
    return dataset

def process_data(dataset, ohe_columns, numerical_columns, response, dummy=None):
    dataset = dataset.copy()
    dataset = clean_data(dataset)
    x, y = split_x_y(dataset, response)
    x = standardize(x, numerical_columns)
    x = one_hot_encode(x, ohe_columns)
    return (x, y)

In [181]:
# Data Splitting
from sklearn.model_selection import train_test_split
cleaned_data = clean_data(contraceptive_data) # Clean data, do not OHE or Standardize until modeling -> EDA first
train, test = train_test_split(cleaned_data, test_size=0.1) # Work only with train data during EDA
train, val = train_test_split(train, test_size=0.1)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
val = val.reset_index(drop=True)

In [182]:
# Exploratory Data Analysis - Needs Work
numerical = ['wife_age', 'num_child']
categorical = ['wife_education', 'husband_education', 'wife_religion', 'wife_work', 'husband_occupation', 'standard_living', 'media_exposure']
ordinal = ['wife_education', 'husband_education', 'standard_living']
nominal = ['wife_religion', 'wife_work', 'husband_occupation', 'media_exposure']
all_vars = numerical + categorical

In [183]:
# Model Generation - classification task -> Logistic Regression CV, Decision Trees, Random Forest, more?
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
x_train, y_train = process_data(train, categorical, numerical, 'contraceptive', dummy_data)
logistic_model = LogisticRegressionCV(multi_class='ovr')
logistic_model.fit(x_train, y_train)
decisionTree_model = DecisionTreeClassifier()
decisionTree_model.fit(x_train, y_train)
randomForest_model = RandomForestClassifier()
randomForest_model.fit(x_train, y_train)
knn_model = KNeighborsClassifier()
n_neighbors = {'n_neighbors':np.arange(1, 26)}
knn_model_gscv = GridSearchCV(knn_model, n_neighbors, cv=5)
knn_model_gscv.fit(x_train, y_train)
svm_model = SVC()
param_grid = {'C':np.linspace(1, 10, 25)}
svm_model_gscv = GridSearchCV(svm_model, param_grid, cv=5)
svm_model_gscv.fit(x_train, y_train)
models = [logistic_model, decisionTree_model, randomForest_model, knn_model_gscv, svm_model_gscv]
train_scores = [model.score(x_train, y_train) for model in models]
print(train_scores)
x_val, y_val = process_data(val, categorical, numerical, 'contraceptive', dummy_data)
val_scores = [model.score(x_val, y_val) for model in models]
print(val_scores)
chosen_model = models[np.argmax(val_scores)]

[0.5226510067114094, 0.962248322147651, 0.962248322147651, 0.5771812080536913, 0.6409395973154363]
[0.5864661654135338, 0.518796992481203, 0.5112781954887218, 0.5413533834586466, 0.5639097744360902]


In [184]:
# Model Testing
x_test, y_test = process_data(test, categorical, numerical, 'contraceptive', dummy_data)
chosen_model.score(x_test, y_test)

0.5405405405405406

In [None]:
# Interpret Results