# Analyzing Relationships with Machine Learning

By: Oscar Ko

This notebook analyzes the HCMST 2017 dataset on relationships, published by Stanford:

https://data.stanford.edu/hcmst2017

---
---

# Imports and Data

In [82]:
import numpy as np
import pandas as pd

import warnings
# NOTE: this silences every warning raised for the rest of the session.
warnings.filterwarnings('ignore')


# Read the HCMST 2017 survey responses from the Stata file into a DataFrame.
imported_data = pd.read_stata(
    "data/HCMST 2017 fresh sample for public sharing draft v1.1.dta"
)

# Cell output: (number of rows, number of columns).
imported_data.shape

(3510, 285)

In [83]:
# Per-column summary (name, non-null count, dtype) without truncating the column list.
imported_data.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 285 columns):
 #    Column                            Non-Null Count  Dtype   
---   ------                            --------------  -----   
 0    CaseID                            3510 non-null   int16   
 1    CASEID_NEW                        3510 non-null   int32   
 2    qflag                             3510 non-null   category
 3    weight1                           2994 non-null   float64 
 4    weight1_freqwt                    2994 non-null   float32 
 5    weight2                           551 non-null    float64 
 6    weight1a                          3110 non-null   float64 
 7    weight1a_freqwt                   3110 non-null   float32 
 8    weight_combo                      3510 non-null   float32 
 9    weight_combo_freqwt               3510 non-null   float32 
 10   duration                          3510 non-null   int16   
 11   speed_flag                        3510 no

# Select Specific Features to Keep

- Both `Q4` and `w6_q4` record the gender of the subject's partner, but only `w6_q4` covers both couples who are still together AND couples who have broken up.

    - `w6_q4` will be used; `Q4` will not.

In [84]:
# Columns kept for the analysis: the respondent ID plus the selected w6_* items.
# "Q4" is intentionally absent; "w6_q4" is used in its place.
features = [
    "CASEID_NEW",
    "w6_q4",
    "w6_q5",
    "w6_q6a",
    "w6_q6b",
    "w6_q9",
    "w6_q10",
    "w6_q11",
    "w6_q12",
    "w6_q14",
]

# Take an explicit copy so `df` is independent of `imported_data`. Without it,
# later column assignments on `df` raise SettingWithCopyWarning, which the
# blanket warnings filter in the imports cell would hide.
df = imported_data[features].copy()

# Preview the first rows of the selection.
df.head()

Unnamed: 0,CASEID_NEW,w6_q4,w6_q5,w6_q6a,w6_q6b,w6_q9,w6_q10,w6_q11,w6_q12,w6_q14
0,2014039,[Partner Name] is Male,"Yes, we are a same-sex couple",No (Not Latino or Hispanic),White,26.0,HS graduate or GED,HS graduate or GED,Leans Republican,Associate degree
1,2019003,[Partner Name] is Male,,No (Not Latino or Hispanic),White,52.0,Masters degree,HS graduate or GED,Leans Republican,Bachelors degree
2,2145527,[Partner Name] is Female,,No (Not Latino or Hispanic),White,45.0,Associate degree,9th grade,Leans Democrat,7th or 8th grade
3,2648857,[Partner Name] is Female,"Yes, we are a same-sex couple",No (Not Latino or Hispanic),White,26.0,HS graduate or GED,Bachelors degree,Undecided/Independent/Other,HS graduate or GED
4,2623465,[Partner Name] is Male,,No (Not Latino or Hispanic),White,59.0,Bachelors degree,Associate degree,Strong Democrat,Masters degree


In [77]:
# Distinct answers recorded in the Q5 column (missing values included).
imported_data.loc[:, "Q5"].unique()

[NaN, 'No, we are an opposite-sex couple', 'Yes, we are a same-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [78]:
# Distinct answers recorded in the w6_q5 column (missing values included).
imported_data.loc[:, "w6_q5"].unique()

['Yes, we are a same-sex couple', NaN, 'No, we are an opposite-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [None]:
# Boolean masks: True where the respective question has no recorded answer.
filterNone_Q = imported_data["Q5"].isna()
filterNone_w6 = imported_data["w6_q5"].isna()

# Rows in which both Q5 and w6_q5 are missing.
imported_data.loc[filterNone_Q & filterNone_w6]



In [71]:
# Compare the two partner-gender questions on the raw data. `df` only holds the
# selected features and has no "Q4" column, so indexing `df["Q4"]` raises
# KeyError on a fresh Restart & Run All.
gender_answers = imported_data[["CASEID_NEW", "Q4", "w6_q4"]]

male_Q4 = gender_answers["Q4"] == "[Partner Name] is Male"
female_w6_q4 = gender_answers["w6_q4"] == "[Partner Name] is Female"

female_Q4 = gender_answers["Q4"] == "[Partner Name] is Female"
male_w6_q4 = gender_answers["w6_q4"] == "[Partner Name] is Male"

# Rows where Q4 and w6_q4 report opposite partner genders.
gender_answers[(male_Q4 & female_w6_q4) | (female_Q4 & male_w6_q4)]

Unnamed: 0,CASEID_NEW,Q4,w6_q4
