# Analyzing Relationships with Machine Learning

By: Oscar Ko

This notebook analyzes Stanford's HCMST 2017 ("How Couples Meet and Stay Together") dataset on relationships:

https://data.stanford.edu/hcmst2017

---
---

# Imports and Data

In [1]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this suppresses every Python warning for the rest of the
# session, including pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Raw survey export in Stata format; the path is relative to the notebook's
# working directory.
HCMST_2017_DTA = "data/HCMST 2017 fresh sample for public sharing draft v1.1.dta"
imported_data = pd.read_stata(HCMST_2017_DTA)

# Last expression of the cell: (number of rows, number of columns).
imported_data.shape

(3510, 285)

In [2]:
# Per-column dtype and non-null count for the full raw table.
imported_data.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 285 columns):
 #    Column                            Non-Null Count  Dtype   
---   ------                            --------------  -----   
 0    CaseID                            3510 non-null   int16   
 1    CASEID_NEW                        3510 non-null   int32   
 2    qflag                             3510 non-null   category
 3    weight1                           2994 non-null   float64 
 4    weight1_freqwt                    2994 non-null   float32 
 5    weight2                           551 non-null    float64 
 6    weight1a                          3110 non-null   float64 
 7    weight1a_freqwt                   3110 non-null   float32 
 8    weight_combo                      3510 non-null   float32 
 9    weight_combo_freqwt               3510 non-null   float32 
 10   duration                          3510 non-null   int16   
 11   speed_flag                        3510 no

# Select Specific Features to Keep

- `Q4` and `w6_q4` both contain the gender of the subject's partner, but only `w6_q4` covers couples that are still together AND couples that have broken up.

    - `w6_q4` will be used; `Q4` will not.
    
More generally, the answers to the original `Q*` questions are combined into the corresponding `w6_*` columns, so the `w6_*` versions are the ones selected below.

In [10]:
# Columns kept from the raw survey table, organised by theme. The groups are
# flattened in order, so `features` is a flat list of column names.
features = [
    column
    for group in (
        # Respondent, household and couple characteristics ---------------
        (
            "CASEID_NEW",  # respondent ID
            "w6_sex_frequency",  # sex frequency
            "ppp20072",  # how often subject attends religious services
            "pphhsize",  # household size
            "pphouse",  # type of housing
            "ppincimp",  # household income
            "ppmsacat",  # metro area
            "pprent",  # home is owned, rented, or other
            "ppwork",  # employment status
            "w6_q15a1_truncated",  # did subject grow up in the US?
            "w6_q15a4_truncated",  # country subject lived in when they met partner
            "w6_q16",  # number of relatives subject sees per month
            "w6_q17",  # number of times subject has been married
            "w6_q23",  # who earned more (in 2016, or when last together)
            "interracial_5cat",  # derived from w6_subject_race and w6_q6b
            "w6_q32",  # was an Internet service used to meet partner?
            "age_when_met",  # age when met, in years = ppage - (2017 - w6_q21a_year)
            "w6_q4",  # partner's gender
            "partyid7",  # subject's political party
            "w6_q12",  # partner's political party
            "ppgender",  # subject's gender
            "S1",  # is subject married?
            "ppage",  # subject's age
            "w6_q9",  # partner's age in 2017
            "ppeducat",  # subject's education
            "subject_yrsed",  # recode of ppeduc (education, highest degree received)
            "partner_yrsed",  # recode of w6_q10 (partner's educational attainment)
            "partner_mother_yrsed",  # recode of w6_q11 (partner's mother's education)
            "w6_subject_race",  # subject's race, based on the single-race Race_x items
            "w6_q6b",  # partner's race
            "PPREG4",  # region
            "w6_same_sex_couple_gender",  # couple type (0=straight, 1=gay, 2=lesbian)
            "w6_attraction",  # which gender(s) subject is attracted to
            "w6_q19",  # is the couple living together?
            "w6_q34",  # subject's description of the relationship quality
            "w6_identity_all",  # subject's sexual identity
            "PPT01",  # babies in household (ages 0-1)
            "PPT25",  # toddlers in household (ages 2-5)
            "PPT612",  # children in household (ages 6-12)
            "PPT1317",  # teens in household (ages 13-17)
            "PPT18OV",  # adults in household (ages 18+)
        ),
        # Year / month of each relationship stage ------------------------
        (
            "w6_q21a_year",  # year subject first met partner
            "w6_q21a_month",  # month subject first met partner
            "w6_q21b_year",  # year the romantic relationship began
            "w6_q21b_month",  # month the romantic relationship began
            "w6_q21c_year",  # year subject first lived with partner
            "w6_q21c_month",  # month subject first lived with partner
            "w6_q21d_year",  # year subject married partner
            "w6_q21d_month",  # month subject married partner
            "w6_q21e_year",  # year of breakup
            "w6_q21e_month",  # month of breakup
            "w6_q21f_year",  # year partner died
            "w6_q21f_month",  # month partner died
        ),
        # Relationship stages expressed as fractional years --------------
        (
            "year_fraction_met",  # w6_q21a_year + ((w6_q21a_month - 0.5) / 12)
            "year_fraction_relstart",  # w6_q21b_year + ((w6_q21b_month - 0.5) / 12)
            "time_from_met_to_rel",  # year_fraction_relstart - year_fraction_met
            "year_fraction_first_cohab",  # w6_q21c_year + ((w6_q21c_month - 0.5) / 12)
            "time_from_rel_to_cohab",  # year_fraction_first_cohab - year_fraction_relstart, negatives reset to zero
        ),
        # Met in person: shared background and friend connections --------
        (
            "w6_q25",  # did subject and partner attend the same high school?
            "w6_q26",  # did subject and partner attend the same college?
            "w6_q27",  # did subject and partner grow up in the same city or town?
            "w6_q28",  # did subject's parents know partner's parents before subject knew partner?
            "w6_friend_connect_1_all",  # subject knew partner's friends before meeting partner
            "w6_friend_connect_2_all",  # partner knew subject's friends before meeting subject
            "w6_friend_connect_3_all",  # subject's friends knew partner's friends before the couple met
            "w6_friend_connect_4_all",  # no prior connection between the two groups of friends
        ),
        # Met in person: who the intermediary was ------------------------
        (
            "hcm2017q24_R_cowork",  # respondent's coworker: intermediary or partner
            "hcm2017q24_R_friend",  # respondent's friend: intermediary
            "hcm2017q24_R_family",  # respondent's family: intermediary
            "hcm2017q24_R_sig_other",  # respondent's (current or past) significant other: intermediary
            "hcm2017q24_R_neighbor",  # respondent's residential neighbor: intermediary or partner
            "hcm2017q24_P_cowork",  # partner's coworker: intermediary or respondent
            "hcm2017q24_P_friend",  # partner's friend: intermediary
            "hcm2017q24_P_family",  # partner's family: intermediary
            "hcm2017q24_P_sig_other",  # partner's (current or past) significant other: intermediary
            "hcm2017q24_P_neighbor",  # partner's residential neighbor: intermediary or respondent
        ),
        # Met in person: intermediary roll-ups ---------------------------
        (
            "hcm2017q24_met_through_family",  # 1 if R_family or P_family = 1
            "hcm2017q24_met_through_friend",  # 1 if R_friend or P_friend = 1
            "hcm2017q24_met_through_as_nghbrs",  # 1 if R_neighbor or P_neighbor = 1
            "hcm2017q24_met_as_through_cowork",  # 1 if R_cowork or P_cowork = 1
        ),
        # Met in person: institutions and social settings ----------------
        (
            "hcm2017q24_school",  # met in primary or secondary school
            "hcm2017q24_college",  # met in college
            "hcm2017q24_mil",  # met during military service
            "hcm2017q24_church",  # met in or through church or religious organization
            "hcm2017q24_vol_org",  # met through voluntary organization (non-church)
            "hcm2017q24_customer",  # customer-client relationship
            "hcm2017q24_bar_restaurant",  # restaurant, or other public social gathering place
            "hcm2017q24_party",  # private party
        ),
        # Met in person: other circumstances -----------------------------
        (
            "hcm2017q24_public",  # met in public place
            "hcm2017q24_blind_date",  # met on blind date
            "hcm2017q24_vacation",  # met while on vacation
            "hcm2017q24_single_serve_nonint",  # non-Internet singles service
            "hcm2017q24_business_trip",  # met while on business trip
            "hcm2017q24_work_neighbors",  # met as work neighbors
        ),
        # Met online / dating app ----------------------------------------
        (
            "hcm2017q24_internet_other",  # Internet, not otherwise classified
            "hcm2017q24_internet_dating",  # met through Internet dating or phone app
            "hcm2017q24_internet_soc_network",  # met through Internet social networking
        ),
        (
            "hcm2017q24_internet_game",  # met through online gaming
            "hcm2017q24_internet_chat",  # met through Internet chat
            "hcm2017q24_internet_org",  # met through Internet site not mainly dedicated to dating
        ),
        (
            "hcm2017q24_met_online",  # met online, all kinds
        ),
        # To be filtered -------------------------------------------------
        (
            "qflag",  # DOV: qualification flag - remove 2
            "speed_flag",  # respondents who completed survey in under 2 min - remove under 2
            "S3",  # ever had a boyfriend or a girlfriend - remove "No"
            "w6_took_the_survey",  # whether subject took the survey or was excluded
            "partnership_status",  # filter out 4 (never had); other values: married, partner, ex
        ),
        # To be recoded --------------------------------------------------
        (
            "w6_q10",  # partner's education - RECODE THIS TO MATCH PPEDUCAT
            "w6_q11",  # partner's mother's education - RECODE THIS TO MATCH PPEDUCAT
        ),
        (
            "ppethm",  # subject is Hispanic - convert to binary
            "w6_q6a",  # partner is Hispanic - convert to binary
        ),
    )
    for column in group
]

# Take an independent copy of the selected columns. Without `.copy()`, pandas
# treats `df` as derived from `imported_data`, so assigning into it later
# goes through the chained-assignment path (SettingWithCopyWarning), and that
# warning would be hidden by the notebook's global warning filter.
# NOTE(review): presumably later cells filter/recode `df` - confirm.
df = imported_data[features].copy()

# Show every column (and every row) of displayed frames rather than an
# elided view. These options are global and stay in effect for later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preview the first three rows of the selected columns.
df.head(3)

Unnamed: 0,CASEID_NEW,w6_sex_frequency,ppp20072,pphhsize,pphouse,ppincimp,ppmsacat,pprent,ppwork,w6_q15a1_truncated,w6_q15a4_truncated,w6_q16,w6_q17,w6_q23,interracial_5cat,w6_q32,age_when_met,w6_q4,partyid7,w6_q12,ppgender,S1,ppage,w6_q9,ppeducat,subject_yrsed,partner_yrsed,partner_mother_yrsed,w6_subject_race,w6_q6b,PPREG4,w6_same_sex_couple_gender,w6_attraction,w6_q19,w6_q34,w6_identity_all,PPT01,PPT25,PPT612,PPT1317,PPT18OV,w6_q21a_year,w6_q21a_month,w6_q21b_year,w6_q21b_month,w6_q21c_year,w6_q21c_month,w6_q21d_year,w6_q21d_month,w6_q21e_year,w6_q21e_month,w6_q21f_year,w6_q21f_month,year_fraction_met,year_fraction_relstart,time_from_met_to_rel,year_fraction_first_cohab,time_from_rel_to_cohab,w6_q25,w6_q26,w6_q27,w6_q28,w6_friend_connect_1_all,w6_friend_connect_2_all,w6_friend_connect_3_all,w6_friend_connect_4_all,hcm2017q24_R_cowork,hcm2017q24_R_friend,hcm2017q24_R_family,hcm2017q24_R_sig_other,hcm2017q24_R_neighbor,hcm2017q24_P_cowork,hcm2017q24_P_friend,hcm2017q24_P_family,hcm2017q24_P_sig_other,hcm2017q24_P_neighbor,hcm2017q24_met_through_family,hcm2017q24_met_through_friend,hcm2017q24_met_through_as_nghbrs,hcm2017q24_met_as_through_cowork,hcm2017q24_school,hcm2017q24_college,hcm2017q24_mil,hcm2017q24_church,hcm2017q24_vol_org,hcm2017q24_customer,hcm2017q24_bar_restaurant,hcm2017q24_party,hcm2017q24_public,hcm2017q24_blind_date,hcm2017q24_vacation,hcm2017q24_single_serve_nonint,hcm2017q24_business_trip,hcm2017q24_work_neighbors,hcm2017q24_internet_other,hcm2017q24_internet_dating,hcm2017q24_internet_soc_network,hcm2017q24_internet_game,hcm2017q24_internet_chat,hcm2017q24_internet_org,hcm2017q24_met_online,qflag,speed_flag,S3,w6_took_the_survey,partnership_status,w6_q10,w6_q11,ppethm,w6_q6a
0,2014039,,Never,1,A one-family house detached from any other house,"$40,000 to $49,999",Metro,Owned or being bought by you or someone in you...,Working - as a paid employee,United States,United States,1.0,1.0,I earned more,no,"Yes, an Internet dating or matchmaking site (l...",30.0,[Partner Name] is Male,Leans Democrat,Leans Republican,Male,"No, I am not Married",30,26.0,Some college,14.0,12.0,12.0,White,White,Northeast,gay male couple,sexually attracted to men and women equally,,,bisexual,0,0,0,0,1,2017.0,March,2017.0,March,,,,,2017.0,June,,,2017.208374,2017.208374,0.0,,,Different High School,,No,No,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,yes,Qualified,Completed survey in over 2 minutes,Yes,took the survey,"unpartnered, has had past partner",HS graduate or GED,HS graduate or GED,"White, Non-Hispanic",No (Not Latino or Hispanic)
1,2019003,Once a month or less,Never,4,A one-family house detached from any other house,"$150,000 to $174,999",Metro,Owned or being bought by you or someone in you...,Not working - other,United States,United States,1.0,1.0,[Partner Name] earned more,no,"No, I did NOT meet [Partner Name] through the ...",21.0,[Partner Name] is Male,Not Strong Republican,Leans Republican,Female,"Yes, I am Married",55,52.0,Bachelor's degree or higher,17.0,17.0,12.0,White,White,Midwest,hetero couple,sexually attracted only to opposite gender,Yes,Excellent,heterosexual or straight,0,0,0,2,2,1983.0,May,1995.0,August,1996.0,February,1996.0,February,,,,,1983.375,1995.625,12.25,1996.125,0.5,Different High School,2.0,No,No,no,no,no,yes,yes,no,no,no,no,yes,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,Qualified,Completed survey in over 2 minutes,,took the survey,married,Masters degree,HS graduate or GED,"White, Non-Hispanic",No (Not Latino or Hispanic)
2,2145527,2 to 3 times a month,Once or twice a month,5,A one-family house detached from any other house,"$200,000 to $249,999",Metro,Owned or being bought by you or someone in you...,Working - as a paid employee,"Another country, please specify","Another country, please specify",0.0,1.0,I earned more,no,"Yes, an Internet dating or matchmaking site (l...",36.0,[Partner Name] is Female,Leans Democrat,Leans Democrat,Male,"Yes, I am Married",47,45.0,Bachelor's degree or higher,17.0,14.0,9.0,White,White,South,hetero couple,sexually attracted only to opposite gender,Yes,Good,heterosexual or straight,0,1,2,0,2,2006.0,January,2006.0,June,2006.0,July,2008.0,May,,,,,2006.041626,2006.458374,0.416748,2006.541626,0.083252,Different High School,2.0,No,No,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,yes,no,no,no,no,no,yes,Qualified,Completed survey in over 2 minutes,,took the survey,married,Associate degree,9th grade,"White, Non-Hispanic",No (Not Latino or Hispanic)


In [5]:
# Per-column dtype and non-null count for the selected feature columns.
df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 115 columns):
 #    Column                            Non-Null Count  Dtype   
---   ------                            --------------  -----   
 0    CASEID_NEW                        3510 non-null   int32   
 1    w6_sex_frequency                  2856 non-null   category
 2    ppp20072                          3394 non-null   category
 3    pphhsize                          3510 non-null   category
 4    pphouse                           3510 non-null   category
 5    ppincimp                          3510 non-null   category
 6    ppmsacat                          3510 non-null   category
 7    pprent                            3510 non-null   category
 8    ppwork                            3510 non-null   category
 9    w6_q15a1_truncated                3394 non-null   category
 10   w6_q15a4_truncated                3394 non-null   category
 11   w6_q16                            3367 no

In [77]:
# Distinct answers recorded in the original Q5 (same-sex couple?) column.
imported_data.loc[:, "Q5"].unique()

[NaN, 'No, we are an opposite-sex couple', 'Yes, we are a same-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [78]:
# Distinct answers recorded in the w6_q5 (same-sex couple?) column.
imported_data.loc[:, "w6_q5"].unique()

['Yes, we are a same-sex couple', NaN, 'No, we are an opposite-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [None]:
# Rows where the same-sex-couple question is unanswered in BOTH versions.
filterNone_Q = imported_data["Q5"].isnull()
filterNone_w6 = imported_data["w6_q5"].isnull()
both_missing = filterNone_Q & filterNone_w6

# Report the size of the subset and preview only a few rows. Displaying the
# whole filtered frame would render every matching row of the full-width raw
# table, because 'display.max_rows' is set to None in this notebook.
print(f"{both_missing.sum()} of {len(imported_data)} rows have neither Q5 nor w6_q5 answered")
imported_data[both_missing].head(10)



In [71]:
# Consistency check: does any respondent report a different partner gender in
# Q4 than in w6_q4?
#
# The comparison reads from `imported_data`, not `df`: "Q4" is not among the
# selected feature columns, so `df["Q4"]` raises KeyError when the notebook is
# run top to bottom.
partner_gender = imported_data[["CASEID_NEW", "Q4", "w6_q4"]]

male_Q4 = partner_gender["Q4"] == "[Partner Name] is Male"
female_w6_q4 = partner_gender["w6_q4"] == "[Partner Name] is Female"

female_Q4 = partner_gender["Q4"] == "[Partner Name] is Female"
male_w6_q4 = partner_gender["w6_q4"] == "[Partner Name] is Male"

# An empty result means the two columns never contradict each other.
partner_gender[(male_Q4 & female_w6_q4) | (female_Q4 & male_w6_q4)]

Unnamed: 0,CASEID_NEW,Q4,w6_q4


In [4]:
# First rows of the two source questions next to the combined w6_q20 column.
imported_data.loc[:, ["Q20", "Q20_2", "w6_q20"]].head(5)

Unnamed: 0,Q20,Q20_2,w6_q20
0,,No,No
1,,,
2,,,
3,,No,No
4,,,


In [5]:
# Last rows of the same three columns, as a second spot check.
imported_data.loc[:, ["Q20", "Q20_2", "w6_q20"]].tail(5)

Unnamed: 0,Q20,Q20_2,w6_q20
3505,,,
3506,No,,No
3507,,,
3508,,,
3509,,,


In [8]:
# The two sexual-identity questions side by side.
imported_data.loc[:, ["w6_identity", "w6_identity_2"]].head(5)

Unnamed: 0,w6_identity,w6_identity_2
0,,bisexual
1,heterosexual or straight,
2,heterosexual or straight,
3,,bisexual
4,heterosexual or straight,


In [4]:
# Check how the `female` indicator lines up with ppgender.
imported_data.loc[:, ["ppgender", "female"]].head(5)

Unnamed: 0,ppgender,female
0,Male,0
1,Female,1
2,Male,0
3,Female,1
4,Female,1


In [6]:
# Three marital-status columns side by side.
imported_data.loc[:, ["S1", "ppmarit", "w6_married"]].head(5)

Unnamed: 0,S1,ppmarit,w6_married
0,"No, I am not Married",Divorced,no
1,"Yes, I am Married",Married,yes
2,"Yes, I am Married",Married,yes
3,"No, I am not Married",Never married,no
4,"Yes, I am Married",Married,yes


In [7]:
# Exact age next to the two binned age columns.
imported_data.loc[:, ["ppage", "ppagecat", "ppagect4"]].head(5)

Unnamed: 0,ppage,ppagecat,ppagect4
0,30,25-34,30-44
1,55,55-64,45-59
2,47,45-54,45-59
3,28,25-34,18-29
4,59,55-64,45-59


In [10]:
# Subject's education: detailed level, grouped category, and years-of-education recode.
imported_data.loc[:, ["ppeduc", "ppeducat", "subject_yrsed"]].head(5)

Unnamed: 0,ppeduc,ppeducat,subject_yrsed
0,Associate degree,Some college,14.0
1,Masters degree,Bachelor's degree or higher,17.0
2,Masters degree,Bachelor's degree or higher,17.0
3,12th grade NO DIPLOMA,Less than high school,12.0
4,Bachelors degree,Bachelor's degree or higher,16.0


In [12]:
# Partner's education level next to its years-of-education recode.
imported_data.loc[:, ["w6_q10", "partner_yrsed"]].head(10)

Unnamed: 0,w6_q10,partner_yrsed
0,HS graduate or GED,12.0
1,Masters degree,17.0
2,Associate degree,14.0
3,HS graduate or GED,12.0
4,Bachelors degree,16.0
5,Bachelors degree,16.0
6,"Some college, no degree",13.0
7,Professional or Doctorate degree,20.0
8,,
9,12th grade no diploma,12.0


In [17]:
# Subject's race/ethnicity (ppethm) next to the w6 race column.
imported_data.loc[:, ["ppethm", "w6_subject_race"]].head(10)

Unnamed: 0,ppethm,w6_subject_race
0,"White, Non-Hispanic",White
1,"White, Non-Hispanic",White
2,"White, Non-Hispanic",White
3,"White, Non-Hispanic",White
4,"White, Non-Hispanic",White
5,"White, Non-Hispanic",White
6,"White, Non-Hispanic",White
7,"White, Non-Hispanic",White
8,"White, Non-Hispanic",White
9,Hispanic,Other (please specify)


In [21]:
# Distinct values of the two subject race/ethnicity columns, one per line.
print(
    *(imported_data[name].unique() for name in ("ppethm", "w6_subject_race")),
    sep="\n",
)

['White, Non-Hispanic', 'Hispanic', 'Black, Non-Hispanic', '2+ Races, Non-Hispanic', 'Other, Non-Hispanic']
Categories (5, object): ['White, Non-Hispanic' < 'Black, Non-Hispanic' < 'Other, Non-Hispanic' < 'Hispanic' < '2+ Races, Non-Hispanic']
['White', 'Other (please specify)', 'Black or African American', 'Asian or Pacific Islander', 'American Indian, Aleut, or Eskimo', NaN]
Categories (5, object): ['White' < 'Black or African American' < 'American Indian, Aleut, or Eskimo' < 'Asian or Pacific Islander' < 'Other (please specify)']


In [24]:
# Distinct answers for the partner's Hispanic origin (w6_q6a) and race
# (w6_q6b), printed as plain lists.
print(
    *([*imported_data[name].unique()] for name in ("w6_q6a", "w6_q6b")),
    sep="\n",
)

['No (Not Latino or Hispanic)', nan, 'Yes, Mexican, Mexican American, Chicano', 'Yes, Other Latino/Hispanic', 'Yes, Puerto Rican', 'Yes, Cuban', 'Refused']
['White', nan, 'Other (please specify)', 'Asian or Pacific Islander', 'Black or African American', 'American Indian, Aleut, or Eskimo', 'Refused']


In [26]:
# Raw same-sex-couple answer next to the two derived couple-type columns.
imported_data.loc[
    :, ["w6_q5", "w6_same_sex_couple", "w6_same_sex_couple_gender"]
].head(10)

Unnamed: 0,w6_q5,w6_same_sex_couple,w6_same_sex_couple_gender
0,"Yes, we are a same-sex couple",same_sex_couple,gay male couple
1,,NOT same-sex souple,hetero couple
2,,NOT same-sex souple,hetero couple
3,"Yes, we are a same-sex couple",same_sex_couple,lesbian couple
4,,NOT same-sex souple,hetero couple
5,"No, we are an opposite-sex couple",NOT same-sex souple,hetero couple
6,,NOT same-sex souple,hetero couple
7,,NOT same-sex souple,hetero couple
8,,,
9,,NOT same-sex souple,hetero couple


In [30]:
# Lift the column and row limits on displayed frames (global pandas options).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# First ten answers to the attraction question, kept as a one-column frame.
imported_data.loc[:, ["w6_attraction"]].head(10)

Unnamed: 0,w6_attraction
0,sexually attracted to men and women equally
1,sexually attracted only to opposite gender
2,sexually attracted only to opposite gender
3,"sexually attracted mostly to same gender, some..."
4,"sexually attracted mostly to opposite gender, ..."
5,sexually attracted only to same gender
6,sexually attracted only to opposite gender
7,sexually attracted only to opposite gender
8,
9,sexually attracted only to opposite gender


In [32]:
# Print the distinct attraction answers as a plain list so the full text of
# each label is visible.
print([*imported_data["w6_attraction"].unique()])

['sexually attracted to men and women equally', 'sexually attracted only to opposite gender', 'sexually attracted mostly to same gender, sometimes opposite gender', 'sexually attracted mostly to opposite gender, sometimes same gender', 'sexually attracted only to same gender', nan]


In [34]:
# w6_q19 (couple living together?) next to w6_q20.
imported_data.loc[:, ["w6_q19", "w6_q20"]].head(5)

Unnamed: 0,w6_q19,w6_q20
0,,No
1,Yes,
2,Yes,
3,,No
4,Yes,


In [36]:
# Relationship-quality answer next to the derived w6_relationship_quality column.
imported_data.loc[:, ["w6_q34", "w6_relationship_quality"]].tail(5)

Unnamed: 0,w6_q34,w6_relationship_quality
3505,Excellent,excellent
3506,Good,good
3507,Good,good
3508,Good,good
3509,Fair,fair


In [39]:
# The two sexual-identity questions next to the combined w6_identity_all column.
imported_data.loc[
    :, ["w6_identity", "w6_identity_2", "w6_identity_all"]
].head(10)

Unnamed: 0,w6_identity,w6_identity_2,w6_identity_all
0,,bisexual,bisexual
1,heterosexual or straight,,heterosexual or straight
2,heterosexual or straight,,heterosexual or straight
3,,bisexual,bisexual
4,heterosexual or straight,,heterosexual or straight
5,bisexual,,bisexual
6,heterosexual or straight,,heterosexual or straight
7,heterosexual or straight,,heterosexual or straight
8,,,
9,,heterosexual or straight,heterosexual or straight
