# Analyzing Relationships with Machine Learning

By: Oscar Ko

This notebook analyzes the HCMST 2017 relationship survey dataset published by Stanford:

https://data.stanford.edu/hcmst2017

---
---

# Imports and Data

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')


# Read the HCMST 2017 Stata file into a DataFrame (path is relative to the notebook).
imported_data = pd.read_stata(
    "data/HCMST 2017 fresh sample for public sharing draft v1.1.dta"
)

# (rows, columns) of the raw table
imported_data.shape

(3510, 285)

In [2]:
# List every column of the raw table with its non-null count and dtype
# (verbose=True prevents pandas from abbreviating the column list).
imported_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 285 columns):
 #    Column                            Non-Null Count  Dtype   
---   ------                            --------------  -----   
 0    CaseID                            3510 non-null   int16   
 1    CASEID_NEW                        3510 non-null   int32   
 2    qflag                             3510 non-null   category
 3    weight1                           2994 non-null   float64 
 4    weight1_freqwt                    2994 non-null   float32 
 5    weight2                           551 non-null    float64 
 6    weight1a                          3110 non-null   float64 
 7    weight1a_freqwt                   3110 non-null   float32 
 8    weight_combo                      3510 non-null   float32 
 9    weight_combo_freqwt               3510 non-null   float32 
 10   duration                          3510 non-null   int16   
 11   speed_flag                        3510 no

# Select Specific Features to Keep

- Both `Q4` and `w6_q4` contain the gender of the subject's partner, but only `w6_q4` covers couples that are still together AND couples that have broken up.

    - `w6_q4` will be used; `Q4` will not.
    
The `Q*` columns are already combined into the corresponding `w6_*` columns, so only the `w6_*` versions are kept.

In [13]:
# Names of the raw-table columns to carry into the working frame.
# The section headings below group the columns; the trailing comment on each
# entry is a short description of what the column holds.
features = [
    
    # Usable on their own ---------------
    
    "CASEID_NEW", # respondent ID
    "w6_sex_frequency", # sex frequency
    "ppp20072", # how often subject attends religious services
    "pphhsize", # household size
    "pphouse", # type of house
    "ppincimp", # household income
    "ppmsacat", # metro area
    "pprent", # own, rent, other
    "ppwork", # employment status
    "w6_q15a1_truncated", # subject grew up in US?
    "w6_q15a4_truncated", # subject's living country when met partner
    "w6_q16", # how many relatives subject sees per month?
    "w6_q17", # how many times has subject been married?
    "w6_q23", # who earned more (in 2016 or when last together)
    "interracial_5cat", # based on w6_subject_race and w6_q6b
    "w6_q32", # did you use an Internet service to meet partner?
    "age_when_met", # age when met, in years = ppage - (2017 - w6_q21a_year)
    "w6_q4", # partner gender
    "partyid7", # subject's political party
    "w6_q12", # partner's political party
    
    # Similar features (same concept recorded in more than one column) ---
    
    "ppgender", # subject gender
    "female", # subject gender, based on ppgender
    
    "S1", # is married
    "ppmarit", # marital status
    "w6_married", # married, based on S1
    
    "ppage", # subject age
    "ppagecat", # subject age (category)
    "ppagect4", # subject age (category)
    
    "w6_q9", # partner's age in 2017
    
    "ppeduc", # subject education
    "ppeducat", # subject education (category)
    "subject_yrsed", # RECODE of ppeduc (Education (Highest Degree Received))
    
    "w6_q10", # partner's education
    "partner_yrsed", # RECODE of w6_q10 (partner's educational attainment)
    
    "w6_q14", # subject's mother's education
    "subject_mother_yrsed", # RECODE of w6_q14 (Subject's mother's educational attainment)
    
    "w6_q11", # partner's mother's education
    "partner_mother_yrsed", # RECODE of w6_q11 (partner's mother's Education)

    "ppethm", # subject race / ethnicity
    "w6_subject_race", # based on single races Race_x
    
    "w6_q6a", # partner is Hispanic
    "w6_q6b", # partner's race
    
    "PPREG4", # region
    "ppreg9", # region
    
    "w6_q5", # same sex couple
    "w6_same_sex_couple", # same sex couple
    "w6_same_sex_couple_gender", # same sex couple specific (0=straight, 1=gay, 2=lesbian)
    
    "w6_attraction", # what gender(s) subject is attracted to
    
    "w6_q19", # couple living together?
    "w6_q20", # ever lived with partner?
    
    "w6_q34", # how would you describe the quality of your relationship with partner?
    "w6_relationship_quality", # quality of existing relationships, based on w6_q34
    
    # To be combined ---------------------
    
    "w6_identity", # sexual identity (straight, gay, etc.)
    "w6_identity_2", # sexual identity (straight, gay, etc.)
    "w6_identity_all", # subject sexual identity
    
    # To be filtered ---------------------
    # NOTE(review): no columns are listed directly under this heading; presumably
    # it refers to the "Other" groups below -- TODO confirm.
    
    
    # Other: Household Ages ----------------------------
    
    "PPT01", # household member age
    "PPT25", # household member age
    "PPT612", # household member age
    "PPT1317", # household member age
    "PPT18OV", # household member age
    
    # Other: Year/Month of Relationship Stages  ---------------------------- 
    
    "w6_q21a_year", # year subject first met partner
    "w6_q21a_month", # month subject first met partner
    "w6_q21b_year", # year subject began romantic relationship w/ partner
    "w6_q21b_month", # month subject began romantic relationship w/ partner
    "w6_q21c_year", # year subject first lived with partner
    "w6_q21c_month", # month subject first lived with partner
    "w6_q21d_year", # year subject married partner
    "w6_q21d_month", # month subject married partner
    "w6_q21e_year", # year of breakup
    "w6_q21e_month", # month of breakup
    "w6_q21f_year", # year partner died
    "w6_q21f_month", # month partner died
    
    # Other: Year/Month of Relationship Stages (Fractions)  ---------------
    
    "year_fraction_met", # w6_q21a_year+((w6_q21a_month-0.5)/12)
    "year_fraction_relstart", # w6_q21b_year+((w6_q21b_month-0.5)/12)
    "time_from_met_to_rel", # year_fraction_relstart-year_fraction_met
    "year_fraction_first_cohab", # w6_q21c_year+((w6_q21c_month-0.5)/12)
    "time_from_rel_to_cohab", # year_fraction_first_cohab-year_fraction_relstart, neg reset to zero
    
    
    # Other: How met? (In-Person)  ---------------------------- 
    
    "w6_q25", # did subject and partner attend same H.S.
    "w6_q26", # did subject and partner attend same college
    "w6_q27", # did subject and partner grow up in same city or town
    "w6_q28", # did subject's parents know partner's parents before subject knew partner?
    "w6_friend_connect_1_all", # subject knew partner's friends before meeting partner
    "w6_friend_connect_2_all", # partner knew subject's friends before meeting subject
    "w6_friend_connect_3_all", # subject's friends knew partner's friends before subject and partner met
    "w6_friend_connect_4_all", # no prior connection between subject's friends and partner's friends

    # Other: Meeting people on apps  ---------------------------- 
    
    "w6_otherdate_all", # in past yr have you met anyone for dating, romance or sex (not incl current partn... [description truncated]
    "w6_how_many_all", # how many ppl did you meet (not including current partner if partnered) last yr
    "w6_otherdate_app_all", # not incl current partner (if partnered) did you use phone app last year to meet... [description truncated]
    "w6_how_many_app_all", # how many ppl did you meet (not including partner) last year through phone apps?
    "w6_number_people_met_app", # number of people met with apps in the past year

]

# Working frame: only the columns named in `features`.
# .copy() makes `df` an independent DataFrame, so any later column assignment
# on `df` is unambiguous and never flagged as writing to a slice of
# `imported_data` (such warnings are silenced globally in this notebook).
df = imported_data[features].copy()

# Show every column/row when displaying frames (the selection is wide).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preview the first three respondents.
df.head(3)

Unnamed: 0,CASEID_NEW,w6_sex_frequency,ppp20072,pphhsize,pphouse,ppincimp,ppmsacat,pprent,ppwork,w6_q15a1_truncated,w6_q15a4_truncated,w6_q16,w6_q17,w6_q23,interracial_5cat,w6_q32,age_when_met,w6_q4,partyid7,w6_q12,ppgender,female,S1,ppmarit,w6_married,ppage,ppagecat,ppagect4,w6_q9,ppeduc,ppeducat,subject_yrsed,w6_q10,partner_yrsed,w6_q14,subject_mother_yrsed,w6_q11,partner_mother_yrsed,ppethm,w6_subject_race,w6_q6a,w6_q6b,PPREG4,ppreg9,w6_q5,w6_same_sex_couple,w6_same_sex_couple_gender,w6_attraction,w6_q19,w6_q20,w6_q34,w6_relationship_quality,w6_identity,w6_identity_2,w6_identity_all,PPT01,PPT25,PPT612,PPT1317,PPT18OV,w6_q21a_year,w6_q21a_month,w6_q21b_year,w6_q21b_month,w6_q21c_year,w6_q21c_month,w6_q21d_year,w6_q21d_month,w6_q21e_year,w6_q21e_month,w6_q21f_year,w6_q21f_month,year_fraction_met,year_fraction_relstart,time_from_met_to_rel,year_fraction_first_cohab,time_from_rel_to_cohab,w6_q25,w6_q26,w6_q27,w6_q28,w6_friend_connect_1_all,w6_friend_connect_2_all,w6_friend_connect_3_all,w6_friend_connect_4_all,w6_otherdate_all,w6_how_many_all,w6_otherdate_app_all,w6_how_many_app_all,w6_number_people_met_app
0,2014039,,Never,1,A one-family house detached from any other house,"$40,000 to $49,999",Metro,Owned or being bought by you or someone in you...,Working - as a paid employee,United States,United States,1.0,1.0,I earned more,no,"Yes, an Internet dating or matchmaking site (l...",30.0,[Partner Name] is Male,Leans Democrat,Leans Republican,Male,0,"No, I am not Married",Divorced,no,30,25-34,30-44,26.0,Associate degree,Some college,14.0,HS graduate or GED,12.0,Associate degree,14.0,HS graduate or GED,12.0,"White, Non-Hispanic",White,No (Not Latino or Hispanic),White,Northeast,Mid-Atlantic,"Yes, we are a same-sex couple",same_sex_couple,gay male couple,sexually attracted to men and women equally,,No,,,,bisexual,bisexual,0,0,0,0,1,2017.0,March,2017.0,March,,,,,2017.0,June,,,2017.208374,2017.208374,0.0,,,Different High School,,No,No,no,no,no,yes,"Yes, I have met at least one person for dating...","Yes, I have met at least one person for dating...",1.0,,0.0
1,2019003,Once a month or less,Never,4,A one-family house detached from any other house,"$150,000 to $174,999",Metro,Owned or being bought by you or someone in you...,Not working - other,United States,United States,1.0,1.0,[Partner Name] earned more,no,"No, I did NOT meet [Partner Name] through the ...",21.0,[Partner Name] is Male,Not Strong Republican,Leans Republican,Female,1,"Yes, I am Married",Married,yes,55,55-64,45-59,52.0,Masters degree,Bachelor's degree or higher,17.0,Masters degree,17.0,Bachelors degree,16.0,HS graduate or GED,12.0,"White, Non-Hispanic",White,No (Not Latino or Hispanic),White,Midwest,East-North Central,,NOT same-sex souple,hetero couple,sexually attracted only to opposite gender,Yes,,Excellent,excellent,heterosexual or straight,,heterosexual or straight,0,0,0,2,2,1983.0,May,1995.0,August,1996.0,February,1996.0,February,,,,,1983.375,1995.625,12.25,1996.125,0.5,Different High School,2.0,No,No,no,no,no,yes,"No, I have not met anyone for dating, romance,...",,,,
2,2145527,2 to 3 times a month,Once or twice a month,5,A one-family house detached from any other house,"$200,000 to $249,999",Metro,Owned or being bought by you or someone in you...,Working - as a paid employee,"Another country, please specify","Another country, please specify",0.0,1.0,I earned more,no,"Yes, an Internet dating or matchmaking site (l...",36.0,[Partner Name] is Female,Leans Democrat,Leans Democrat,Male,0,"Yes, I am Married",Married,yes,47,45-54,45-59,45.0,Masters degree,Bachelor's degree or higher,17.0,Associate degree,14.0,7th or 8th grade,7.5,9th grade,9.0,"White, Non-Hispanic",White,No (Not Latino or Hispanic),White,South,South Atlantic,,NOT same-sex souple,hetero couple,sexually attracted only to opposite gender,Yes,,Good,good,heterosexual or straight,,heterosexual or straight,0,1,2,0,2,2006.0,January,2006.0,June,2006.0,July,2008.0,May,,,,,2006.041626,2006.458374,0.416748,2006.541626,0.083252,Different High School,2.0,No,No,no,no,no,yes,"No, I have not met anyone for dating, romance,...",,,,


In [14]:
# Non-null count and dtype of each selected column, to see how much is missing.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 90 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   CASEID_NEW                 3510 non-null   int32   
 1   w6_sex_frequency           2856 non-null   category
 2   ppp20072                   3394 non-null   category
 3   pphhsize                   3510 non-null   category
 4   pphouse                    3510 non-null   category
 5   ppincimp                   3510 non-null   category
 6   ppmsacat                   3510 non-null   category
 7   pprent                     3510 non-null   category
 8   ppwork                     3510 non-null   category
 9   w6_q15a1_truncated         3394 non-null   category
 10  w6_q15a4_truncated         3394 non-null   category
 11  w6_q16                     3367 non-null   float64 
 12  w6_q17                     3385 non-null   float64 
 13  w6_q23                     3394 n

In [77]:
# Distinct values recorded in the original Q5 column.
imported_data.loc[:, "Q5"].unique()

[NaN, 'No, we are an opposite-sex couple', 'Yes, we are a same-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [78]:
# Distinct values recorded in the w6_q5 column, for comparison with Q5.
imported_data.loc[:, "w6_q5"].unique()

['Yes, we are a same-sex couple', NaN, 'No, we are an opposite-sex couple']
Categories (2, object): ['Yes, we are a same-sex couple' < 'No, we are an opposite-sex couple']

In [None]:
# Boolean masks: True where the respective column has no recorded answer.
filterNone_Q = imported_data["Q5"].isna()
filterNone_w6 = imported_data["w6_q5"].isna()

# Respondents for whom BOTH Q5 and w6_q5 are missing.
imported_data.loc[filterNone_Q & filterNone_w6]



In [71]:
# Consistency check: does any respondent report a different partner gender in
# Q4 than in w6_q4?  An empty result means the two columns never contradict.
#
# The masks are built from `imported_data`, not `df`: `df` only holds the
# columns named in `features`, which does not include "Q4", so `df["Q4"]`
# raises KeyError when the notebook is run top to bottom.
male_Q4 = imported_data["Q4"] == "[Partner Name] is Male"
female_w6_q4 = imported_data["w6_q4"] == "[Partner Name] is Female"

female_Q4 = imported_data["Q4"] == "[Partner Name] is Female"
male_w6_q4 = imported_data["w6_q4"] == "[Partner Name] is Male"

# Show only the ID and the two columns being compared.
imported_data.loc[
    (male_Q4 & female_w6_q4) | (female_Q4 & male_w6_q4),
    ["CASEID_NEW", "Q4", "w6_q4"],
]

Unnamed: 0,CASEID_NEW,Q4,w6_q4


In [4]:
# First rows of the two original Q20 columns next to w6_q20.
imported_data.loc[:, ['Q20', 'Q20_2', 'w6_q20']].head()

Unnamed: 0,Q20,Q20_2,w6_q20
0,,No,No
1,,,
2,,,
3,,No,No
4,,,


In [5]:
# Last rows of the two original Q20 columns next to w6_q20.
imported_data.loc[:, ['Q20', 'Q20_2', 'w6_q20']].tail()

Unnamed: 0,Q20,Q20_2,w6_q20
3505,,,
3506,No,,No
3507,,,
3508,,,
3509,,,


In [8]:
# First rows of the two sexual-identity columns, side by side.
imported_data.loc[:, ['w6_identity', 'w6_identity_2']].head()

Unnamed: 0,w6_identity,w6_identity_2
0,,bisexual
1,heterosexual or straight,
2,heterosexual or straight,
3,,bisexual
4,heterosexual or straight,
