In [1]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import statsmodels.api as sm

pd.options.display.max_rows = 1000 # show truncated results

In [2]:
# read data (csv)
# ISO-8859-1 (Latin-1) maps every possible byte to a character, so decoding
# cannot fail even if the file is not valid UTF-8.
data = pd.read_csv('../data/data.csv', encoding="ISO-8859-1") # error-free encoding
data.head(5)

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


In [3]:
# ============ Data Engineering ==================

In [4]:
pd.crosstab(index=data['match'],columns="count")

col_0,count
match,Unnamed: 1_level_1
0,6998
1,1380


In [5]:
# Print every raw column name together with its position; later cells select
# columns from `data` by position.
for position_and_name in enumerate(data.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'id')
(2, 'gender')
(3, 'idg')
(4, 'condtn')
(5, 'wave')
(6, 'round')
(7, 'position')
(8, 'positin1')
(9, 'order')
(10, 'partner')
(11, 'pid')
(12, 'match')
(13, 'int_corr')
(14, 'samerace')
(15, 'age_o')
(16, 'race_o')
(17, 'pf_o_att')
(18, 'pf_o_sin')
(19, 'pf_o_int')
(20, 'pf_o_fun')
(21, 'pf_o_amb')
(22, 'pf_o_sha')
(23, 'dec_o')
(24, 'attr_o')
(25, 'sinc_o')
(26, 'intel_o')
(27, 'fun_o')
(28, 'amb_o')
(29, 'shar_o')
(30, 'like_o')
(31, 'prob_o')
(32, 'met_o')
(33, 'age')
(34, 'field')
(35, 'field_cd')
(36, 'undergra')
(37, 'mn_sat')
(38, 'tuition')
(39, 'race')
(40, 'imprace')
(41, 'imprelig')
(42, 'from')
(43, 'zipcode')
(44, 'income')
(45, 'goal')
(46, 'date')
(47, 'go_out')
(48, 'career')
(49, 'career_c')
(50, 'sports')
(51, 'tvsports')
(52, 'exercise')
(53, 'dining')
(54, 'museums')
(55, 'art')
(56, 'hiking')
(57, 'gaming')
(58, 'clubbing')
(59, 'reading')
(60, 'tv')
(61, 'theater')
(62, 'movies')
(63, 'concerts')
(64, 'music')
(65, 'shopping')
(66, 'yoga')
(67,

In [6]:
# count nulls
data.isnull().sum()

iid            0
id             1
gender         0
idg            0
condtn         0
wave           0
round          0
position       0
positin1    1846
order          0
partner        0
pid           10
match          0
int_corr     158
samerace       0
age_o        104
race_o        73
pf_o_att      89
pf_o_sin      89
pf_o_int      89
pf_o_fun      98
pf_o_amb     107
pf_o_sha     129
dec_o          0
attr_o       212
sinc_o       287
intel_o      306
fun_o        360
amb_o        722
shar_o      1076
like_o       250
prob_o       318
met_o        385
age           95
field         63
field_cd      82
undergra    3464
mn_sat      5245
tuition     4795
race          63
imprace       79
imprelig      79
from          79
zipcode     1064
income      4099
goal          79
date          97
go_out        79
career        89
career_c     138
sports        79
tvsports      79
exercise      79
dining        79
museums       79
art           79
hiking        79
gaming        79
clubbing      

In [7]:
# keep relevant columns
# Columns are selected by name instead of by position. The positional version
# depended on the CSV column order and was not re-runnable: executed a second
# time on the already-narrowed frame it silently picked different columns.
# The selected set and its order are unchanged.
# NOTE(review): the stated-preference block starts at sinc1_1. attr1_1 is NOT
# selected, although the previous comment labelled that slice "attr1_1".
# Adding it here would shift the positional slices used by later cells, so it
# is only flagged - confirm whether the omission is intended.
RELEVANT_COLUMNS = [
    # identifiers / design
    'iid', 'gender', 'wave',
    # pair-level information
    'pid', 'match', 'int_corr', 'samerace',
    # ratings received from the partner (obj measure from partner)
    'attr_o', 'sinc_o', 'intel_o', 'fun_o', 'amb_o', 'shar_o',
    # demographics and attitudes
    'age', 'field_cd', 'race', 'imprace', 'imprelig', 'date', 'go_out',
    # career_c is deliberately left out (missing a lot)
    # 17 interests
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
    'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
    # stated preferences (subjective preference), without attr1_1 - see note
    'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
    # self ratings (subjective measure)
    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1',
]
data = data[RELEVANT_COLUMNS]

In [8]:
# drop null rows
# (listwise deletion: a row is removed if any of the retained columns is missing)
data = data.dropna()

In [9]:
data.head(5)

Unnamed: 0,iid,gender,wave,pid,match,int_corr,samerace,attr_o,sinc_o,intel_o,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
0,1,0,1,11.0,0,0.14,0,6.0,8.0,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
1,1,0,1,12.0,0,0.54,0,7.0,8.0,10.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,1,0,1,13.0,1,0.16,1,10.0,10.0,10.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
3,1,0,1,14.0,1,0.61,0,7.0,8.0,9.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
4,1,0,1,15.0,1,0.21,0,8.0,7.0,9.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0


In [10]:
# profile data

In [11]:
# Print the retained columns with their new positions (used by the
# positional slices in the following cells).
for position_and_name in enumerate(data.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'gender')
(2, 'wave')
(3, 'pid')
(4, 'match')
(5, 'int_corr')
(6, 'samerace')
(7, 'attr_o')
(8, 'sinc_o')
(9, 'intel_o')
(10, 'fun_o')
(11, 'amb_o')
(12, 'shar_o')
(13, 'age')
(14, 'field_cd')
(15, 'race')
(16, 'imprace')
(17, 'imprelig')
(18, 'date')
(19, 'go_out')
(20, 'sports')
(21, 'tvsports')
(22, 'exercise')
(23, 'dining')
(24, 'museums')
(25, 'art')
(26, 'hiking')
(27, 'gaming')
(28, 'clubbing')
(29, 'reading')
(30, 'tv')
(31, 'theater')
(32, 'movies')
(33, 'concerts')
(34, 'music')
(35, 'shopping')
(36, 'yoga')
(37, 'sinc1_1')
(38, 'intel1_1')
(39, 'fun1_1')
(40, 'amb1_1')
(41, 'shar1_1')
(42, 'attr3_1')
(43, 'sinc3_1')
(44, 'fun3_1')
(45, 'intel3_1')
(46, 'amb3_1')


In [12]:
# Person-level table: skip the pair-level columns at positions 3..6
# (pid, match, int_corr, samerace).
person_head = data.iloc[:, 0:3]     # iid, gender, wave
person_traits = data.iloc[:, 7:47]  # attr_o ... amb3_1
profile = pd.concat([person_head, person_traits], axis=1)

iids = profile['iid'].unique()
print(len(iids)) # remaining number of people
print(iids)      # original iids

536
[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  29  30  31  32  33  34  35  36  37
  38  39  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56
  57  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76
  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94
  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112
 113 114 115 116 117 119 120 121 122 123 124 125 126 127 128 131 132 133
 134 135 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
 243 244 245 246 247 248 249 250 251 252 253 25

In [13]:
# Print the profile columns with their positions.
for position_and_name in enumerate(profile.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'gender')
(2, 'wave')
(3, 'attr_o')
(4, 'sinc_o')
(5, 'intel_o')
(6, 'fun_o')
(7, 'amb_o')
(8, 'shar_o')
(9, 'age')
(10, 'field_cd')
(11, 'race')
(12, 'imprace')
(13, 'imprelig')
(14, 'date')
(15, 'go_out')
(16, 'sports')
(17, 'tvsports')
(18, 'exercise')
(19, 'dining')
(20, 'museums')
(21, 'art')
(22, 'hiking')
(23, 'gaming')
(24, 'clubbing')
(25, 'reading')
(26, 'tv')
(27, 'theater')
(28, 'movies')
(29, 'concerts')
(30, 'music')
(31, 'shopping')
(32, 'yoga')
(33, 'sinc1_1')
(34, 'intel1_1')
(35, 'fun1_1')
(36, 'amb1_1')
(37, 'shar1_1')
(38, 'attr3_1')
(39, 'sinc3_1')
(40, 'fun3_1')
(41, 'intel3_1')
(42, 'amb3_1')


In [14]:
# profile data (subj + obj)
# Collapse to one row per person by averaging every column over that person's
# rows; 'iid' becomes the index of the result.
profile = profile.groupby('iid').mean()
profile.head(5)

Unnamed: 0_level_0,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,field_cd,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,6.7,7.4,8.0,7.2,8.0,7.1,21.0,1.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,0,1,7.7,7.1,7.9,7.5,7.5,6.5,24.0,1.0,...,5.0,25.0,20.0,0.0,5.0,7.0,5.0,10.0,8.0,3.0
3,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,6.0,25.0,2.0,...,10.0,35.0,10.0,10.0,0.0,8.0,9.0,8.0,9.0,8.0
4,0,1,7.0,7.1,7.7,7.5,7.7,7.2,23.0,1.0,...,20.0,20.0,20.0,10.0,10.0,7.0,8.0,9.0,7.0,8.0
5,0,1,5.3,7.7,7.6,7.2,7.8,6.2,21.0,1.0,...,5.0,25.0,25.0,10.0,15.0,6.0,3.0,6.0,10.0,8.0


In [15]:
# relabel iid (start from 0, consecutive number): new leading 'id' column
ids = range(len(profile))
profile.insert(0, 'id', ids)

In [16]:
# pair data
# One row per recorded meeting: rater (iid), partner (pid) and the match flag.
meet = data[['iid', 'pid', 'match']].copy()

# Keep only meetings whose partner still has a profile. `iids` comes from
# .unique(), so "exactly one hit" is plain membership; a vectorised isin()
# replaces the row-by-row drop, which was quadratic in the number of rows.
meet = meet[meet['pid'].isin(iids)]

# Every rater comes from the same rows that produced `iids`, so nothing should
# be removed here; the message (one per offending row) flags a broken invariant.
unknown_rater = ~meet['iid'].isin(iids)
for _ in range(int(unknown_rater.sum())):
    print("should not be here")
meet = meet[~unknown_rater].copy()

# Original id -> position in `iids` (0-based, consecutive). A single dict
# lookup per row replaces rebuilding and scanning list(iids) for every row.
# NOTE(review): positions follow the order of appearance in `iids`, whereas
# the profile 'id' column follows the sorted groupby order; the two agree only
# if the data is ordered by iid - confirm.
position_of = {int(original): position for position, original in enumerate(iids)}

pid = meet.pop('pid')
idys = [position_of[p] for p in pid]
iid = meet.pop('iid')
idxs = [position_of[p] for p in iid]
meet.insert(loc=0, column='pid', value=idys) # new pid
meet.insert(loc=0, column='iid', value=idxs) # new iid

In [17]:
# Wave value of each person, in profile row order.
wave = profile['wave'].values
print(wave)

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  5  5
  5  5  5  5  6  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  8  8  8
  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9
  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 12 12 12
 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
 12 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13

In [18]:
meet.head(5)

Unnamed: 0,iid,pid,match
0,0,10,0
1,0,11,0
2,0,12,1
3,0,13,1
4,0,14,1


In [19]:
# Call the index 'index' and move the consecutive 'id' values into a new
# leading 'iid' column, so `profile` shares its key name with `meet`.
profile.index.name = 'index'
val = profile.pop('id').values
profile.insert(0, 'iid', val)
profile.head(5)

Unnamed: 0_level_0,iid,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,6.7,7.4,8.0,7.2,8.0,7.1,21.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,1,0,1,7.7,7.1,7.9,7.5,7.5,6.5,24.0,...,5.0,25.0,20.0,0.0,5.0,7.0,5.0,10.0,8.0,3.0
3,2,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,6.0,25.0,...,10.0,35.0,10.0,10.0,0.0,8.0,9.0,8.0,9.0,8.0
4,3,0,1,7.0,7.1,7.7,7.5,7.7,7.2,23.0,...,20.0,20.0,20.0,10.0,10.0,7.0,8.0,9.0,7.0,8.0
5,4,0,1,5.3,7.7,7.6,7.2,7.8,6.2,21.0,...,5.0,25.0,25.0,10.0,15.0,6.0,3.0,6.0,10.0,8.0


In [20]:
# `left` is the same object as `meet` (no copy is made);
# `right` is an independent copy of `profile`.
left = meet
right = profile.copy()

In [21]:
profile.head()

Unnamed: 0_level_0,iid,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,6.7,7.4,8.0,7.2,8.0,7.1,21.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,1,0,1,7.7,7.1,7.9,7.5,7.5,6.5,24.0,...,5.0,25.0,20.0,0.0,5.0,7.0,5.0,10.0,8.0,3.0
3,2,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,6.0,25.0,...,10.0,35.0,10.0,10.0,0.0,8.0,9.0,8.0,9.0,8.0
4,3,0,1,7.0,7.1,7.7,7.5,7.7,7.2,23.0,...,20.0,20.0,20.0,10.0,10.0,7.0,8.0,9.0,7.0,8.0
5,4,0,1,5.3,7.7,7.6,7.2,7.8,6.2,21.0,...,5.0,25.0,25.0,10.0,15.0,6.0,3.0,6.0,10.0,8.0


In [22]:
# Attach the rater's profile to every meeting row (left join on the relabelled 'iid').
mid = pd.merge(left, right, on='iid', how='left')

In [23]:
mid.head()

Unnamed: 0,iid,pid,match,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
0,0,10,0,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
1,0,11,0,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,0,12,1,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
3,0,13,1,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
4,0,14,1,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0


In [24]:
# Rename the index of `right` to 'pid' and preview it.
# NOTE: `right` is rebuilt from `profile` in the next cell, so this rename does
# not carry over into the later merge on 'pid'.
right.index.names = ['pid']
right.head(5)

Unnamed: 0_level_0,iid,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,6.7,7.4,8.0,7.2,8.0,7.1,21.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,1,0,1,7.7,7.1,7.9,7.5,7.5,6.5,24.0,...,5.0,25.0,20.0,0.0,5.0,7.0,5.0,10.0,8.0,3.0
3,2,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,6.0,25.0,...,10.0,35.0,10.0,10.0,0.0,8.0,9.0,8.0,9.0,8.0
4,3,0,1,7.0,7.1,7.7,7.5,7.7,7.2,23.0,...,20.0,20.0,20.0,10.0,10.0,7.0,8.0,9.0,7.0,8.0
5,4,0,1,5.3,7.7,7.6,7.2,7.8,6.2,21.0,...,5.0,25.0,25.0,10.0,15.0,6.0,3.0,6.0,10.0,8.0


In [25]:
# Partner-side lookup table: a copy of the profile whose leading id column is
# called 'pid', so it can be joined on the partner id.
right = profile.rename(columns={'iid': 'pid'})
right.head(5)

Unnamed: 0_level_0,pid,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,shar_o,age,...,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,6.7,7.4,8.0,7.2,8.0,7.1,21.0,...,20.0,20.0,15.0,15.0,15.0,6.0,8.0,8.0,8.0,7.0
2,1,0,1,7.7,7.1,7.9,7.5,7.5,6.5,24.0,...,5.0,25.0,20.0,0.0,5.0,7.0,5.0,10.0,8.0,3.0
3,2,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,6.0,25.0,...,10.0,35.0,10.0,10.0,0.0,8.0,9.0,8.0,9.0,8.0
4,3,0,1,7.0,7.1,7.7,7.5,7.7,7.2,23.0,...,20.0,20.0,20.0,10.0,10.0,7.0,8.0,9.0,7.0,8.0
5,4,0,1,5.3,7.7,7.6,7.2,7.8,6.2,21.0,...,5.0,25.0,25.0,10.0,15.0,6.0,3.0,6.0,10.0,8.0


In [26]:
# profile pairs
# Add the partner's profile; overlapping columns get the _x (rater) and
# _y (partner) suffixes.
join = mid.merge(right, how='left', on='pid')
join.head(5)

Unnamed: 0,iid,pid,match,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,...,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y,shar1_1_y,attr3_1_y,sinc3_1_y,fun3_1_y,intel3_1_y,amb3_1_y
0,0,10,0,0,1,6.7,7.4,8.0,7.2,8.0,...,20.0,20.0,20.0,0.0,5.0,8.0,9.0,7.0,8.0,5.0
1,0,11,0,0,1,6.7,7.4,8.0,7.2,8.0,...,0.0,0.0,40.0,0.0,0.0,9.0,9.0,9.0,10.0,9.0
2,0,12,1,0,1,6.7,7.4,8.0,7.2,8.0,...,18.0,19.0,18.0,14.0,12.0,4.0,7.0,8.0,8.0,3.0
3,0,13,1,0,1,6.7,7.4,8.0,7.2,8.0,...,5.0,15.0,40.0,5.0,5.0,9.0,9.0,9.0,9.0,9.0
4,0,14,1,0,1,6.7,7.4,8.0,7.2,8.0,...,10.0,20.0,10.0,10.0,20.0,7.0,7.0,7.0,9.0,9.0


In [27]:
join.to_csv('../data/pair.csv',index=False)

In [28]:
# Print the joined columns with their positions (used by the positional
# slices that build the subjective / objective pair tables).
for position_and_name in enumerate(join.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'pid')
(2, 'match')
(3, 'gender_x')
(4, 'wave_x')
(5, 'attr_o_x')
(6, 'sinc_o_x')
(7, 'intel_o_x')
(8, 'fun_o_x')
(9, 'amb_o_x')
(10, 'shar_o_x')
(11, 'age_x')
(12, 'field_cd_x')
(13, 'race_x')
(14, 'imprace_x')
(15, 'imprelig_x')
(16, 'date_x')
(17, 'go_out_x')
(18, 'sports_x')
(19, 'tvsports_x')
(20, 'exercise_x')
(21, 'dining_x')
(22, 'museums_x')
(23, 'art_x')
(24, 'hiking_x')
(25, 'gaming_x')
(26, 'clubbing_x')
(27, 'reading_x')
(28, 'tv_x')
(29, 'theater_x')
(30, 'movies_x')
(31, 'concerts_x')
(32, 'music_x')
(33, 'shopping_x')
(34, 'yoga_x')
(35, 'sinc1_1_x')
(36, 'intel1_1_x')
(37, 'fun1_1_x')
(38, 'amb1_1_x')
(39, 'shar1_1_x')
(40, 'attr3_1_x')
(41, 'sinc3_1_x')
(42, 'fun3_1_x')
(43, 'intel3_1_x')
(44, 'amb3_1_x')
(45, 'gender_y')
(46, 'wave_y')
(47, 'attr_o_y')
(48, 'sinc_o_y')
(49, 'intel_o_y')
(50, 'fun_o_y')
(51, 'amb_o_y')
(52, 'shar_o_y')
(53, 'age_y')
(54, 'field_cd_y')
(55, 'race_y')
(56, 'imprace_y')
(57, 'imprelig_y')
(58, 'date_y')
(59, 'go_out_y')
(6

In [29]:
# Replace any remaining NaN with 0 (presumably unmatched rows from the left joins - confirm).
# Note: pair.csv was written before this fill, so that file is not affected by it.
join = join.fillna(0)

In [30]:
# ===============  subjective profile pair ==============
pair_head = join.iloc[:, 0:5]          # iid, pid, match, gender_x, wave_x
rater_block = join.iloc[:, 11:47]      # age_x ... amb3_1_x, then gender_y, wave_y
partner_block = join.iloc[:, 53:87]    # age_y ... amb3_1_y
subj = pd.concat([pair_head, rater_block, partner_block], axis=1)

# pair-level features: signed age gap and a 0/1 same-race flag
subj['age_diff'] = subj['age_x'] - subj['age_y']
subj['samerace'] = (subj['race_x'] == subj['race_y']).astype(int)

# move the target to the last column
subj['match'] = subj.pop('match')
subj.to_csv('../data/s_pair.csv',index=False)

In [31]:
# Print the columns of the subjective pair table with their positions.
for position_and_name in enumerate(subj.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'pid')
(2, 'gender_x')
(3, 'wave_x')
(4, 'age_x')
(5, 'field_cd_x')
(6, 'race_x')
(7, 'imprace_x')
(8, 'imprelig_x')
(9, 'date_x')
(10, 'go_out_x')
(11, 'sports_x')
(12, 'tvsports_x')
(13, 'exercise_x')
(14, 'dining_x')
(15, 'museums_x')
(16, 'art_x')
(17, 'hiking_x')
(18, 'gaming_x')
(19, 'clubbing_x')
(20, 'reading_x')
(21, 'tv_x')
(22, 'theater_x')
(23, 'movies_x')
(24, 'concerts_x')
(25, 'music_x')
(26, 'shopping_x')
(27, 'yoga_x')
(28, 'sinc1_1_x')
(29, 'intel1_1_x')
(30, 'fun1_1_x')
(31, 'amb1_1_x')
(32, 'shar1_1_x')
(33, 'attr3_1_x')
(34, 'sinc3_1_x')
(35, 'fun3_1_x')
(36, 'intel3_1_x')
(37, 'amb3_1_x')
(38, 'gender_y')
(39, 'wave_y')
(40, 'age_y')
(41, 'field_cd_y')
(42, 'race_y')
(43, 'imprace_y')
(44, 'imprelig_y')
(45, 'date_y')
(46, 'go_out_y')
(47, 'sports_y')
(48, 'tvsports_y')
(49, 'exercise_y')
(50, 'dining_y')
(51, 'museums_y')
(52, 'art_y')
(53, 'hiking_y')
(54, 'gaming_y')
(55, 'clubbing_y')
(56, 'reading_y')
(57, 'tv_y')
(58, 'theater_y')
(59, 'movies

In [32]:
# ======================  objective profile pair ==============
pair_and_ratings_x = join.iloc[:, 0:10]   # iid, pid, match, gender_x, wave_x, attr_o_x ... amb_o_x
rater_block = join.iloc[:, 11:40]         # age_x ... shar1_1_x
ratings_y = join.iloc[:, 45:52]           # gender_y, wave_y, attr_o_y ... amb_o_y
partner_block = join.iloc[:, 53:82]       # age_y ... shar1_1_y
obj = pd.concat([pair_and_ratings_x, rater_block, ratings_y, partner_block], axis=1)

# pair-level features: signed age gap and a 0/1 same-race flag
obj['age_diff'] = obj['age_x'] - obj['age_y']
obj['samerace'] = (obj['race_x'] == obj['race_y']).astype(int)

# move the target to the last column
obj['match'] = obj.pop('match')
obj.to_csv('../data/o_pair.csv',index=False)

In [33]:
# Print the columns of the objective pair table with their positions.
for position_and_name in enumerate(obj.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'pid')
(2, 'gender_x')
(3, 'wave_x')
(4, 'attr_o_x')
(5, 'sinc_o_x')
(6, 'intel_o_x')
(7, 'fun_o_x')
(8, 'amb_o_x')
(9, 'age_x')
(10, 'field_cd_x')
(11, 'race_x')
(12, 'imprace_x')
(13, 'imprelig_x')
(14, 'date_x')
(15, 'go_out_x')
(16, 'sports_x')
(17, 'tvsports_x')
(18, 'exercise_x')
(19, 'dining_x')
(20, 'museums_x')
(21, 'art_x')
(22, 'hiking_x')
(23, 'gaming_x')
(24, 'clubbing_x')
(25, 'reading_x')
(26, 'tv_x')
(27, 'theater_x')
(28, 'movies_x')
(29, 'concerts_x')
(30, 'music_x')
(31, 'shopping_x')
(32, 'yoga_x')
(33, 'sinc1_1_x')
(34, 'intel1_1_x')
(35, 'fun1_1_x')
(36, 'amb1_1_x')
(37, 'shar1_1_x')
(38, 'gender_y')
(39, 'wave_y')
(40, 'attr_o_y')
(41, 'sinc_o_y')
(42, 'intel_o_y')
(43, 'fun_o_y')
(44, 'amb_o_y')
(45, 'age_y')
(46, 'field_cd_y')
(47, 'race_y')
(48, 'imprace_y')
(49, 'imprelig_y')
(50, 'date_y')
(51, 'go_out_y')
(52, 'sports_y')
(53, 'tvsports_y')
(54, 'exercise_y')
(55, 'dining_y')
(56, 'museums_y')
(57, 'art_y')
(58, 'hiking_y')
(59, 'gaming_y')
(

In [34]:
obj.head(5)

Unnamed: 0,iid,pid,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,age_x,...,shopping_y,yoga_y,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y,shar1_1_y,age_diff,samerace,match
0,0,10,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,5.0,1.0,20.0,20.0,20.0,0.0,5.0,-6.0,0,0
1,0,11,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,5.0,5.0,0.0,0.0,40.0,0.0,0.0,-1.0,0,0
2,0,12,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,8.0,1.0,18.0,19.0,18.0,14.0,12.0,-1.0,1,1
3,0,13,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,6.0,1.0,5.0,15.0,40.0,5.0,5.0,-2.0,0,1
4,0,14,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,2.0,1.0,10.0,20.0,10.0,10.0,20.0,-3.0,0,1


In [35]:
# Print the profile columns with their positions before trimming the table.
for position_and_name in enumerate(profile.keys()):
    print(position_and_name)

(0, 'iid')
(1, 'gender')
(2, 'wave')
(3, 'attr_o')
(4, 'sinc_o')
(5, 'intel_o')
(6, 'fun_o')
(7, 'amb_o')
(8, 'shar_o')
(9, 'age')
(10, 'field_cd')
(11, 'race')
(12, 'imprace')
(13, 'imprelig')
(14, 'date')
(15, 'go_out')
(16, 'sports')
(17, 'tvsports')
(18, 'exercise')
(19, 'dining')
(20, 'museums')
(21, 'art')
(22, 'hiking')
(23, 'gaming')
(24, 'clubbing')
(25, 'reading')
(26, 'tv')
(27, 'theater')
(28, 'movies')
(29, 'concerts')
(30, 'music')
(31, 'shopping')
(32, 'yoga')
(33, 'sinc1_1')
(34, 'intel1_1')
(35, 'fun1_1')
(36, 'amb1_1')
(37, 'shar1_1')
(38, 'attr3_1')
(39, 'sinc3_1')
(40, 'fun3_1')
(41, 'intel3_1')
(42, 'amb3_1')


In [36]:
# obj profile
# Keep columns 0..36 (iid ... amb1_1), then remove shar_o.
# NOTE(review): the 0:37 slice stops before shar1_1 (position 37), while the
# objective pair table written to o_pair.csv keeps shar1_1_x / shar1_1_y.
# Confirm whether shar1_1 should be kept here as well (0:38).
profile = profile.iloc[:, 0:37].copy()

profile.pop('shar_o')

index
1      7.100000
2      6.500000
3      6.000000
4      7.200000
5      6.200000
6      6.300000
7      6.700000
8      6.777778
9      6.300000
10     5.625000
11     5.222222
12     6.111111
13     5.888889
14     7.444444
15     6.666667
16     4.666667
17     6.666667
18     5.555556
19     7.700000
20     7.300000
21     5.000000
22     4.181818
23     4.923077
24     5.461538
25     4.833333
26     6.000000
27     5.666667
29     4.692308
30     5.090909
31     5.818182
32     5.428571
33     5.714286
34     5.214286
35     5.153846
36     5.230769
37     5.181818
38     6.000000
39     5.357143
41     3.500000
42     4.928571
43     4.000000
44     4.176471
45     4.117647
46     5.076923
47     4.705882
48     5.176471
49     6.066667
50     5.866667
51     4.250000
52     4.687500
53     5.214286
54     4.750000
55     5.437500
56     4.888889
57     4.666667
60     4.666667
61     4.222222
62     6.375000
63     4.625000
64     4.700000
65     5.700000
66     4.571429
67

In [37]:
# Add a leading 'id' column holding consecutive row numbers.
ids = range(len(profile))
profile.insert(0, 'id', ids)
print(ids)

range(0, 536)


In [38]:
profile.head(5)

Unnamed: 0_level_0,id,iid,gender,wave,attr_o,sinc_o,intel_o,fun_o,amb_o,age,...,theater,movies,concerts,music,shopping,yoga,sinc1_1,intel1_1,fun1_1,amb1_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,1.0,10.0,10.0,9.0,8.0,1.0,20.0,20.0,15.0,15.0
2,1,1,0,1,7.7,7.1,7.9,7.5,7.5,24.0,...,9.0,8.0,7.0,8.0,3.0,1.0,5.0,25.0,20.0,0.0
3,2,2,0,1,6.555556,6.777778,7.222222,6.222222,7.111111,25.0,...,7.0,7.0,7.0,5.0,8.0,7.0,10.0,35.0,10.0,10.0
4,3,3,0,1,7.0,7.1,7.7,7.5,7.7,23.0,...,9.0,7.0,8.0,7.0,1.0,8.0,20.0,20.0,20.0,10.0
5,4,4,0,1,5.3,7.7,7.6,7.2,7.8,21.0,...,6.0,6.0,3.0,7.0,8.0,3.0,5.0,25.0,25.0,10.0


In [39]:
# Two independent copies of the profile table: the two sides of the
# cartesian product built in the next cell.
df1 = profile.copy()
df2 = profile.copy()

In [40]:
# cartesian product (profile, profile)
# Both sides get the same constant helper column; merging on it pairs every
# row of df1 with every row of df2. The helper is dropped afterwards.
left_side = df1.assign(key=1)
right_side = df2.assign(key=1)
car = left_side.merge(right_side, on="key").drop(columns="key")
car.head(5)

Unnamed: 0,id_x,iid_x,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,age_x,...,theater_y,movies_y,concerts_y,music_y,shopping_y,yoga_y,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y
0,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,1.0,10.0,10.0,9.0,8.0,1.0,20.0,20.0,15.0,15.0
1,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,9.0,8.0,7.0,8.0,3.0,1.0,5.0,25.0,20.0,0.0
2,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,7.0,7.0,7.0,5.0,8.0,7.0,10.0,35.0,10.0,10.0
3,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,9.0,7.0,8.0,7.0,1.0,8.0,20.0,20.0,20.0,10.0
4,0,0,0,1,6.7,7.4,8.0,7.2,8.0,21.0,...,6.0,6.0,3.0,7.0,8.0,3.0,5.0,25.0,25.0,10.0


In [41]:
# Move the two 'id' columns to the front as 'iid' / 'pid'.
# (The iid_x / iid_y columns are left in the table.)
iid = car.pop('id_x').values
pid = car.pop('id_y').values
car.insert(0, 'pid', pid)
car.insert(0, 'iid', iid)


In [42]:
car.head(5)

Unnamed: 0,iid,pid,iid_x,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,...,theater_y,movies_y,concerts_y,music_y,shopping_y,yoga_y,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y
0,0,0,0,0,1,6.7,7.4,8.0,7.2,8.0,...,1.0,10.0,10.0,9.0,8.0,1.0,20.0,20.0,15.0,15.0
1,0,1,0,0,1,6.7,7.4,8.0,7.2,8.0,...,9.0,8.0,7.0,8.0,3.0,1.0,5.0,25.0,20.0,0.0
2,0,2,0,0,1,6.7,7.4,8.0,7.2,8.0,...,7.0,7.0,7.0,5.0,8.0,7.0,10.0,35.0,10.0,10.0
3,0,3,0,0,1,6.7,7.4,8.0,7.2,8.0,...,9.0,7.0,8.0,7.0,1.0,8.0,20.0,20.0,20.0,10.0
4,0,4,0,0,1,6.7,7.4,8.0,7.2,8.0,...,6.0,6.0,3.0,7.0,8.0,3.0,5.0,25.0,25.0,10.0


In [43]:
# Pair-level features: signed age gap (x minus y) and a 0/1 same-race flag.
car['age_diff'] = car['age_x'] - car['age_y']
car['samerace'] = (car['race_x'] == car['race_y']).astype(int)

In [44]:
car.head(5)

Unnamed: 0,iid,pid,iid_x,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,...,concerts_y,music_y,shopping_y,yoga_y,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y,age_diff,samerace
0,0,0,0,0,1,6.7,7.4,8.0,7.2,8.0,...,10.0,9.0,8.0,1.0,20.0,20.0,15.0,15.0,0.0,1
1,0,1,0,0,1,6.7,7.4,8.0,7.2,8.0,...,7.0,8.0,3.0,1.0,5.0,25.0,20.0,0.0,-3.0,0
2,0,2,0,0,1,6.7,7.4,8.0,7.2,8.0,...,7.0,5.0,8.0,7.0,10.0,35.0,10.0,10.0,-4.0,0
3,0,3,0,0,1,6.7,7.4,8.0,7.2,8.0,...,8.0,7.0,1.0,8.0,20.0,20.0,20.0,10.0,-2.0,0
4,0,4,0,0,1,6.7,7.4,8.0,7.2,8.0,...,3.0,7.0,8.0,3.0,5.0,25.0,25.0,10.0,0.0,0


In [45]:
len(car)

287296

In [46]:
car.to_csv('../data/allpair.csv',index=False)