In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('data/speed_dating_data.csv', encoding = "ISO-8859-1")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(174), int64(13), object(8)
memory usage: 12.5+ MB


In [4]:
df1 = df.dropna()

In [5]:
df1

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3


Drop all male records to eliminate duplicates in interaction records

In [6]:
df = df[df.gender == 0]

# Exploration

## Verify: match iff dec & dec_o

In [7]:
# test_df = pd.concat([
#     df.match,
#     df.dec,
#     df.dec_o
# ], axis=1)

# test_df['expected'] = test_df.apply(lambda row: row.dec&row.dec_o == row.match, axis=1)
# assert test_df.expected.sum()==len(test_df.index)

## Use self evaluation or perception by others as proxy?

In [8]:
# test_df = pd.concat([
#     df.iid, df.pid,
#     df.loc[:, 'attr3_1':'amb3_1'],  # How do you think you measure up?
#     df.loc[:, 'attr5_1':'amb5_1'],  # How do others perceive you?
# ], axis=1)

# test_df.info()

Too many missing values for "How do others perceive you" results.

# Data Cleaning / Feature Engineering

In [9]:
X = pd.concat([
    df.iid, df.partner, df.pid, # id of self and partner, to be removed later
    df.gender,
    df.age, df.age_o,
    df.int_corr, # correlation of interests
    df.samerace,
    df.goal,
    df.date,
    df.exphappy, # expected happiness with people you will meet
    df.loc[:, 'attr3_1':'amb3_1'],  # self evaluation
    df.loc[:, 'attr':'shar'],  # evaluation of partner
    df.loc[:, 'attr1_1':'shar1_1'],  # what's important to you, sum to 100 
    df.loc[:, 'pf_o_att':'pf_o_sha'], # what's important to partner, sum to 100
], axis=1)

y = pd.concat([
    df.match,  # Label for two-way prediction, whether two people will be a good match
    df.dec_o  # Label for one way prediction, whether your partner will say "yes"
], axis=1)

In [42]:
X1 = X.dropna()
len(X1)

6988

In [11]:
# assert y.match.count() == len(X.index)
# assert y.dec_o.count() == len(X.index)

In [12]:
print("Baseline accuracy for match: %f" % (1 - df.match.sum()/df.match.count()))
print("Baseline accuracy for decision: %f" % (1 - df.dec_o.sum()/df.dec_o.count()))

Baseline accuracy for match: 0.835086
Baseline accuracy for decision: 0.525335


In [13]:
df['1_1total'] = df.loc[:, 'attr1_1':'shar1_1'].sum(axis=1)

### pid (partner's unique ID)

In [14]:
# X[X.pid.isnull()]  # The missing pid comes from same person in one night

In [15]:
# # Assign a new pid to the person missing
# X.pid.fillna(X.pid.max() + 1, inplace=True)
# assert X.pid.isna().sum() == 0

### importance ratings: fill with 100/6 

In [16]:
# X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum()

In [17]:
# X.attr1_1.fillna(100/6, inplace=True)
# X.sinc1_1.fillna(100/6, inplace=True)
# X.intel1_1.fillna(100/6, inplace=True)
# X.fun1_1.fillna(100/6, inplace=True)
# X.amb1_1.fillna(100/6, inplace=True)
# X.shar1_1.fillna(100/6, inplace=True)
# X.pf_o_att.fillna(100/6, inplace=True)
# X.pf_o_sin.fillna(100/6, inplace=True)
# X.pf_o_int.fillna(100/6, inplace=True)
# X.pf_o_fun.fillna(100/6, inplace=True)
# X.pf_o_amb.fillna(100/6, inplace=True)
# X.pf_o_sha.fillna(100/6, inplace=True)

In [18]:
# assert X.loc[:, 'attr1_1':'pf_o_sha'].isna().sum().sum() == 0

In [19]:
# X.date.isna().sum()

### goal: convert to indicator variables

In [20]:
one_hot = pd.get_dummies(X.goal, 
                         prefix='goal', 
                         dummy_na=True, 
                         drop_first=False)

X = X.drop('goal', axis=1)
X = X.join(one_hot)

In [21]:
##
X.fillna(X.median(), inplace=True)
y = y.drop(columns=['dec_o']).values.ravel()
X, y = SMOTE().fit_sample(X, y)

### Train-Test Split

In [22]:
X.drop(columns=['iid', 'partner', 'pid'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    stratify=y)

### age: fill with median among gender

In [23]:
# X_train.age.isnull().sum()

In [24]:
# # fill missing age values with median age among gender
# X_train['age'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))
# X_train['age_o'] = X_train.groupby('gender').transform(lambda group: group.fillna(group.median()))

# X_test['age'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))
# X_test['age_o'] = X_test.groupby('gender').transform(lambda group: group.fillna(group.median()))

In [25]:
# assert X_train.age.isna().sum() == 0
# assert X_train.age_o.isna().sum() == 0

# assert X_test.age.isna().sum() == 0
# assert X_test.age_o.isna().sum() == 0

### int_corr, date, exphappy, self evaluation: fill with median

In [26]:
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [27]:
# # imp = SimpleImputer(missing_values=np.nan, strategy='mean') # .40 best
# # imp = IterativeImputer(max_iter=10) # .50 best
# imp = SimpleImputer(missing_values=np.nan, strategy='median') #.44
# X_train_i = pd.DataFrame(imp.fit_transform(X_train))
# X_train_i.columns = X_train.columns
# X_train_i.index = X_train.index

In [28]:
# # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# # imp = IterativeImputer(max_iter=10)
# imp = SimpleImputer(missing_values=np.nan, strategy='median')
# X_test_i = pd.DataFrame(imp.fit_transform(X_test))
# X_test_i.columns = X_test.columns
# X_test_i.index = X_test.index

In [41]:
# X_train

In [30]:
assert X_train.isna().sum().sum() == 0
assert X_test.isna().sum().sum() == 0

In [31]:
X_train.to_csv('data/X_train.csv', index=False)

In [32]:
X_test.to_csv('data/X_test.csv', index=False)

In [39]:
##
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [38]:
y_train.to_csv('data/y_train.csv', index=False)

In [40]:
y_test.to_csv('data/y_test.csv', index=False)