In [15]:
import numpy as np
import pandas as pd

In [16]:
# Running tallies: cells altered by remove_quotes() and by to_lower().
quotes_changed, lowered_case = 0, 0
# Column name -> {category value: integer code}; filled by perform_label_encoding().
label_encoding = dict()

def perform_label_encoding(column):
    """Integer-encode one column and record the mapping that was used.

    The column is converted to pandas' ``category`` dtype. The mapping
    ``{category value: integer code}`` is stored in the module-level
    ``label_encoding`` dict under the column's name, replacing any earlier
    entry for that name.

    Parameters
    ----------
    column : pandas.Series
        Column to encode; ``column.name`` is used as the registry key.

    Returns
    -------
    pandas.Series
        The category codes of ``column`` (``column.cat.codes``).
    """
    as_category = column.astype('category')
    # The position of a value in `.cat.categories` is its integer code.
    label_encoding[as_category.name] = {
        value: code for code, value in enumerate(as_category.cat.categories)
    }
    return as_category.cat.codes

def remove_quotes(x):
    """Return ``x`` with every single-quote character removed.

    Each call whose input contains at least one ``'`` adds one to the
    module-level ``quotes_changed`` counter; inputs without a quote are
    returned unchanged and are not counted.

    Parameters
    ----------
    x : str
        Cell value to clean.

    Returns
    -------
    str
        ``x`` without single quotes.
    """
    global quotes_changed
    if "'" not in x:
        return x
    quotes_changed += 1
    return x.replace("'", "")

def to_lower(x):
    """Return ``x`` converted to lower case, counting cells that really change.

    The module-level ``lowered_case`` counter is incremented only when the
    lower-cased text differs from the input. Testing ``x.islower()`` instead
    would also count strings that contain no cased characters at all (for
    example ``"123"`` or ``""``), because ``str.islower`` is ``False`` for
    them even though ``lower()`` leaves them unchanged.

    Parameters
    ----------
    x : str
        Cell value to standardise.

    Returns
    -------
    str
        ``x.lower()``.
    """
    global lowered_case
    lowered = x.lower()
    if lowered != x:
        # Only a real modification counts as a "standardised" cell.
        lowered_case += 1
    return lowered



In [17]:
# Load the raw data and keep a handle on the target column so it can be
# re-attached as the last column once preprocessing is done.
data = pd.read_csv('dating-full.csv')
decision_col = data['decision']

# Strip single quotes from the text columns. The columns are processed in the
# order race, race_o, field, one cell at a time, so quotes_changed is
# accumulated once per affected cell.
for text_col in ['race', 'race_o', 'field']:
    data[text_col] = data[text_col].apply(remove_quotes)

# Lower-case the 'field' column (to_lower keeps the lowered_case tally).
data['field'] = data['field'].apply(to_lower)

# print('Quotes removed from '+str(quotes_changed) + ' cells.')
# print('Standardized '+str(lowered_case) + ' cells to lower case.')


# Replace the categorical columns by their integer category codes; the
# value -> code mappings are recorded in label_encoding.
data[['race', 'race_o', 'gender', 'field']] = (
    data[['race', 'race_o', 'gender', 'field']].apply(perform_label_encoding)
)


# Normalise the two groups of preference scores: within each group every
# row is divided by that row's sum over the group's six columns.
columns1 = [
    'attractive_important', 'sincere_important', 'intelligence_important',
    'funny_important', 'ambition_important', 'shared_interests_important',
]
columns2 = [
    'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
    'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
]
for pref_cols in (columns1, columns2):
    data[pref_cols] = data[pref_cols].div(data[pref_cols].sum(axis=1), axis=0)


# Move the target column to the end: drop it, then append the saved copy.
data = data.drop(columns=['decision'])
data['decision'] = decision_col

# Saving the preprocessed frame is currently disabled.
# data.to_csv('dating.csv', index = False)

In [18]:
# Work on an independent copy of the first 6500 rows, without the
# label-encoded 'race', 'race_o' and 'field' columns.
df = data.iloc[:6500].drop(columns=['race', 'race_o', 'field']).copy()
df

Unnamed: 0,gender,age,age_o,samerace,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,...,theater,movies,concerts,music,shopping,yoga,interests_correlate,expected_happy_with_sd_people,like,decision
0,0,21,27,0,2,4,0.350000,0.200000,0.200000,0.200000,...,1,10,10,9,8,1,0.14,3,7.0,1
1,0,21,22,0,2,4,0.600000,0.000000,0.000000,0.400000,...,1,10,10,9,8,1,0.54,3,7.0,1
2,0,21,23,0,2,4,0.300000,0.050000,0.150000,0.400000,...,1,10,10,9,8,1,0.61,3,7.0,1
3,0,21,24,0,2,4,0.300000,0.100000,0.200000,0.100000,...,1,10,10,9,8,1,0.21,3,6.0,1
4,0,21,25,0,2,4,0.500000,0.000000,0.300000,0.100000,...,1,10,10,9,8,1,0.25,3,6.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,1,22,23,1,6,6,0.200000,0.200000,0.200000,0.150000,...,7,9,7,7,7,6,0.34,7,7.0,0
6496,1,22,23,1,6,6,0.200000,0.200000,0.200000,0.150000,...,7,9,7,7,7,6,0.27,7,7.0,1
6497,1,22,27,0,6,6,0.580000,0.050000,0.080000,0.100000,...,7,9,7,7,7,6,0.12,7,8.0,1
6498,1,22,23,1,6,6,0.181818,0.227273,0.227273,0.272727,...,7,9,7,7,7,6,0.02,7,6.0,0


In [19]:
# Every column of df except the listed ones, in df's own column order
# (sort=False); these are the columns that discretize() bins.
cols_continuous = df.columns.difference(
    ['gender', 'samerace', 'decision'],
    sort=False,
)

# Columns that printbin() bins over the range 0.0 - 1.0.
cols_spec1 = [
    'pref_o_attractive',
    'pref_o_sincere',
    'pref_o_intelligence',
    'pref_o_funny',
    'pref_o_ambitious',
    'pref_o_shared_interests',
    'attractive_important',
    'sincere_important',
    'intelligence_important',
    'funny_important',
    'ambition_important',
    'shared_interests_important',
]

# Columns that printbin() bins over the range 18.0 - 58.0.
cols_spec2 = ['age', 'age_o']

def printbin(df, col, binsize):
    """Replace ``df[col]`` in place by equal-width bin labels ``0 .. binsize-1``.

    Despite its name this function prints nothing; it discretises one column.
    The value range used for binning depends on the column:

    * columns listed in ``cols_spec1``  -> [0.0, 1.0]
    * columns listed in ``cols_spec2``  -> [18.0, 58.0]
    * ``'interests_correlate'``         -> [-1.0, 1.0]
    * any other column                  -> [0.0, 10.0]

    Values outside the range are clamped to the nearest end of the range
    before binning, so they fall into the first or last bin instead of
    silently becoming NaN. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[col]`` is overwritten with a categorical column.
    col : str
        Name of the numeric column to discretise.
    binsize : int
        Number of equal-width bins (must be at least 1).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``binsize`` is smaller than 1.
    """
    if binsize < 1:
        raise ValueError("binsize must be a positive integer")

    # Pick the value range for this column.
    if col in cols_spec1:
        binmin, binmax = 0.0, 1.0
    elif col in cols_spec2:
        binmin, binmax = 18.0, 58.0
    elif col == 'interests_correlate':
        binmin, binmax = -1.0, 1.0
    else:
        binmin, binmax = 0.0, 10.0

    # Clamp out-of-range values so pd.cut never drops them as NaN.
    clamped = df[col].clip(lower=binmin, upper=binmax)

    # binsize equal-width bins need binsize + 1 edges; include_lowest makes
    # the first bin closed on the left so that binmin itself lands in bin 0.
    thresholds = np.linspace(binmin, binmax, binsize + 1)
    df[col] = pd.cut(clamped, bins=thresholds, labels=range(binsize), include_lowest=True)

    
def discretize(df, binsize):
    """Bin every column named in the module-level ``cols_continuous``.

    Calls ``printbin(df, column, binsize)`` once per column, in the order of
    ``cols_continuous``; ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are discretised.
    binsize : int
        Number of bins passed through to ``printbin``.
    """
    for column_name in cols_continuous:
        printbin(df, column_name, binsize)
        

# Discretise every column in cols_continuous into 2 bins (df is modified in place).
discretize(df, binsize=2)

In [21]:
# Test set: a random 20% sample of the rows, reproducible via the fixed seed.
df_1 = df.sample(frac=0.2, random_state=47)

# Training set: the rows of df whose index labels were not sampled.
df_2 = df.drop(index=df_1.index)

# Write both splits without the index column.
df_1.to_csv('testSet.csv', index=False)
df_2.to_csv('trainingSet.csv', index=False)

Unnamed: 0,gender,age,age_o,samerace,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,...,theater,movies,concerts,music,shopping,yoga,interests_correlate,expected_happy_with_sd_people,like,decision
2512,1,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,1,1,1
1537,0,0,0,1,1,1,0,0,0,0,...,1,1,0,1,1,0,1,0,1,0
5618,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,0,1,0,0,0,0
3157,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,1,1,0,0,0
6244,0,0,0,0,1,1,0,0,0,0,...,1,1,0,0,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
59,1,0,0,1,1,0,0,0,0,0,...,0,1,1,1,0,0,1,1,1,0
6070,0,0,0,1,0,0,0,0,0,0,...,1,1,1,1,1,1,1,0,1,1
3813,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,0,1,0
