In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE

In [2]:
def binary_counter(arr):
	'''
	Count occurrences of each class in a sequence of binary labels.

	@param arr iterable of labels; each one is converted with int() and must be 0 or 1
	@return counter = {class: count}, always holding both keys 0 and 1
	@raise ValueError if a label converts to anything other than 0 or 1
	'''
	bc = [0, 0]
	for a in arr:
		label = int(a)
		# Reject out-of-range labels explicitly: indexing bc[label] directly would let
		# -1 / -2 wrap around (negative indexing) and be silently counted as a class.
		if label not in (0, 1):
			raise ValueError("binary_counter expects labels 0 or 1, got %r" % (a,))
		bc[label] += 1
	return {0: bc[0], 1: bc[1]}

In [3]:
# Switch for the SMOTE oversampling cell: when True, x / y are resampled
# before the train/test split is made.
smote = True

In [4]:
# Load the pair table used for training; the file is decoded as ISO-8859-1.
data = pd.read_csv('../data/o_pair.csv', encoding="ISO-8859-1")
# Zero-filling of missing values is disabled; the frame is used as read.
#data = data.fillna(0)

In [5]:
# Print every column of `data` together with its position as an (index, name) tuple.
# enumerate() yields the same tuples as zipping a hand-built range with the keys,
# without leaving an exhausted zip iterator behind.
key = data.keys()
for i in enumerate(key):
	print(i)

(0, 'iid')
(1, 'pid')
(2, 'gender_x')
(3, 'wave_x')
(4, 'attr_o_x')
(5, 'sinc_o_x')
(6, 'intel_o_x')
(7, 'fun_o_x')
(8, 'amb_o_x')
(9, 'age_x')
(10, 'field_cd_x')
(11, 'race_x')
(12, 'imprace_x')
(13, 'imprelig_x')
(14, 'date_x')
(15, 'go_out_x')
(16, 'sports_x')
(17, 'tvsports_x')
(18, 'exercise_x')
(19, 'dining_x')
(20, 'museums_x')
(21, 'art_x')
(22, 'hiking_x')
(23, 'gaming_x')
(24, 'clubbing_x')
(25, 'reading_x')
(26, 'tv_x')
(27, 'theater_x')
(28, 'movies_x')
(29, 'concerts_x')
(30, 'music_x')
(31, 'shopping_x')
(32, 'yoga_x')
(33, 'sinc1_1_x')
(34, 'intel1_1_x')
(35, 'fun1_1_x')
(36, 'amb1_1_x')
(37, 'shar1_1_x')
(38, 'gender_y')
(39, 'wave_y')
(40, 'attr_o_y')
(41, 'sinc_o_y')
(42, 'intel_o_y')
(43, 'fun_o_y')
(44, 'amb_o_y')
(45, 'age_y')
(46, 'field_cd_y')
(47, 'race_y')
(48, 'imprace_y')
(49, 'imprelig_y')
(50, 'date_y')
(51, 'go_out_y')
(52, 'sports_y')
(53, 'tvsports_y')
(54, 'exercise_y')
(55, 'dining_y')
(56, 'museums_y')
(57, 'art_y')
(58, 'hiking_y')
(59, 'gaming_y')
(

In [6]:
# =================== Model Data ======================
# Detach the label plus the id / wave columns from `data`. pop() removes each
# column in place, so only the remaining columns are left in `data` afterwards.
y, iid, pid, wave_x, wave_y = (
	data.pop(column) for column in ('match', 'iid', 'pid', 'wave_x', 'wave_y')
)

In [7]:
# Assemble the edge list from the popped columns (ids, waves and the `match`
# column) and write it to disk without the index.
edge_parts = [iid, pid, wave_x, wave_y, y]
edge_names = ['src', 'dst', 'wave_x', 'wave_y', 'match']
ori_edges = pd.concat(edge_parts, axis=1, keys=edge_names)
ori_edges.to_csv('../data/ori_edges.csv', index=False)

In [8]:
# Turn the remaining columns of `data` and the label Series into float64 NumPy arrays.
# `y` is rebound from a Series to an ndarray here, so this cell can only run once
# per load of the data.
x = data.values.astype(np.float64)
y = y.values.astype(np.float64)

In [9]:
# ======================== SMOTE Oversampling ========================
# Balance the two classes by generating synthetic minority samples.
# NOTE(review): this runs on the full x / y before the train/test split in a later
# cell, so the held-out set also contains synthetic rows and the training set can
# contain points interpolated from rows that end up in the test set. Consider
# splitting first and resampling only the training part.
if smote:
	print("[INFO] SMOTE Oversampling")
	print("Original Dataset: ", binary_counter(y))	# count of +ve and -ve labels
	sm = SMOTE(random_state = 209)
	# fit_resample is the supported name; the old fit_sample alias was deprecated
	# and is no longer available in later imbalanced-learn releases.
	x, y = sm.fit_resample(x, y)
	print("SMOTE Resampled Dataset: ", binary_counter(y))

[INFO] SMOTE Oversampling
Original Dataset:  {0: 5536, 1: 1150}
SMOTE Resampled Dataset:  {0: 5536, 1: 5536}


In [10]:
# Report the dimensions of the feature matrix and the label vector, one per line.
print(x.shape, y.shape, sep='\n')

(11072, 72)
(11072,)


In [11]:
# train, test split
# 10% of the rows are held out; stratify=y keeps the class proportions of y in both parts.
# NOTE(review): when `smote` is True, x / y have already been oversampled at this
# point, so the test part is not made of original rows only.
x_train, x_test, y_train, y_test = train_test_split(
	x,
	y,
	test_size=0.1,
	random_state=299,
	stratify=y,
)

In [12]:
# ======== Model 2: Random Forest (objective) ==========

In [13]:
# random forest model
# A fixed random_state makes the fitted forest, and therefore the accuracies printed
# below and the predictions made with `rf` later on, repeatable across runs.
# The seed reuses the value already used for the train/test split.
model = RandomForestClassifier(random_state=299)
rf = model.fit(x_train, y_train)	# fit() returns the estimator itself
predict_train_rf = rf.predict(x_train)
predict_test_rf = rf.predict(x_test)
print('Train Acc:', metrics.accuracy_score(y_train, predict_train_rf))
print('Test Acc:', metrics.accuracy_score(y_test, predict_test_rf))



Train Acc: 0.9945804897631473
Test Acc: 0.9016245487364621


In [14]:
# ======== Model 3: Gradient Boosting (objective) ==========

In [15]:
# gradient boosting model
# This is scikit-learn's GradientBoostingClassifier, not the xgboost library; the
# `xgb` variable names are kept unchanged.
# A fixed random_state keeps the fit repeatable; the seed reuses the split seed.
model = GradientBoostingClassifier(random_state=299)
xgb = model.fit(x_train, y_train)	# fit() returns the estimator itself
predict_train_xgb = xgb.predict(x_train)
predict_test_xgb = xgb.predict(x_test)
print('Train Accuracy:', metrics.accuracy_score(y_train, predict_train_xgb))
print('Test Accuracy:', metrics.accuracy_score(y_test, predict_test_xgb))

Train Accuracy: 0.9055600160578081
Test Accuracy: 0.8898916967509025


In [16]:
# Tally how many test labels fall into class 0 and class 1.
count = binary_counter(y_test)

In [17]:
# Show the per-class counts of the test labels, then the fraction that is class 0.
print(count)
negatives, positives = count[0], count[1]
print(negatives / (negatives + positives))

{0: 554, 1: 554}
0.5


In [18]:
# ======= Recommendation =========

In [19]:
# Load the table of pairs to score with the trained model; decoded as ISO-8859-1.
pair = pd.read_csv('../data/allpair.csv', encoding="ISO-8859-1")

In [20]:
# Preview the first rows of `pair` (head() returns five rows by default).
pair.head()

Unnamed: 0,iid,pid,iid_x,gender_x,wave_x,attr_o_x,sinc_o_x,intel_o_x,fun_o_x,amb_o_x,...,concerts_y,music_y,shopping_y,yoga_y,sinc1_1_y,intel1_1_y,fun1_1_y,amb1_1_y,age_diff,samerace
0,0,0,0,0,1,6.7,7.4,8.0,7.2,8.0,...,10.0,9.0,8.0,1.0,20.0,20.0,15.0,15.0,0.0,1
1,0,1,0,0,1,6.7,7.4,8.0,7.2,8.0,...,7.0,8.0,3.0,1.0,5.0,25.0,20.0,0.0,-3.0,0
2,0,2,0,0,1,6.7,7.4,8.0,7.2,8.0,...,7.0,5.0,8.0,7.0,10.0,35.0,10.0,10.0,-4.0,0
3,0,3,0,0,1,6.7,7.4,8.0,7.2,8.0,...,8.0,7.0,1.0,8.0,20.0,20.0,20.0,10.0,-2.0,0
4,0,4,0,0,1,6.7,7.4,8.0,7.2,8.0,...,3.0,7.0,8.0,3.0,5.0,25.0,25.0,10.0,0.0,0


In [21]:
# =================== Model Data ======================
# Detach the id / wave columns from `pair` (pop removes them in place); the
# remaining columns become the feature matrix passed to the classifier.
iid = pair.pop('iid')
pid = pair.pop('pid')
wave_x = pair.pop('wave_x')
wave_y = pair.pop('wave_y')

# The classifier was fitted on the columns of `data` by position, so the columns
# left in `pair` have to be the same names in the same order. Equal column counts
# alone do not guarantee that, so report any difference instead of predicting on
# silently shifted features.
train_cols = list(data.columns)
pair_cols = list(pair.columns)
if pair_cols != train_cols:
	print("[WARN] allpair.csv feature columns differ from the training feature columns")
	print("[WARN] missing from pair:", [c for c in train_cols if c not in pair_cols])
	print("[WARN] extra in pair:", [c for c in pair_cols if c not in train_cols])

x = pair.values.astype('float64')
print(x.shape)

(287296, 72)


In [22]:
# Score every candidate pair with the fitted random forest (`rf`).
# Predictions come back as floats because the training labels were cast to float64.
pred = rf.predict(x)
print(pred)

[0. 0. 0. ... 0. 0. 0.]


In [23]:
# Build the predicted edge list: id / wave columns side by side, plus the
# model's prediction for each row as the `match` column.
edge_frame = pd.concat(
	[iid, pid, wave_x, wave_y],
	axis=1,
	keys=['src', 'dst', 'wave_x', 'wave_y'],
)
edges = edge_frame.assign(match=pred)

In [24]:
# Preview the first rows of the predicted edge list (head() defaults to five rows).
edges.head()

Unnamed: 0,src,dst,wave_x,wave_y,match
0,0,0,1,1,0.0
1,0,1,1,1,0.0
2,0,2,1,1,0.0
3,0,3,1,1,0.0
4,0,4,1,1,0.0


In [25]:
# Report (rows, columns) of the predicted edge list.
print(edges.shape)

(287296, 5)


In [26]:
# Write the predicted edge list to disk, omitting the index column.
edges.to_csv('../data/edges.csv',index=False)

In [27]:
# Keep only the rows whose predicted `match` value is positive.
is_match = edges['match'] > 0
matches = edges.loc[is_match]

In [28]:
# Preview the first rows of the predicted matches (head() defaults to five rows).
matches.head()

Unnamed: 0,src,dst,wave_x,wave_y,match
15,0,15,1,1,1.0
16,0,16,1,1,1.0
17,0,17,1,1,1.0
41,0,41,1,2,1.0
45,0,45,1,2,1.0


In [29]:
# Report (rows, columns) of the predicted matches.
print(matches.shape)

(7610, 5)
