In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import shuffle
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

### CV for old features

In [13]:
# Positional split of the v3 training table: the first two columns are
# skipped, the last column is the target, everything in between is a feature.
train_set = pd.read_csv('data/train_nx_ig_v3.csv')
n_cols = train_set.shape[1]
X_train = train_set.iloc[:, 2:n_cols - 1]
y_train = train_set.iloc[:, n_cols - 1]

In [14]:
# Permute features and labels together with a fixed seed.
X_train, y_train = shuffle(
    X_train,
    y_train,
    random_state=0,
)

In [16]:
# First CV candidate: 50 shallow trees with large minimum split/leaf sizes.
clf_rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=4,
    min_samples_split=60,
    min_samples_leaf=50,
    n_jobs=1,
)

In [17]:
# Mean F1 over the 5 cross-validation folds.
cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5).mean()

0.9908574147073501

In [19]:
# Second CV candidate: more trees, shallower, with much larger split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=300,
    n_jobs=1,
)

In [20]:
# Mean F1 over the 5 cross-validation folds.
cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5).mean()

0.9896621490344097

These results suggest that the model is overfitting.

In [23]:
# Regress on the labels so the forest outputs a continuous score that can be
# thresholded afterwards.
reg_rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=400,
    n_jobs=1,
)
reg_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=400, min_samples_split=600,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [24]:
# Scores on the rows the regressor was fitted on (in-sample predictions).
y_pred_train = reg_rf.predict(X=X_train)

In [25]:
# Two-column float table: column 0 = true label, column 1 = predicted score.
reg_train = np.column_stack((np.array(y_train), y_pred_train)).astype(np.float64)
# Row order that ranks the table by predicted score, highest first.
indice = np.argsort(reg_train[:, 1])[::-1]
reg_train = reg_train[indice]
reg_train[:10]

array([[1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574],
       [1.        , 0.99543574]])

In [26]:
# Scan the ranked training rows for the score cut-off with the best F1.
# reg_train holds (label, score) rows sorted by score in descending order, so
# once row `idx` has been consumed, rows 0..idx are exactly the rows a rule
# `score >= row[1]` labels positive -- provided the next row scores lower.
p_best, r_best, f1_best, ts = 0, 0, 0, 0
num_ones = np.sum(y_train)
n_rows = len(reg_train)
tp = 0
for idx, row in enumerate(reg_train):
    if row[0] == 1:
        tp += 1
    # Rows sharing a score cannot be separated by any threshold (and argsort
    # does not guarantee their relative order), so a cut is only evaluated
    # after the last row of each group of tied scores.
    if idx + 1 < n_rows and reg_train[idx + 1, 1] == row[1]:
        continue
    # No true positive yet: precision and recall are both 0 and the F1
    # formula below would divide by zero.
    if tp == 0:
        continue
    p = tp / (idx + 1)
    r = tp / num_ones
    f1 = 2 * p * r / (p + r)
    if f1 > f1_best:
        p_best, r_best, f1_best = p, r, f1
        ts = row[1]
p_best, r_best, f1_best, ts

(0.981732816413192,
 0.9989108704084982,
 0.9902473510776129,
 0.24184651587323322)

In [27]:
# Load the test features here so this cell does not rely on a later cell
# having been executed first (the first two columns are skipped, as for the
# training table).
test_set = pd.read_csv('data/test_nx_ig_v3.csv')
X_test = test_set.iloc[:, 2:]

# The row whose score equals ts was counted as a positive when ts was
# selected, so the cut-off has to be inclusive.
y_test_pred = reg_rf.predict(X_test)
y_test_pred = y_test_pred >= ts
y_test_pred = y_test_pred.astype(int)
y_test_pred

array([0, 1, 1, ..., 0, 0, 1])

### Train, predict and output

In [15]:
# Test table: every column after the first two is a feature.
test_path = 'data/test_nx_ig_v3.csv'
test_set = pd.read_csv(test_path)
X_test = test_set.iloc[:, 2:]

In [8]:
# Only constructs the estimator: the fit call below is commented out, so
# clf_rf stays unfitted after this cell.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=30,
    min_samples_leaf=10,
    n_jobs=1,
)
# clf_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# y_test = clf_rf.predict(X_test)

In [28]:
# Submission file: one 'category' column, row position written as the 'id' index.
df = pd.DataFrame({'category': y_test_pred})
df.index.name = 'id'
df.to_csv('result/rf_reg3.csv', header=True, index=True)

### With Shiwen's new features

In [22]:
# Training table carrying the newer feature set.
treated_path = 'data/train_treated.csv'
train_set = pd.read_csv(treated_path)
train_set.head(5)

Unnamed: 0,id1,id2,link,rno1,rno2,sim,year1,year2,year_diff,common_authors,cn,aai,title_overlap
0,9510123,9502114,1,16827,15446,0.064373,1995,1995,0,0,1,0.513898,2
1,9707075,9604178,1,21154,18059,0.021211,1997,1996,1,0,20,4.320366,1
2,9312155,9506142,0,13074,16171,0.017202,1993,1995,-2,0,0,0.0,0
3,9911255,302165,0,27486,9702,0.012634,1999,2003,-4,0,0,0.0,0
4,9701033,209076,0,19856,8212,0.059588,1997,2002,-5,0,0,0.0,0


In [23]:
# Six feature columns used as predictors; 'link' is the binary target.
feature_cols = ['sim', 'year_diff', 'common_authors', 'cn', 'aai', 'title_overlap']
X_train = train_set[feature_cols]
y_train = train_set['link']
X_train.head(5)

Unnamed: 0,sim,year_diff,common_authors,cn,aai,title_overlap
0,0.064373,0,0,1,0.513898,2
1,0.021211,1,0,20,4.320366,1
2,0.017202,-2,0,0,0.0,0
3,0.012634,-4,0,0,0.0,0
4,0.059588,-5,0,0,0.0,0


In [24]:
# Permute features and labels together with a fixed seed.
X_train, y_train = shuffle(
    X_train,
    y_train,
    random_state=0,
)
X_train.head(5)

Unnamed: 0,sim,year_diff,common_authors,cn,aai,title_overlap
206307,0.057576,-1,0,0,0.0,1
545369,0.064932,6,0,15,3.795183,0
304493,0.009646,0,0,0,0.0,0
42171,0.00213,-2,0,0,0.0,0
268996,0.095356,0,0,0,0.0,0


In [25]:
# Trial: depth 6, small split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=30,
    min_samples_leaf=10,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9725118397484275

In [26]:
# Trial: depth 3, small split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=30,
    min_samples_leaf=10,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9673684632518584

In [27]:
# Trial: depth 3, split minimum 300 / leaf minimum 100.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=300,
    min_samples_leaf=100,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9674755951776779

In [28]:
# Trial: depth 3, split minimum 600 / leaf minimum 200.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=200,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9675669593503844

In [29]:
# Trial: same constraints as the previous cell but with 300 trees.
clf_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=200,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9681502188745899

### replace 5 old features with 5 new ones

In [30]:
# First source table (newer feature set).
treated_path = 'data/train_treated.csv'
train1 = pd.read_csv(treated_path)
train1.head(5)

Unnamed: 0,id1,id2,link,rno1,rno2,sim,year1,year2,year_diff,common_authors,cn,aai,title_overlap
0,9510123,9502114,1,16827,15446,0.064373,1995,1995,0,0,1,0.513898,2
1,9707075,9604178,1,21154,18059,0.021211,1997,1996,1,0,20,4.320366,1
2,9312155,9506142,0,13074,16171,0.017202,1993,1995,-2,0,0,0.0,0
3,9911255,302165,0,27486,9702,0.012634,1999,2003,-4,0,0,0.0,0
4,9701033,209076,0,19856,8212,0.059588,1997,2002,-5,0,0,0.0,0


In [31]:
# Second source table (v3 feature set).
nx_ig_path = 'data/train_nx_ig_v3.csv'
train2 = pd.read_csv(nx_ig_path)
train2.head(5)

Unnamed: 0,id1,id2,res_allo_ind,overlap_title,temp_diff,comm_auth,reduced_tfidf_sim,jaccard_coeff,tgt_citation,labels
0,9510123,9502114,0.142857,2,0,0,0.242161,0.176471,8,1
1,9707075,9604178,0.226401,1,1,0,0.11674,0.106796,124,1
2,9312155,9506142,0.0,0,-2,0,0.067705,0.0,2,0
3,9911255,302165,0.0,0,-4,0,0.054771,0.0,2,0
4,9701033,209076,0.0,0,-5,0,0.220903,0.0,2,0


In [32]:
# Column-wise join of the two tables; rows are matched on the DataFrame index.
treated_part = train1[['title_overlap', 'year_diff', 'common_authors', 'sim', 'cn', 'aai']]
nx_ig_part = train2[['jaccard_coeff', 'tgt_citation']]
X_train = pd.concat([treated_part, nx_ig_part], axis=1)
y_train = train1['link']
X_train.head(5)

Unnamed: 0,title_overlap,year_diff,common_authors,sim,cn,aai,jaccard_coeff,tgt_citation
0,2,0,0,0.064373,1,0.513898,0.176471,8
1,1,1,0,0.021211,20,4.320366,0.106796,124
2,0,-2,0,0.017202,0,0.0,0.0,2
3,0,-4,0,0.012634,0,0.0,0.0,2
4,0,-5,0,0.059588,0,0.0,0.0,2


In [33]:
# Permute features and labels together with a fixed seed.
X_train, y_train = shuffle(
    X_train,
    y_train,
    random_state=0,
)
X_train.head(5)

Unnamed: 0,title_overlap,year_diff,common_authors,sim,cn,aai,jaccard_coeff,tgt_citation
206307,1,-1,0,0.057576,0,0.0,0.0,8
545369,0,6,0,0.064932,15,3.795183,0.079439,164
304493,0,0,0,0.009646,0,0.0,0.0,0
42171,0,-2,0,0.00213,0,0.0,0.0,2
268996,0,0,0,0.095356,0,0.0,0.030303,14


In [34]:
# Trial: depth 6, small split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=30,
    min_samples_leaf=10,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.992933903795711

In [35]:
# Trial: depth 3, small split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=30,
    min_samples_leaf=10,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9884021101290539

In [36]:
# Trial: depth 3, split minimum 300 / leaf minimum 100.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=300,
    min_samples_leaf=100,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9880748876882601

In [37]:
# Trial: depth 3, split minimum 600 / leaf minimum 200.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=200,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9871544415795503

In [38]:
# Trial: same constraints as the previous cell but with 300 trees.
clf_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    min_samples_split=600,
    min_samples_leaf=200,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9880966772338258

These results suggest that the model is overfitting.

### Trials of features

#### 6 new features + tgt citation

In [2]:
# Six columns from the treated table plus 'tgt_citation' from the v2 table,
# joined column-wise on the DataFrame index, then shuffled with a fixed seed.
train1 = pd.read_csv('data/train_treated.csv')
train2 = pd.read_csv('data/train_nx_ig_v2.csv')
treated_cols = ['title_overlap', 'year_diff', 'common_authors', 'sim', 'cn', 'aai']
X_train = pd.concat([train1[treated_cols], train2[['tgt_citation']]], axis=1)
y_train = train1['link']
X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [4]:
# Exhaustive 5-fold grid search (scored by F1) over forest size, depth and
# minimum split size.
params_range = {
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'max_depth': [3, 4],
    'min_samples_split': [50, 100, 200, 300, 400],
}
# NOTE(review): warm_start=True is kept as in the original; it presumably has
# no effect here because the search fits a fresh clone per candidate -- confirm.
clf_rf = RandomForestClassifier(warm_start=True, n_jobs=2, random_state=0)
clf = GridSearchCV(estimator=clf_rf, param_grid=params_range, scoring='f1', cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 300, 400, 500, 600], 'min_samples_split': [50, 100, 200, 300, 400], 'max_depth': [3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [7]:
# Best mean CV score and the estimator configuration that achieved it.
print(clf.best_score_, clf.best_estimator_, sep='\n')

0.9698362327686516
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=True)


In [10]:
# Trial: depth 5 with split minimum 200 / leaf minimum 50.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=200,
    min_samples_leaf=50,
    n_jobs=1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.972397257457392

In [12]:
# Trial: depth 5, default split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.972594618211903

In [14]:
# Trial: depth 6, default split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    n_jobs=-1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9733753835889626

In [15]:
# Trial: depth 10, default split/leaf minimums.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
)
# Mean F1 over the 5 cross-validation folds.
np.mean(cross_val_score(clf_rf, X_train, y_train, scoring='f1', cv=5))

0.9745350779254398

The models above with max_depth > 4 overfit more than the best parameters found by the grid search.

In [16]:
# Fit the depth-5 classifier on the full (shuffled) training set.
clf_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
)
clf_rf.fit(X_train, y_train)

# Assemble the test features with the same columns, in the same order, as X_train.
test1 = pd.read_csv('data/test_treated.csv')
test2 = pd.read_csv('data/test_nx_ig_v2.csv')
treated_cols = ['title_overlap', 'year_diff', 'common_authors', 'sim', 'cn', 'aai']
X_test = pd.concat([test1[treated_cols], test2[['tgt_citation']]], axis=1)

# Predict and write the submission (row position becomes the 'id' index).
y_test_pred = clf_rf.predict(X_test)
df = pd.DataFrame({'category': y_test_pred})
df.index.name = 'id'
df.to_csv('result/rf_6new+tgt_1.csv', header=True, index=True)

In [17]:
# Fit a regressor on the labels and score the rows it was trained on.
reg_rf = RandomForestRegressor(n_estimators=100, max_depth=4, min_samples_split=200, n_jobs=1)
reg_rf.fit(X_train, y_train)
y_train_pred = reg_rf.predict(X_train)

# (label, score) table ranked by score, highest first.
reg_train = np.zeros((len(y_train), 2))
reg_train[:, 0] = np.array(y_train)
reg_train[:, 1] = y_train_pred
indice = np.argsort(reg_train[:, 1])[::-1]
reg_train = reg_train[indice]

# Scan the ranking for the cut-off with the best training F1. After row `idx`,
# rows 0..idx are exactly the rows a rule `score >= row[1]` labels positive,
# provided the next row scores lower.
p_best, r_best, f1_best, ts = 0, 0, 0, 0
num_ones = np.sum(y_train)
n_rows = len(reg_train)
tp = 0
for idx, row in enumerate(reg_train):
    if row[0] == 1:
        tp += 1
    # Rows sharing a score cannot be separated by any threshold (and argsort
    # does not guarantee their relative order), so a cut is only evaluated
    # after the last row of each group of tied scores.
    if idx + 1 < n_rows and reg_train[idx + 1, 1] == row[1]:
        continue
    # No true positive yet: precision and recall are both 0 and the F1
    # formula below would divide by zero.
    if tp == 0:
        continue
    p = tp / (idx + 1)
    r = tp / num_ones
    f1 = 2 * p * r / (p + r)
    if f1 > f1_best:
        p_best, r_best, f1_best = p, r, f1
        ts = row[1]
print(p_best, r_best, f1_best, ts)

# Assemble the test features with the same columns, in the same order, as X_train.
test1 = pd.read_csv('data/test_treated.csv')
test2 = pd.read_csv('data/test_nx_ig_v2.csv')
X_test = pd.concat([test1.loc[:,['title_overlap','year_diff','common_authors','sim','cn','aai']], test2.loc[:,['tgt_citation']]], axis=1)

# Inclusive cut-off: the row scoring exactly ts was counted as a positive when
# ts was selected above.
y_test_pred = reg_rf.predict(X_test)
y_test_pred = y_test_pred >= ts
y_test_pred = y_test_pred.astype(int)

df = pd.DataFrame(y_test_pred, columns=['category'])
df.index.name = 'id'
df.to_csv('result/rf_6new+tgt_reg.csv', index=True, header=True)

0.9819538281421498 0.9610389997911258 0.9713838479193875 0.5407360871850211


#### new new features

In [4]:
# Eight columns from the journal/w2v table plus three from the v2 table,
# joined column-wise on the DataFrame index.
train1 = pd.read_csv('../data/train_treated_with_journal_w2v.csv')
train2 = pd.read_csv('../data/train_nx_ig_v2.csv')
w2v_cols = ['sim', 'cn', 'aai', 'year_diff', 'common_authors', 'title_overlap', 'journal_overlap', 'wmd']
nx_ig_cols = ['res_allo_ind', 'tgt_citation', 'jaccard_nx']
X_train = pd.concat([train1[w2v_cols], train2[nx_ig_cols]], axis=1)
y_train = train1['link']
# Shuffled copies for cross-validation; X_train / y_train keep the file order.
X_train_cv, y_train_cv = shuffle(X_train, y_train, random_state=0)

In [5]:
# 5-fold CV (mean F1) on the shuffled copies of the training data.
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=2)
# Fixed typo: the labels argument was `y_train+cv`, which raises NameError
# because `cv` is not a variable; the shuffled labels are `y_train_cv`.
cross_val_score(clf_rf, X_train_cv, y_train_cv, cv=5, scoring='f1').mean()

0.9727744696757761

In [6]:
# Fit a regressor on the labels and score the rows it was trained on.
reg_rf = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_split=200, n_jobs=2)
reg_rf.fit(X_train, y_train)
y_train_pred = reg_rf.predict(X_train)

# (label, score) table ranked by score, highest first.
reg_train = np.zeros((len(y_train), 2))
reg_train[:, 0] = np.array(y_train)
reg_train[:, 1] = y_train_pred
indice = np.argsort(reg_train[:, 1])[::-1]
reg_train = reg_train[indice]

# Scan the ranking for the cut-off with the best training F1. After row `idx`,
# rows 0..idx are exactly the rows a rule `score >= row[1]` labels positive,
# provided the next row scores lower.
p_best, r_best, f1_best, ts = 0, 0, 0, 0
num_ones = np.sum(y_train)
n_rows = len(reg_train)
tp = 0
for idx, row in enumerate(reg_train):
    if row[0] == 1:
        tp += 1
    # Rows sharing a score cannot be separated by any threshold (and argsort
    # does not guarantee their relative order), so a cut is only evaluated
    # after the last row of each group of tied scores.
    if idx + 1 < n_rows and reg_train[idx + 1, 1] == row[1]:
        continue
    # No true positive yet: precision and recall are both 0 and the F1
    # formula below would divide by zero.
    if tp == 0:
        continue
    p = tp / (idx + 1)
    r = tp / num_ones
    f1 = 2 * p * r / (p + r)
    if f1 > f1_best:
        p_best, r_best, f1_best = p, r, f1
        ts = row[1]
print(p_best, r_best, f1_best, ts)

0.9801911701619871 0.9681526571778116 0.9741347216217514 0.28222400492706284


### to stack

In [7]:
# Reload the training features in file order (no shuffle) for stacking.
train1 = pd.read_csv('../data/train_treated_with_journal_w2v.csv')
train2 = pd.read_csv('../data/train_nx_ig_v2.csv')
w2v_cols = ['sim', 'cn', 'aai', 'year_diff', 'common_authors', 'title_overlap', 'journal_overlap', 'wmd']
nx_ig_cols = ['res_allo_ind', 'tgt_citation', 'jaccard_nx']
X_train = pd.concat([train1[w2v_cols], train2[nx_ig_cols]], axis=1)
y_train = train1['link']

In [8]:
# Fit a regressor on the labels and score the rows it was trained on.
reg_rf = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_split=200, n_jobs=2)
reg_rf.fit(X_train, y_train)
y_train_pred = reg_rf.predict(X_train)

# (label, score) table ranked by score, highest first.
reg_train = np.zeros((len(y_train), 2))
reg_train[:, 0] = np.array(y_train)
reg_train[:, 1] = y_train_pred
indice = np.argsort(reg_train[:, 1])[::-1]
reg_train = reg_train[indice]

# Scan the ranking for the cut-off with the best training F1. After row `idx`,
# rows 0..idx are exactly the rows a rule `score >= row[1]` labels positive,
# provided the next row scores lower.
p_best, r_best, f1_best, ts = 0, 0, 0, 0
num_ones = np.sum(y_train)
n_rows = len(reg_train)
tp = 0
for idx, row in enumerate(reg_train):
    if row[0] == 1:
        tp += 1
    # Rows sharing a score cannot be separated by any threshold (and argsort
    # does not guarantee their relative order), so a cut is only evaluated
    # after the last row of each group of tied scores.
    if idx + 1 < n_rows and reg_train[idx + 1, 1] == row[1]:
        continue
    # No true positive yet: precision and recall are both 0 and the F1
    # formula below would divide by zero.
    if tp == 0:
        continue
    p = tp / (idx + 1)
    r = tp / num_ones
    f1 = 2 * p * r / (p + r)
    if f1 > f1_best:
        p_best, r_best, f1_best = p, r, f1
        ts = row[1]
print(p_best, r_best, f1_best, ts)

0.9803432080706483 0.9679049920926208 0.9740843953826381 0.29781174511414493


In [9]:
y_train_pred = y_train_pred > ts
y_train_pred = y_train_pred.astype(int)
train1['rf_stack'] = y_train_pred
train1.to_csv('../data/train_stacked.csv', index=False)

In [10]:
test1 = pd.read_csv('../data/test_treated_with_journal_w2v.csv')
test2 = pd.read_csv('../data/test_nx_ig_v2.csv')
X_test = pd.concat([test1.loc[:,['sim','cn','aai','year_diff','common_authors','title_overlap','journal_overlap','wmd']], test2.loc[:,['res_allo_ind','tgt_citation','jaccard_nx']]], axis=1)

In [11]:
y_test_pred = reg_rf.predict(X_test)
y_test_pred = y_test_pred > ts
y_test_pred = y_test_pred.astype(int)
test1['rf_stack'] = y_test_pred
test1.to_csv('../data/test_stacked.csv', index=False)

df = pd.DataFrame(y_test_pred, columns=['category'])
df.index.name = 'id'
df.to_csv('../result/rf_new_reg.csv', index=True, header=True)