In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import math
from sklearn.preprocessing import MinMaxScaler

### Load data

In [2]:
# load datasets
domain1 = pd.read_json('data/domain1_train.json', lines=True)
domain2 = pd.read_json('data/domain2_train.json', lines=True)
test = pd.read_json('data/test_set.json', lines=True)

In [3]:
# random sample without replacement from label 0
# label_0_rate: the rate of label 0 in the sampled dataframe, can be any number

def under_sample(df, label_0_rate):

    sampled_label_1 = df[df['label'] == 1]
    label_1_count = len(df[df['label'] == 1])

    #calculate the number of label 0 samples
    sampled_label_0_count = int(label_0_rate * label_1_count)

    total_label_0 = df[df['label'] == 0]

    # calculate the number needed for each model
    model_count = sampled_label_0_count // 7
    sampled_label_0 = pd.DataFrame()

    # use for loop to sample from each model
    for i in range(7):
        model = total_label_0[total_label_0['model'] == i]
        sampled_model = model.sample(n=model_count, replace=False)
        sampled_label_0 = pd.concat([sampled_label_0, sampled_model])
    
    # domain2_machine = train_data2[train_data2['label'] == 8].sample(4388)
    # domain2_human = train_data2[train_data2['label'] == 1].sample(4300, replace = True)
    # print(len(doamin2_machine), len(domain2_human))
    # train_data = pd.concat(train_datal, domain2_machine[['text,'label']],domain2 human[['text','label']]]).sample(frac = 1)

    # concatenate the sampled label 0 and sampled label 1
    sampled_df = pd.concat([sampled_label_1, sampled_label_0])
    
    return sampled_df

In [4]:
## Bootstrap human instances in domain2
# %run -i explore.ipynb
newdomain2 = under_sample(domain2,1)
newdomain2 = newdomain2.sort_index()

In [5]:
# split data for validation
train_domain1, valid_domain1 = train_test_split(domain1,test_size=0.2,random_state=12)
train_domain2, valid_domain2 = train_test_split(newdomain2,test_size=0.2,random_state=12) 

### Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from joblib import dump,load

In [7]:
def vectorizer(method, train_df,valid_df, ngram, max_fea):
    if method == 'count':
        vectorizer = CountVectorizer(ngram_range=ngram, max_features=max_fea, min_df=2)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram, max_features=max_fea)

    ## transform train domain1's text to string 
    corpus = [' '.join(str(word_id) for word_id in document) for document in train_df['text']]
    # learn the token dictionary and return document-term matrix
    df_vec = vectorizer.fit_transform(corpus)
    # get vectorized data
    train_X = df_vec.toarray()

    ## transform test domain1's text to string 
    corpus_test = [' '.join(str(word_id) for word_id in document) for document in valid_df['text']]
    valid_X = vectorizer.transform(corpus_test)
    valid_X.shape

    return train_X, valid_X, vectorizer

In [8]:
def LR(train_df, train_y, max_iter, pen, c, sam_solver):
    lr = LogisticRegression(max_iter=max_iter, penalty=pen, C=c, solver=sam_solver)
    ## Logistic regression in domain 1
    lr.fit(train_df,train_y)
    return lr 

In [9]:
def print_score(lr,train_df,train_y,valid_df,valid_y):
    ## Accuracy
    print("training Accuracy: {:.4f}".format(lr.score(train_df,train_y)))
    print("valid Accuracy: {:.4f}".format(lr.score(valid_df,valid_y)))
    ## F1 score 
    print("Train F1 score: {:.4f}".format(metrics.f1_score(train_y,lr.predict(train_df),average='macro')))
    print("Test F1 score: {:.4f}".format(metrics.f1_score(valid_y,lr.predict(valid_df),average='macro')))

In [10]:
def comb_lrs(train_domain1,train_domain2,valid_domain1,valid_domain2,
             dm1_ngram,dm1_max_fea,dm1_max_iter,dm1_pen, dm1_c, dm1_sol,
             dm2_ngram,dm2_max_fea,dm2_max_iter,dm2_pen, dm2_c, dm2_sol,
             tt_ngram,tt_max_fea,tt_max_iter,tt_pen, tt_c, tt_sol
             ):
    ## domain 1 do LR
    train_domain1_X, valid_domain1_X, v1 = vectorizer('count',train_domain1,valid_domain1,ngram=dm1_ngram,max_fea=dm1_max_fea)
    lr1 = LR(train_domain1_X,train_domain1['label'],dm1_max_iter,dm1_pen,dm1_c,dm1_sol)
    print("Domain1 LR")
    print_score(lr1,train_domain1_X,train_domain1['label'],valid_domain1_X,valid_domain1['label'])
    
    ## domain 2 do LR
    train_domain2_X, valid_domain2_X, v2 = vectorizer('count',train_domain2,valid_domain2,ngram=dm2_ngram,max_fea=dm2_max_fea)
    lr2 = LR(train_domain2_X,train_domain2['label'], dm2_max_iter ,dm2_pen, dm2_c,dm2_sol)
    print("Domain2 LR")
    print_score(lr2,train_domain2_X,train_domain2['label'],valid_domain2_X,valid_domain2['label'])

    ## combine dm1 dm2 datasets
    total_train = pd.concat([train_domain1,train_domain2],axis=0,ignore_index=True)
    total_valid = pd.concat([valid_domain1,valid_domain2],axis=0,ignore_index=True)
    train_tt_X, valid_tt_X, v_tt = vectorizer('count',total_train,total_valid,ngram=tt_ngram,max_fea=tt_max_fea)

    ## total dataset do LR
    lr_tt = LogisticRegression(max_iter=tt_max_iter, penalty=tt_pen, C=tt_c, solver=tt_sol,class_weight='balanced')
    lr_tt.fit(train_tt_X,total_train['label'])

    print("Total LR")
    print_score(lr_tt,train_tt_X,total_train['label'],valid_tt_X,total_valid['label'])
    
    return v1, v2, v_tt, lr1, lr2, lr_tt, total_train,total_valid

In [11]:
def voting(v1,v2,v3,lr_1,lr_2,lr_total,tt_valid):
    corpus_tt_valid = [' '.join(str(word_id) for word_id in document) for document in tt_valid['text']]
    
    valid_md1 = v1.transform(corpus_tt_valid)
    valid_md2 = v2.transform(corpus_tt_valid)
    valid_md3 = v3.transform(corpus_tt_valid)
    pred_md1 = lr_1.predict(valid_md1)
    pred_md2 =  lr_2.predict(valid_md2)
    pred_md3 =  lr_total.predict(valid_md3)

    pred_voting = pd.DataFrame({'model1' : np.array(pred_md1),
                            'model2' : np.array(pred_md2),
                           'model3' : np.array(pred_md3)
                            })
    pred_voting['voting'] = pred_voting.mode(axis=1)
    pred_voting = pd.concat([pred_voting,tt_valid['label']],axis=1)
    print("Voting Valid Accuracy {:.4f}".format(
    (pred_voting['voting'] == pred_voting['label']).sum()/len(pred_voting)))
    print("Voting Valid F1 score {:.4f}".format(
    format(metrics.f1_score(pred_voting['label'],pred_voting['voting'],average='macro'))))

In [12]:
scaler = MinMaxScaler()

In [16]:
def stacking(total_valid,v1,v2,v_tt,lr1,lr2,lr_total):
    corpus_total_valid = [' '.join(str(word_id) for word_id in document) for document in total_valid['text']]

    total_valid_length = pd.DataFrame(total_valid['text'].apply(lambda x: len(x)))
    total_valid_length.rename(columns={'text':'length'},inplace=True)
    total_valid_length = scaler.transform(total_valid_length)

    total_valid_length = total_valid_length.values

    ## transfer data 
    valid_md1 = v1.transform(corpus_total_valid)
    valid_md2 = v2.transform(corpus_total_valid)
    valid_md3 = v_tt.transform(corpus_total_valid)

    ## get prediction in each lr
    pred_md1 = lr1.predict(valid_md1)
    pred_md2 = (lr2.predict_proba(valid_md2)[:,1] >= 0.95).astype(int)    
    pred_md3 =  lr_total.predict(valid_md3)

    ## using the stacking model to test validation set
    X_val_meta = np.column_stack((pred_md1,pred_md2, pred_md3))

    # Train the meta-model on the combined feature matrix and the target values
    meta_model = LogisticRegressionCV(cv = 5,random_state=12)
    meta_model.fit(X_val_meta, total_valid['label'])

    y_val_meta = meta_model.predict(X_val_meta)
    y_val_meta = pd.DataFrame(y_val_meta, columns=['meta'])

    pred_stacking = pd.DataFrame()
    pred_stacking['stacking'] = y_val_meta['meta'].apply(lambda x: 0 if x < 0.5 else 1)

    pred_stacking = pd.concat([pred_stacking,total_valid['label']],axis=1)

    print("Stacking valid accuracy {}".format((pred_stacking['stacking'] == pred_stacking['label']).sum()/len(pred_stacking)))
    print("Stacking valid F1 score {}".format(metrics.f1_score(pred_stacking['label'],pred_stacking['stacking'],average='macro')))
    return meta_model

In [15]:
def test_ensemble(v1,v2,v_tt,lr1,lr2,lr_total,meta_model):

    corpus_test = [' '.join(str(word_id) for word_id in document) for document in test['text']]
    
    test_length = pd.DataFrame(test['text'].apply(lambda x: len(x)))
    test_length.rename(columns={'text':'length'},inplace=True)
    test_length = scaler.transform(test_length)

    test_length = test_length.values


    test_md1 = v1.transform(corpus_test)
    test_md2 = v2.transform(corpus_test)
    test_md3 = v_tt.transform(corpus_test)

    ptest_md1 = lr1.predict(test_md1)
    ptest_md2 = lr2.predict(test_md2)
    ptest_md3 = lr_total.predict(test_md3)

    X_test_meta = np.column_stack((ptest_md1,ptest_md2, ptest_md3))

    y_test_meta = meta_model.predict(X_test_meta)

    # majority voting
    pre = pd.DataFrame({'model1' : np.array(ptest_md1),
                            'model2' : np.array(ptest_md2),
                           'model3' : np.array(ptest_md3)
                            })
   # pre['voting'] = pre.mode(axis=1)
    pre['stacking'] = y_test_meta
    return pre

In [14]:
## Useless
def process_LR(train_df,valid_df,ngram,max_fea,max_iter,pen,C,solver):
    train_X, valid_X, vec = vectorizer('count',train_df,valid_df,ngram=ngram,max_fea=max_fea)
    lr = LR(train_X,train_df['label'],max_iter,pen,C,solver)
    print_score(lr,train_X,train_df['label'],valid_X,valid_df['label'])
    return lr, vec

In [15]:
default_pen = 'l2'
default_C = 1.0
default_solver = 'lbfgs'

In [17]:
v1, v2, v_tt, lr1, lr2, lr_tt,total_train,total_valid = comb_lrs(train_domain1, train_domain2, valid_domain1, valid_domain2,
         (1,2),20000,150,'l2',1.0,default_solver,
          (1,2),20000,400,'l2',1.0,default_solver,
           (1,2),20000,300,'l2',1.0,default_solver)

NameError: name 'default_solver' is not defined

In [90]:
train_domain2.model.value_counts()

2.0    246
4.0    245
3.0    242
6.0    242
1.0    242
0.0    237
5.0    230
Name: model, dtype: int64

In [95]:
valid_domain2_2 = valid_domain2[valid_domain2['model']==2.0]
train_domain2_X, valid_domain2_X, v1 = vectorizer('count',train_domain2,valid_domain2_2,ngram=(1,2),max_fea=20000)
print("valid Accuracy: {:.4f}".format(lr2.score(valid_domain2_X,valid_domain2_2['label'])))
print("Test F1 score: {:.4f}".format(metrics.f1_score(valid_domain2_2['label'],lr2.predict(valid_domain2_X),average='macro')))

valid Accuracy: 0.3934
Test F1 score: 0.2824


In [96]:
valid_domain2.model.value_counts()

5.0    77
0.0    70
3.0    65
6.0    65
1.0    65
4.0    62
2.0    61
Name: model, dtype: int64

In [98]:
valid_domain2_3 = valid_domain2[valid_domain2['model']==1.0]
train_domain2_X, valid_domain2_X, v1 = vectorizer('count',train_domain2,valid_domain2_3,ngram=(1,2),max_fea=20000)
print("valid Accuracy: {:.4f}".format(lr2.score(valid_domain2_X,valid_domain2_3['label'])))
print("Test F1 score: {:.4f}".format(metrics.f1_score(valid_domain2_3['label'],lr2.predict(valid_domain2_X),average='macro')))

valid Accuracy: 0.6615
Test F1 score: 0.3981


In [44]:
## combine dm1 dm2 datasets
total_train = pd.concat([train_domain1,train_domain2],axis=0,ignore_index=True)
total_valid = pd.concat([valid_domain1,valid_domain2],axis=0,ignore_index=True)
train_tt_X, valid_tt_X, v_tt = vectorizer('count',total_train,total_valid,ngram=(1,2),max_fea=20000)

In [47]:
train_domain1_X, valid_domain2_X, v1 = vectorizer('count',train_domain1,total_valid,ngram=(1,2),max_fea=20000)
lr1 = LR(train_domain1_X,train_domain1['label'],150,'l2',1.0,default_solver)
print("Domain1 Model Valid_domain2 LR")
print_score(lr1,train_domain1_X,train_domain1['label'],valid_domain2_X,total_valid['label'])

Domain1 Model Valid_domain2 LR
training Accuracy: 0.9892
valid Accuracy: 0.8441
Train F1 score: 0.9892
Test F1 score: 0.8431


In [88]:
def call_stacking():
    stacking_model = stacking(total_valid,v1,v2,v_tt,lr1,lr2,lr_tt)
    test_pred = test_ensemble(v1,v2,v_tt,lr1,lr2,lr_tt,stacking_model)
    return test_pred

In [102]:
stacking_model = stacking(total_valid,v1,v2,v_tt,lr1,lr2,lr_tt)

Stacking valid accuracy 0.8586134453781512
Stacking valid F1 score 0.8583901894267935


In [83]:
stacking_model.coef_[0]

array([ 0.07769313, -0.00181605,  0.08108293])

In [73]:
test_pred = call_stacking()

NotFittedError: This MinMaxScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [35]:
voting(v1,v2,v_tt,lr1,lr2,lr_tt,total_valid)

Voting Valid Accuracy 0.8553


ValueError: Unknown format code 'f' for object of type 'str'

In [25]:
test_pred.head()

Unnamed: 0,model1,model2,model3,voting,stacking
0,1,1,1,1,1
1,1,0,0,0,0
2,1,1,1,1,1
3,0,0,0,0,0
4,0,1,0,0,0


In [26]:
df = pd.DataFrame({'id': range(len(test)), 'class': test_pred['voting']})

In [27]:
df.to_csv('data/max_iter_15050100_Undsamp_(1,2)_voting_20000_LR_.csv', index=False)

### Gridsearch

In [None]:
parameters = {
    'penalty': ['l1', 'l2'],  # or 'elasticnet' if using both L1 and L2 penalties
    'C'       : np.logspace(-3,3,7),
    'solver': [ 'saga']  # 'liblinear' for small datasets, 'saga' for large datasets
}

def LR(train_df, train_y,iter):
    lr = LogisticRegression(max_iter=iter)
    clf = GridSearchCV(lr, param_grid=parameters, scoring='accuracy',cv=5)
    clf.fit(train_df,train_y)
    return lr 

def comb_lrs(train_domain1,train_domain2,valid_domain1,valid_domain2,
             dm1_ngram,dm1_max_fea,dm1_iter,
             dm2_ngram,dm2_max_fea,dm2_iter,
             tt_ngram,tt_max_fea,tt_iter
             ):
    ## domain 1 do LR
    train_domain1_X, valid_domain1_X, v1 = vectorizer('count',train_domain1,valid_domain1,ngram=dm1_ngram,max_fea=dm1_max_fea)
    lr1 = LR(train_domain1_X,train_domain1['label'],dm1_iter)
    print("Domain1 LR")
    print_score(lr1,train_domain1_X,train_domain1['label'],valid_domain1_X,valid_domain1['label'])
    
    ## domain 2 do LR
    train_domain2_X, valid_domain2_X, v2 = vectorizer('count',train_domain2,valid_domain2,ngram=dm2_ngram,max_fea=dm2_max_fea)
    lr2 = LR(train_domain2_X,train_domain2['label'],dm2_iter)
    print("Domain2 LR")
    print_score(lr2,train_domain2_X,train_domain2['label'],valid_domain2_X,valid_domain2['label'])

    ## combine dm1 dm2 datasets
    total_train = pd.concat([train_domain1,train_domain2],axis=0,ignore_index=True)
    total_valid = pd.concat([valid_domain1,valid_domain2],axis=0,ignore_index=True)
    train_tt_X, valid_tt_X, v_tt = vectorizer('count',total_train,total_valid,ngram=tt_ngram,max_fea=tt_max_fea)

    ## total dataset do LR
    lr_tt = LR(train_tt_X,total_train['label'])
    print("Total LR")
    print_score(lr_tt,train_tt_X,total_train['label'],valid_tt_X,total_valid['label'])
    
    return v1, v2, v_tt, lr1, lr2, lr_tt, total_train,total_valid

### First oversample

In [24]:
# random sample with replacement from label 1
# label_1_rate: the rate of label 1 in the sampled dataframe, should be between 0 and 1

def rated_sample(df, label_1_rate):

    sampled_label_0 = df[df['label'] == 0]
    label_0_count = len(df[df['label'] == 0])

    #calculate the number of label 1 samples
    sampled_label_1_count = int(label_1_rate * label_0_count)
    
    # random sample with replacement from label 1
    sampled_label_1 = df[df['label'] == 1].sample(n=sampled_label_1_count, replace=True)

    # concatenate the sampled label 0 and sampled label 1
    sampled_df = pd.concat([sampled_label_0, sampled_label_1])
    
    return sampled_df

In [27]:
over_dm2 = rated_sample(domain2,1)
over_dm2.label.value_counts()

0    12750
1    12750
Name: label, dtype: int64