In [1]:
# Required Python Packages
import xgboost
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Directory that holds the tab-separated input files read by the cells below.
# The trailing slash is required: file names are appended to this value by
# plain string concatenation.
DATA_DIR = "/home/mag/Extra_Disk_1TB/raw_mag_files/2018-03-06/"

# Number of rows pandas reads per chunk while loading a file.
CHUNK_SIZE = 100 * 1000

In [2]:
# Column names used to pick the feature columns and the target column.
# main() slices this list by position, so the order of the entries matters;
# the trailing comments give the list index of the first entry on each line.
HEADERS = [
    "FIRST_AUTHOR_RANK", "FIRST_AUTHOR_PAPER_COUNT", "FIRST_AUTHOR_CITE_COUNT",                 # 0
    "LAST_AUTHOR_RANK", "LAST_AUTHOR_PAPER_COUNT", "LAST_AUTHOR_CITE_COUNT",                    # 3
    "LAST_AUTHOR_ORG_RANK", "LAST_AUTHOR_ORG_PAPER_COUNT", "LAST_AUTHOR_ORG_CITE_COUNT",        # 6
    "CON_SERIES_RANK", "CON_SERIES_PAPER_COUNT", "CON_SERIES_CITE_COUNT",                       # 9
    "CON_INSTANCE_RANK", "CON_INSTANCE_PAPER_COUNT", "CON_INSTANCE_CITE_COUNT",                 # 12
    "FOS_ID_HIGHEST", "FOS_LEVEL_HIGHEST", "FOS_PAPER_COUNT_HIGHEST", "FOS_CITE_COUNT_HIGHEST", # 15
    "AGE",                                                                                      # 19
    "CITED_CLASS",                                                                              # 20
]

def split_dataset(dataset, train_percentage, feature_headers, target_header, random_state=None):
    """
    Split a dataset into train and test parts.

    :param dataset: table indexable by column label(s); main() passes a pandas DataFrame
    :param train_percentage: forwarded to train_test_split as ``train_size``
        (main() passes the fraction 0.7)
    :param feature_headers: list of column labels used as model inputs
    :param target_header: single column label used as the prediction target
    :param random_state: forwarded to train_test_split; the default ``None``
        keeps the earlier unseeded behaviour, pass an int to get the same
        split on every run
    :return: tuple ``(train_x, test_x, train_y, test_y)``
    """
    features = dataset[feature_headers]
    target = dataset[target_header]

    # Only train_size is given; the test part is whatever remains.
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=train_percentage,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y


def main(seed=42):
    """
    Train an XGBoost classifier on 'Con_Data_1m_Cited_Sample.txt' and print its evaluation.

    Steps: load the file, split it 70/30 into train and test parts, fit the
    classifier, then print the split shapes, the fitted model, train/test
    accuracy, the confusion matrix, feature importances and the
    classification report.

    :param seed: value used to seed NumPy's global random generator before the
        split so repeated runs produce the same train/test partition; pass
        ``None`` to let NumPy reseed itself (non-repeatable)
    :return: None
    """
    # Local import: numpy is only needed here, for seeding.
    import numpy as np

    # Load the tab-separated file in chunks and stitch the chunks together.
    data_path = DATA_DIR + 'Con_Data_1m_Cited_Sample.txt'
    try:
        # pandas >= 1.3: 'warn' skips malformed rows and reports them, which
        # is what error_bad_lines=False did with its default warn_bad_lines.
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                on_bad_lines='warn', chunksize=CHUNK_SIZE)
    except TypeError:
        # Older pandas rejects on_bad_lines as an unknown keyword; use the
        # legacy spelling (removed in pandas 2.0).
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                error_bad_lines=False, chunksize=CHUNK_SIZE)
    dataset = pd.concat(df_reader, ignore_index=True)

    # train_test_split draws from NumPy's global generator when it is given
    # no random_state, so seeding that generator makes the split repeatable.
    np.random.seed(seed)

    # Features are the first 19 names in HEADERS, the target is the name at index 20.
    # NOTE(review): with the HEADERS list of this cell the slice 0:19 leaves out
    # "AGE" (index 19), which the later cells do use as a feature - confirm
    # whether that is intended.
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[0:19], HEADERS[20])

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)

    # Fit an XGBoost classifier with 1000 estimators
    trained_model = XGBClassifier(n_estimators=1000)
    trained_model.fit(train_x, train_y)
    print(trained_model)
    predictions = trained_model.predict(test_x)

    # Train and Test Accuracy
    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix ", confusion_matrix(test_y, predictions))

    # Feature importances, one row per feature column, largest first
    importance_table = pd.DataFrame(trained_model.feature_importances_,
                                    index=train_x.columns,
                                    columns=['importance'])
    print("\nFeature Importance :: \n", importance_table.sort_values('importance', ascending=False))

    # Classification report on the held-out test part
    print("\nClassification Report :: \n", classification_report(test_y, predictions))


if __name__ == "__main__":
    main()

Train_x Shape ::  (700000, 19)
Train_y Shape ::  (700000,)
Test_x Shape ::  (300000, 19)
Test_y Shape ::  (300000,)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Train Accuracy ::  0.879077142857
Test Accuracy  ::  0.877466666667
 Confusion matrix  [[ 80468  30201]
 [  6559 182772]]

Feature Importance :: 
                              importance
CON_SERIES_CITE_COUNT          0.101087
CON_SERIES_PAPER_COUNT         0.096239
FIRST_AUTHOR_CITE_COUNT        0.092125
CON_SERIES_RANK                0.075522
LAST_AUTHOR_CITE_COUNT         0.072436
FIRST_AUTHOR_PAPER_COUNT       0.069791
FIRST_AUTHOR_RANK              0.064208
LAST_AUTHOR_PAPER_COUNT     

In [3]:
# Column names used to pick the feature columns and the target column.
# main() slices this list by position, so the order of the entries matters;
# the trailing comments give the list index of the first entry on each line.
HEADERS = [
    "FIRST_AUTHOR_RANK", "FIRST_AUTHOR_PAPER_COUNT", "FIRST_AUTHOR_CITE_COUNT",                 # 0
    "LAST_AUTHOR_RANK", "LAST_AUTHOR_PAPER_COUNT", "LAST_AUTHOR_CITE_COUNT",                    # 3
    "LAST_AUTHOR_ORG_RANK", "LAST_AUTHOR_ORG_PAPER_COUNT", "LAST_AUTHOR_ORG_CITE_COUNT",        # 6
    "CON_SERIES_RANK", "CON_SERIES_PAPER_COUNT", "CON_SERIES_CITE_COUNT",                       # 9
    "CON_INSTANCE_RANK", "CON_INSTANCE_PAPER_COUNT", "CON_INSTANCE_CITE_COUNT",                 # 12
    "FOS_ID_HIGHEST", "FOS_LEVEL_HIGHEST", "FOS_PAPER_COUNT_HIGHEST", "FOS_CITE_COUNT_HIGHEST", # 15
    "AGE",                                                                                      # 19
    "CITED_CLASS",                                                                              # 20
    "CITE_CLASS_SCORE",                                                                         # 21
]

def split_dataset(dataset, train_percentage, feature_headers, target_header, random_state=None):
    """
    Split a dataset into train and test parts.

    :param dataset: table indexable by column label(s); main() passes a pandas DataFrame
    :param train_percentage: forwarded to train_test_split as ``train_size``
        (main() passes the fraction 0.7)
    :param feature_headers: list of column labels used as model inputs
    :param target_header: single column label used as the prediction target
    :param random_state: forwarded to train_test_split; the default ``None``
        keeps the earlier unseeded behaviour, pass an int to get the same
        split on every run
    :return: tuple ``(train_x, test_x, train_y, test_y)``
    """
    features = dataset[feature_headers]
    target = dataset[target_header]

    # Only train_size is given; the test part is whatever remains.
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=train_percentage,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y

def main(seed=42):
    """
    Train an XGBoost classifier on 'Con_Data_1m_Cited_ZScore.txt' and print its evaluation.

    Steps: load the file, split it 70/30 into train and test parts, fit the
    classifier, then print the split shapes, the fitted model, train/test
    accuracy, the confusion matrix, feature importances and the
    classification report.

    :param seed: value used to seed NumPy's global random generator before the
        split so repeated runs produce the same train/test partition; pass
        ``None`` to let NumPy reseed itself (non-repeatable)
    :return: None
    """
    # Local import: numpy is only needed here, for seeding.
    import numpy as np

    # Load the tab-separated file in chunks and stitch the chunks together.
    data_path = DATA_DIR + 'Con_Data_1m_Cited_ZScore.txt'
    try:
        # pandas >= 1.3: 'warn' skips malformed rows and reports them, which
        # is what error_bad_lines=False did with its default warn_bad_lines.
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                on_bad_lines='warn', chunksize=CHUNK_SIZE)
    except TypeError:
        # Older pandas rejects on_bad_lines as an unknown keyword; use the
        # legacy spelling (removed in pandas 2.0).
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                error_bad_lines=False, chunksize=CHUNK_SIZE)
    dataset = pd.concat(df_reader, ignore_index=True)

    # train_test_split draws from NumPy's global generator when it is given
    # no random_state, so seeding that generator makes the split repeatable.
    np.random.seed(seed)

    # Features are the first 20 names in HEADERS (through "AGE" in this cell's
    # list); the name at index 20 ("CITED_CLASS") is not used as a feature and
    # the target is the name at index 21 ("CITE_CLASS_SCORE").
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[0:20], HEADERS[21])

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)

    # Fit an XGBoost classifier with 1000 estimators
    trained_model = XGBClassifier(n_estimators=1000)
    trained_model.fit(train_x, train_y)
    print(trained_model)
    predictions = trained_model.predict(test_x)

    # Train and Test Accuracy
    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix ", confusion_matrix(test_y, predictions))

    # Feature importances, one row per feature column, largest first
    importance_table = pd.DataFrame(trained_model.feature_importances_,
                                    index=train_x.columns,
                                    columns=['importance'])
    print("\nFeature Importance :: \n", importance_table.sort_values('importance', ascending=False))

    # Classification report on the held-out test part
    print("\nClassification Report :: \n", classification_report(test_y, predictions))


if __name__ == "__main__":
    main()

Train_x Shape ::  (700000, 20)
Train_y Shape ::  (700000,)
Test_x Shape ::  (300000, 20)
Test_y Shape ::  (300000,)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Train Accuracy ::  0.738451428571
Test Accuracy  ::  0.731936666667
 Confusion matrix  [[83408 23480  2225  1705]
 [ 7939 88378  6795  3921]
 [ 1102 18189 26524  6546]
 [  278  4319  3920 21271]]

Feature Importance :: 
                              importance
CON_SERIES_CITE_COUNT          0.101950
FIRST_AUTHOR_CITE_COUNT        0.099713
CON_SERIES_PAPER_COUNT         0.090729
LAST_AUTHOR_CITE_COUNT         0.085474
FIRST_AUTHOR_PAPER_COUNT       0.084318
LAST_AUTHOR_PAPER_COUNT        0.07

In [5]:
# Column names used to pick the feature columns and the target column.
# main() slices this list by position, so the order of the entries matters;
# the trailing comments give the list index of the first entry on each line.
HEADERS = [
    "FIRST_AUTHOR_RANK", "FIRST_AUTHOR_PAPER_COUNT", "FIRST_AUTHOR_CITE_COUNT",                 # 0
    "LAST_AUTHOR_RANK", "LAST_AUTHOR_PAPER_COUNT", "LAST_AUTHOR_CITE_COUNT",                    # 3
    "LAST_AUTHOR_ORG_RANK", "LAST_AUTHOR_ORG_PAPER_COUNT", "LAST_AUTHOR_ORG_CITE_COUNT",        # 6
    "CON_SERIES_RANK", "CON_SERIES_PAPER_COUNT", "CON_SERIES_CITE_COUNT",                       # 9
    "CON_INSTANCE_RANK", "CON_INSTANCE_PAPER_COUNT", "CON_INSTANCE_CITE_COUNT",                 # 12
    "FOS_ID_HIGHEST", "FOS_LEVEL_HIGHEST", "FOS_PAPER_COUNT_HIGHEST", "FOS_CITE_COUNT_HIGHEST", # 15
    "AGE",                                                                                      # 19
    "CITED_CLASS",                                                                              # 20
    "CITE_CLASS_SCORE",                                                                         # 21
]

def split_dataset(dataset, train_percentage, feature_headers, target_header, random_state=None):
    """
    Split a dataset into train and test parts.

    :param dataset: table indexable by column label(s); main() passes a pandas DataFrame
    :param train_percentage: forwarded to train_test_split as ``train_size``
        (main() passes the fraction 0.7)
    :param feature_headers: list of column labels used as model inputs
    :param target_header: single column label used as the prediction target
    :param random_state: forwarded to train_test_split; the default ``None``
        keeps the earlier unseeded behaviour, pass an int to get the same
        split on every run
    :return: tuple ``(train_x, test_x, train_y, test_y)``
    """
    features = dataset[feature_headers]
    target = dataset[target_header]

    # Only train_size is given; the test part is whatever remains.
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=train_percentage,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y

def main(seed=42):
    """
    Train an XGBoost classifier on 'Con_Data_1m_Score_Sample.txt' and print its evaluation.

    Steps: load the file, split it 70/30 into train and test parts, fit the
    classifier, then print the split shapes, the fitted model, train/test
    accuracy, the confusion matrix, feature importances and the
    classification report.

    :param seed: value used to seed NumPy's global random generator before the
        split so repeated runs produce the same train/test partition; pass
        ``None`` to let NumPy reseed itself (non-repeatable)
    :return: None
    """
    # Local import: numpy is only needed here, for seeding.
    import numpy as np

    # Load the tab-separated file in chunks and stitch the chunks together.
    data_path = DATA_DIR + 'Con_Data_1m_Score_Sample.txt'
    try:
        # pandas >= 1.3: 'warn' skips malformed rows and reports them, which
        # is what error_bad_lines=False did with its default warn_bad_lines.
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                on_bad_lines='warn', chunksize=CHUNK_SIZE)
    except TypeError:
        # Older pandas rejects on_bad_lines as an unknown keyword; use the
        # legacy spelling (removed in pandas 2.0).
        df_reader = pd.read_csv(data_path, dtype='float', sep='\t',
                                error_bad_lines=False, chunksize=CHUNK_SIZE)
    dataset = pd.concat(df_reader, ignore_index=True)

    # train_test_split draws from NumPy's global generator when it is given
    # no random_state, so seeding that generator makes the split repeatable.
    np.random.seed(seed)

    # Features are the first 20 names in HEADERS (through "AGE" in this cell's
    # list); the name at index 20 ("CITED_CLASS") is not used as a feature and
    # the target is the name at index 21 ("CITE_CLASS_SCORE").
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[0:20], HEADERS[21])

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)

    # Fit an XGBoost classifier with 1000 estimators
    trained_model = XGBClassifier(n_estimators=1000)
    trained_model.fit(train_x, train_y)
    print(trained_model)
    predictions = trained_model.predict(test_x)

    # Train and Test Accuracy
    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix ", confusion_matrix(test_y, predictions))

    # Feature importances, one row per feature column, largest first
    importance_table = pd.DataFrame(trained_model.feature_importances_,
                                    index=train_x.columns,
                                    columns=['importance'])
    print("\nFeature Importance :: \n", importance_table.sort_values('importance', ascending=False))

    # Classification report on the held-out test part
    print("\nClassification Report :: \n", classification_report(test_y, predictions))


if __name__ == "__main__":
    main()

Train_x Shape ::  (700000, 20)
Train_y Shape ::  (700000,)
Test_x Shape ::  (300000, 20)
Test_y Shape ::  (300000,)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
Train Accuracy ::  0.713591428571
Test Accuracy  ::  0.709293333333
 Confusion matrix  [[91281  4644  8639  6993]
 [11583 32889 10558  7679]
 [ 7147  5987 35091 14619]
 [ 2493   151  6719 53527]]

Feature Importance :: 
                              importance
FIRST_AUTHOR_CITE_COUNT        0.106385
CON_SERIES_CITE_COUNT          0.091906
LAST_AUTHOR_CITE_COUNT         0.088806
CON_SERIES_PAPER_COUNT         0.084420
FIRST_AUTHOR_PAPER_COUNT       0.079959
LAST_AUTHOR_PAPER_COUNT        0.07