In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd



In [2]:
train = pd.read_csv('/Users/xianglongtan/Desktop/cmps242/week6/train_clean.csv',encoding='ISO-8859-1')
test = pd.read_csv('/Users/xianglongtan/Desktop/cmps242/week6/test_clean.csv',encoding='ISO-8859-1')

In [3]:
train.head()

Unnamed: 0,handle,tweet
0,realDonaldTrump,"In Tampa, Florida- thank you to all of our out..."
1,realDonaldTrump,Poll: <mention> realDonaldTrump vs. <mention> ...
2,HillaryClinton,Obama on whether Trump could be trusted with U...
3,HillaryClinton,"""Hillary Clinton has never quit on anything in..."
4,realDonaldTrump,I LOVE NEW YORK! <hashtag> NewYorkValues <url>


# Preprocessing Training Set

In [4]:
# tokenization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
filter_tweet = [tweet for tweet in train.tweet.values]
word_token = [word_tokenize(tweet) for tweet in filter_tweet]
filter_words = []
for word in word_token:
    filter_words.append([w for w in word if not w in stop_words])
filter_tweet = []
for word in filter_words:
    filter_tweet.append(' '.join(word))

In [5]:
# process label
train[['D','H']] = pd.get_dummies(train['handle'])

In [6]:
train.head()

Unnamed: 0,handle,tweet,D,H
0,realDonaldTrump,"In Tampa, Florida- thank you to all of our out...",0,1
1,realDonaldTrump,Poll: <mention> realDonaldTrump vs. <mention> ...,0,1
2,HillaryClinton,Obama on whether Trump could be trusted with U...,1,0
3,HillaryClinton,"""Hillary Clinton has never quit on anything in...",1,0
4,realDonaldTrump,I LOVE NEW YORK! <hashtag> NewYorkValues <url>,0,1


# tfidf transform

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
train_token = vectorizer.fit_transform(filter_tweet)
transformer = TfidfTransformer(smooth_idf = True)
tfidf_train_token = transformer.fit_transform(train_token)
tfidf_train_token = tfidf_train_token.toarray()
colname = vectorizer.get_feature_names()
df_token = pd.DataFrame(data=tfidf_train_token,columns=colname)


In [8]:
df_token.shape

(5000, 8410)

# PCA

In [12]:
# Standardizer
from sklearn import preprocessing
standardScaler = preprocessing.StandardScaler(with_mean=False)
standard_df_token = standardScaler.fit_transform(df_token)

In [13]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2000)
df_token_pca=pca.fit_transform(standard_df_token)
pca.explained_variance_ratio_.sum()

0.7597458477202623

# Train with xgboost

In [56]:
X = df_token_pca
Y = train.D

(5000, 3000)
(5000,)


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
seed = 7
test_size = 0.3
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=test_size,random_state=seed)

In [40]:
model = XGBClassifier(learning_rate = 0.001, max_depth = 20, colsample_bytree = 1.0, objective='binary:logistic')
import time
start = time.time()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
end = time.time()
print(end-start)


381.28466796875


In [68]:
print('test accuracy: ',(accuracy * 100.0))
print('train accuracy: ', accuracy_score(y_train, model.predict(X_train)))

test accuracy:  88.66666666666667
train accuracy:  0.9685714285714285


array([0, 1, 1, ..., 1, 0, 1], dtype=uint8)

# Cross Validation

In [104]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
model = XGBClassifier()
learning_rate = [0.0001, 0.001,0.01]
max_depth = [2,4,6]
subsample = [0.2, 0.4, 0.6, 0.8, 1.0]
colsample_bytree = [0.2, 0.4, 0.6, 0.8, 1.0]
n_estimator = range(100, 500, 50)

param_grid = dict(learning_rate = learning_rate,
                 max_depth = max_depth)
                 #subsample = subsample,
                 #colsample_bytree = colsample_bytree)
                 #n_estimator = n_estimator)

In [105]:

kfold = StratifiedKFold(n_splits = 10, shuffle=True,random_state = seed)
grid_search = GridSearchCV(model, param_grid, scoring = 'accuracy', cv=kfold)
start = time.time()
grid_result = grid_search.fit(X,Y)
end = time.time()
print(end-start)

KeyboardInterrupt: 

In [None]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean,stdev,param in zip(means, stds, params):
    print('%f (%f) with: %r'%(mean,stdev,param))

In [None]:
print('best score: ',grid_search.best_score_,'\nbest param:',grid_result.best_params_)

# predict testset

In [14]:
test_ = [tweet for tweet in test.tweet.values]
test_ = [word_tokenize(tweet) for tweet in test_]
filter_test = []
for word in test_:
    filter_test.append([w for w in word if not w in stop_words])
filter_tweet_test = []
for word in filter_test:
    filter_tweet_test.append(' '.join(word))
test_token = vectorizer.transform(filter_tweet_test)
tfidf_test_token = transformer.transform(test_token)
tfidf_test_token = tfidf_test_token.toarray()
colname = vectorizer.get_feature_names()
df_token_test = pd.DataFrame(data=tfidf_test_token,columns=colname)



In [15]:
# pca
standard_test_token = standardScaler.transform(df_token_test)
test_token_pca = pca.transform(standard_test_token)
pca.explained_variance_ratio_.sum()

0.7597458477202623

In [16]:
X_test = test_token_pca# pca
#X_test = df_token_test# non pca
X_test.shape
#X = df_token# non pca
X = df_token_pca# pca
Y = train.H
X.shape

(5000, 2000)

In [16]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
model = XGBClassifier(learning_rate = 0.001, max_depth = 10, colsample_bytree = 0.8,subsample = 0.8, objective='binary:logistic')
model.fit(X,Y)
pred = model.predict(X)
accuracy = accuracy_score(Y,pred)
#train_preds = bst.predict(X_train)
train_predictions = [round(value) for value in pred]
accuracy_score(Y, train_predictions)

0.9786

In [17]:
print(accuracy)
print(pred)

0.9786
[1 1 0 ... 0 0 0]


In [18]:
result = model.predict(X_test)
test_pred = pd.DataFrame(data=result,columns=['realDonaldTrump'])
test_pred['HillaryClinton'] = 1- test_pred['realDonaldTrump']
test_pred.index.name = 'id'
test_pred.to_csv('HW5_XGB.csv')

In [19]:
train['pred_D_p'] = pred
train['pred_D'] = train_predictions
train.head(30)

Unnamed: 0,handle,tweet,D,H,pred_D_p,pred_D
0,realDonaldTrump,"In Tampa, Florida- thank you to all of our out...",0,1,1,1
1,realDonaldTrump,Poll: <mention> realDonaldTrump vs. <mention> ...,0,1,1,1
2,HillaryClinton,Obama on whether Trump could be trusted with U...,1,0,0,0
3,HillaryClinton,"""Hillary Clinton has never quit on anything in...",1,0,0,0
4,realDonaldTrump,I LOVE NEW YORK! <hashtag> NewYorkValues <url>,0,1,1,1
5,HillaryClinton,"""In times like these, we need a president who ...",1,0,0,0
6,realDonaldTrump,Doesn't fit the MSM narrative - so they wont s...,0,1,1,1
7,HillaryClinton,BREAKING: Clinton is projected winner in U.S. ...,1,0,0,0
8,HillaryClinton,"This fall, USPS will honor joyous Hindu Festiv...",1,0,0,0
9,realDonaldTrump,HILLARY'S BAD TAX HABIT! <url>,0,1,1,1


In [18]:
import xgboost as xgb
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    #'num_class': 2,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}
plst = params.items()
dtrain =xgb.DMatrix(X,Y)
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds)
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)
ans.shape

(1444,)

In [20]:
X_train = xgb.DMatrix(X)
ans_train = model.predict(X_train)
train['pred_xgb2'] = ans_train
train.head()

Unnamed: 0,handle,tweet,D,H,pred_xgb2
0,realDonaldTrump,"In Tampa, Florida- thank you to all of our out...",0,1,0.998947
1,realDonaldTrump,Poll: <mention> realDonaldTrump vs. <mention> ...,0,1,0.990506
2,HillaryClinton,Obama on whether Trump could be trusted with U...,1,0,0.000164
3,HillaryClinton,"""Hillary Clinton has never quit on anything in...",1,0,0.000374
4,realDonaldTrump,I LOVE NEW YORK! <hashtag> NewYorkValues <url>,0,1,0.993226


In [21]:
test_pred2 = pd.DataFrame(data=ans,columns=['realDonaldTrump'])
test_pred2['HillaryClinton'] = 1- test_pred2['realDonaldTrump']
test_pred2.index.name = 'id'
test_pred2.to_csv('HW5_XGB2.csv')