In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


train = pd.read_csv('/Users/xianglongtan/Desktop/cmps242/week6/train_clean.csv',encoding='ISO-8859-1')
test = pd.read_csv('/Users/xianglongtan/Desktop/cmps242/week6/test_clean.csv',encoding='ISO-8859-1')

# preprocess data

In [2]:
# tokenization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
filter_tweet = [tweet for tweet in train.tweet.values]
word_token = [word_tokenize(tweet) for tweet in filter_tweet]
filter_words = []
for word in word_token:
    filter_words.append([w for w in word if not w in stop_words])
filter_tweet = []
for word in filter_words:
    filter_tweet.append(' '.join(word))

In [3]:
# process label
train[['D','H']] = pd.get_dummies(train['handle'])

In [4]:
test_ = [tweet for tweet in test.tweet.values]
test_ = [word_tokenize(tweet) for tweet in test_]
filter_test = []
for word in test_:
    filter_test.append([w for w in word if not w in stop_words])
filter_tweet_test = []
for word in filter_test:
    filter_tweet_test.append(' '.join(word))

# tfidf transform

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
train_token = vectorizer.fit_transform(filter_tweet)
transformer = TfidfTransformer(smooth_idf = True)
tfidf_train_token = transformer.fit_transform(train_token)
tfidf_train_token = tfidf_train_token.toarray()
colname = vectorizer.get_feature_names()
df_token = pd.DataFrame(data=tfidf_train_token,columns=colname)




In [6]:
test_token = vectorizer.transform(filter_tweet_test)
tfidf_test_token = transformer.transform(test_token)
tfidf_test_token = tfidf_test_token.toarray()
colname = vectorizer.get_feature_names()
df_token_test = pd.DataFrame(data=tfidf_test_token,columns=colname)

# PCA

In [7]:
# Standardizer
from sklearn import preprocessing
standardScaler = preprocessing.StandardScaler(with_mean=False)
standard_df_token = standardScaler.fit_transform(df_token)
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2000)
df_token_pca=pca.fit_transform(standard_df_token)
pca.explained_variance_ratio_.sum()

0.7597461879639229

In [8]:
standard_test_token = standardScaler.transform(df_token_test)
test_token_pca = pca.transform(standard_test_token)
pca.explained_variance_ratio_.sum()

0.7597461879639229

(1444, 3000)

# Train with KNN

In [7]:
X = df_token_pca
Y = 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train,X_test,y_train,y_test = train_test_split(X,Y)
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
prediction = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test,prediction)




In [8]:
print(accuracy)

0.892


# Cross Validation

In [11]:
param_grid = [{'n_neighbors':range(5,20,1)},
             {'weights':['uniform','distance']}]
model = KNeighborsClassifier()
seed=4
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits = 10, shuffle=True,random_state = seed)
grid_search = GridSearchCV(model, param_grid, scoring = 'accuracy', cv=kfold)
start = time.time()
grid_result = grid_search.fit(X,Y)
end = time.time()
print(end-start)



8587.638632774353


In [12]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean,stdev,param in zip(means, stds, params):
    print('%f (%f) with: %r'%(mean,stdev,param))

0.806200 (0.009703) with: {'n_neighbors': 5}
0.807400 (0.017025) with: {'n_neighbors': 6}
0.801400 (0.017340) with: {'n_neighbors': 7}
0.790200 (0.025715) with: {'n_neighbors': 8}
0.787200 (0.024170) with: {'n_neighbors': 9}
0.767200 (0.027356) with: {'n_neighbors': 10}
0.766000 (0.022420) with: {'n_neighbors': 11}
0.752400 (0.025580) with: {'n_neighbors': 12}
0.751200 (0.027852) with: {'n_neighbors': 13}
0.740000 (0.025782) with: {'n_neighbors': 14}
0.736600 (0.024555) with: {'n_neighbors': 15}
0.725200 (0.024638) with: {'n_neighbors': 16}
0.722800 (0.026580) with: {'n_neighbors': 17}
0.707800 (0.024412) with: {'n_neighbors': 18}
0.702600 (0.025899) with: {'n_neighbors': 19}
0.806200 (0.009703) with: {'weights': 'uniform'}
0.808800 (0.009962) with: {'weights': 'distance'}


In [13]:
print('best score: ',grid_search.best_score_,'\nbest param:',grid_result.best_params_)

best score:  0.8088 
best param: {'weights': 'distance'}


# predict testset

In [9]:
X_test = test_token_pca# pca
#X_test = df_token_test# non pca
X_test.shape
#X = df_token# non pca
X = df_token_pca# pca
Y = train.H
X.shape

(5000, 2000)

In [13]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(weights='distance')
model = knn.fit(X,Y)
result = model.predict_proba(X_test)


In [14]:
result

array([[0.8083026 , 0.1916974 ],
       [0.80153077, 0.19846923],
       [0.19704665, 0.80295335],
       ...,
       [0.3994271 , 0.6005729 ],
       [1.        , 0.        ],
       [0.15225916, 0.84774084]])

In [11]:
test_pred = pd.DataFrame(data=result,columns=['realDonaldTrump'])


In [12]:
test_pred['HillaryClinton'] = 1- test_pred['realDonaldTrump']

In [13]:
test_pred.index.name = 'id'
test_pred.to_csv('HW5_KNN.csv')

In [15]:
result_train = model.predict_proba(X)

In [15]:
train['predict'] = result_train

In [16]:
print(result_train[0:5,:])
train.head()

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]


Unnamed: 0,handle,tweet,D,H
0,realDonaldTrump,"In Tampa, Florida- thank you to all of our out...",0,1
1,realDonaldTrump,Poll: <mention> realDonaldTrump vs. <mention> ...,0,1
2,HillaryClinton,Obama on whether Trump could be trusted with U...,1,0
3,HillaryClinton,"""Hillary Clinton has never quit on anything in...",1,0
4,realDonaldTrump,I LOVE NEW YORK! <hashtag> NewYorkValues <url>,0,1


In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(result_train, train.H)

0.9998

In [18]:
result2 = model.predict_proba(X_test)
test_pred2 = pd.DataFrame(data=result2,columns=['HillaryClinton','realDonaldTrump'])
#test_pred2['HillaryClinton'] = 1- test_pred2['realDonaldTrump']
test_pred2.index.name = 'id'
test_pred2.to_csv('HW5_KNN2.csv')