# Setup

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# model
from dictionary_helpers import build_glove_dict
from tweet_processings import build_tweet_vector
from sklearn.neighbors import KNeighborsClassifier

# hyperparameter optimization
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# submission 
from create_csv_submission import create_csv_submission
import time
import datetime

# other
import numpy as np 
import os
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

# change path if necessary
import sys
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2'
sys.path.insert(0,my_path + r'/code/COMMON')

# Load GloVe model from Stanford

In [None]:
# Move into the GloVe data directory. The location is derived from the
# project root `my_path` defined in the setup cell, so the root only has to
# be adapted in one place instead of being hard-coded again here.
os.chdir(os.path.join(my_path, 'data', 'twitter_datasets_stanford'))

In [None]:
# GloVe embeddings file; the relative name is resolved against the current
# working directory selected by the preceding os.chdir cell.
filename_glove_dict = 'glove.twitter.27B.100d.txt'

# Build the embeddings lookup with the project helper.
# NOTE(review): presumably maps each token to its embedding vector (100-d,
# judging by the file name) — confirm in dictionary_helpers.
glove = build_glove_dict(filename_glove_dict)

# Load stop words

In [None]:
# Move into the stop-word directory. The location is derived from the
# project root `my_path` defined in the setup cell, so the root only has to
# be adapted in one place instead of being hard-coded again here.
os.chdir(os.path.join(my_path, 'data', 'stop_words'))

In [None]:
# stop-word file, resolved against the current working directory
filename_stopwords = 'stop_word_freq_min_100_ratio_marg_0.1.txt'

# keep the first whitespace-separated token of every line
# ('utf-8-sig' transparently strips a leading byte-order mark)
with open(filename_stopwords, 'r', encoding='utf-8-sig') as f:
    stop_words = [line.lstrip().split()[0] for line in f]

# The entry coming from the last line of the file is discarded.
# NOTE(review): the reason is not visible here (possibly a trailing
# non-word line in the file) — confirm this does not drop a real stop word.
stop_words.pop()

print("File :", filename_stopwords)
print("Number of stop words :", len(stop_words))

# Load TF-IDF weights

In [None]:
# Move into the tf-idf directory. The location is derived from the project
# root `my_path` defined in the setup cell, so the root only has to be
# adapted in one place instead of being hard-coded again here.
os.chdir(os.path.join(my_path, 'data', 'tfidf'))

In [None]:
# tf-idf table, resolved against the current working directory
filename_tfidf = 'tfidf.txt'

# word -> [tf, idf], read from whitespace-separated columns 0, 1 and 2
tfidf = {}
with open(filename_tfidf, 'r', encoding='utf-8-sig') as f:
    next(f)  # the first line holds the column headers
    for line in f:
        # split once; a bare split() already ignores surrounding whitespace
        fields = line.split()
        word = fields[0]
        tf = float(fields[1])
        idf = float(fields[2])
        tfidf[word] = [tf, idf]

In [None]:
# Flag forwarded as the last argument of build_tweet_vector when the tweet
# vectors are built.
# NOTE(review): presumably True weights the word vectors with the tf-idf
# values loaded earlier — confirm in tweet_processings.
UseTfidf = False

# Build tweet vectors TRAIN
Use the short tweet collection for the hyper-parameter optimization.

In [None]:
# Move into the short tweet collection. The location is derived from the
# project root `my_path` defined in the setup cell, so the root only has to
# be adapted in one place instead of being hard-coded again here.
os.chdir(os.path.join(my_path, 'data', 'twitter_datasets_epfl', 'short'))

In [None]:
# collect one feature vector per positive tweet
pos_vectors = []
with open('train_pos_processed.txt') as f:
    for line in f:
        # tokens of the (already pre-processed) tweet
        tweet = line.split()
        tweet_vector = build_tweet_vector(tweet, glove, tfidf, stop_words, UseTfidf)
        # tweets that yield an empty vector are left out
        if len(tweet_vector) > 0:
            pos_vectors.append(tweet_vector)

# stack the kept vectors into an array
X_pos = np.array(pos_vectors)

In [None]:
# build negative tweet feature set
X_neg = []
with open('train_neg_processed.txt') as f:
    for line in f:
        # tokens of the (already pre-processed) tweet
        tweet = line.lstrip().split()
        # Pass the UseTfidf flag, exactly as the positive set does. The
        # previous argument `method` is not defined anywhere in the notebook
        # and raised a NameError on a fresh kernel (or silently picked up a
        # stale value, making the two classes inconsistent).
        tweet_vector = build_tweet_vector(tweet, glove, tfidf, stop_words, UseTfidf)
        # tweets that yield an empty vector are left out
        if len(tweet_vector):
            X_neg.append(tweet_vector)

# transform to an array
X_neg = np.array(X_neg)

In [None]:
# one label per sample: +1.0 for positive tweets, -1.0 for negative tweets
y_pos = np.ones(len(X_pos))
y_neg = np.full(len(X_neg), -1.0)

In [None]:
# number of training samples kept per class; -1 (or None) keeps all of them
N_samples_train = -1

# A negative slice stop counts from the end, so using -1 directly
# (X[:-1]) silently dropped the last sample of each class. Translate the
# "keep everything" sentinel into None, which slices up to the end.
n_keep = None if N_samples_train is None or N_samples_train < 0 else N_samples_train

# cut samples
X_pos_cut = X_pos[:n_keep, :]
X_neg_cut = X_neg[:n_keep, :]

# cut targets
y_pos_cut = y_pos[:n_keep]
y_neg_cut = y_neg[:n_keep]

# concatenate: positive samples first, then negative samples
X_pos_neg = np.concatenate([X_pos_cut, X_neg_cut])
y_pos_neg = np.concatenate([y_pos_cut, y_neg_cut])

# Hyperparameter optimization (K)

## Grid search and cross validation

In [None]:
# data
X = X_pos_neg
y = y_pos_neg

# Hold out 25% of the samples as evaluation set; the remaining 75% form the
# development set on which the cross-validated grid search is run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Candidate numbers of neighbors: 1 .. K_MAX inclusive.
# The previous `np.arange(1,)` evaluates to array([0]), i.e. the single,
# invalid candidate n_neighbors=0 (scikit-learn requires n_neighbors >= 1).
# NOTE(review): the intended upper bound is not recorded; 30 is a
# placeholder — adjust K_MAX as needed.
K_MAX = 30
K_range = np.arange(1, K_MAX + 1)

# Set the parameters by cross-validation
tuned_parameters = [{'n_neighbors': K_range}]

# define grid search CV (5 folds, candidates ranked by accuracy)
clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring= 'accuracy', verbose=1)

# fit for every parameters combinations in grid search CV
clf.fit(X_train, y_train)

## Display results

In [None]:
# best hyperparameters found by the grid search
print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# mean cross-validated score and a two-standard-deviation band per candidate
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

print()
print("Detailed classification report:", end="\n\n")
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.", end="\n\n")

# evaluate the fitted search object on the held-out samples
y_true = y_test
y_pred = clf.predict(X_test)
print(classification_report(y_true, y_pred), end="\n\n")

In [None]:
# mean cross-validated accuracy of every candidate K
scores = clf.cv_results_['mean_test_score']

# accuracy as a function of K, drawn through the explicit Axes interface
fig, ax = plt.subplots()
ax.plot(K_range, scores)
ax.set_xlabel('Number of Neighbors K')
ax.set_ylabel('Accuracy')
ax.grid()