# Setup

In [1]:
%load_ext autoreload
%autoreload 2

# standard library
import datetime
import os
import sys
import time

# other
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# change path if necessary
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project2'

# The shared code directory has to be on sys.path BEFORE the project-local
# imports below run; otherwise a fresh kernel cannot resolve them through it.
# (presumably the three helper modules live in code/COMMON -- confirm)
sys.path.insert(0,my_path + r'/code/COMMON')

# model
from dictionary_helpers import build_glove_dict
from tweet_processings import build_tweet_vector

# submission
from create_csv_submission import create_csv_submission

# Load Glove model from Stanford

In [3]:
# GloVe data directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'twitter_datasets_stanford'))

In [4]:
# filename of the embeddings file, resolved relative to the current working
# directory (set by the preceding os.chdir cell)
filename_glove_dict = 'glove.twitter.27B.100d.txt'

# build glove embeddings dictionary with the project helper build_glove_dict;
# the result is later passed to build_tweet_vector for every tweet
# (presumably a word -> vector mapping -- confirm in dictionary_helpers)
glove = build_glove_dict(filename_glove_dict)

# Load stop words

In [5]:
# stop-word directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'stop_words'))

In [6]:
# stop-word file, looked up in the current working directory
filename_stopwords = 'stop_word_freq_min_100_ratio_marg_0.1.txt'

# keep the first whitespace-separated token of every line
with open(filename_stopwords, 'r', encoding='utf-8-sig') as f:
    stop_words = [line.lstrip().split()[0] for line in f]
    # the entry coming from the last line is discarded
    # NOTE(review): presumably the file ends with a non-word line -- confirm
    stop_words.pop()

print("File :", filename_stopwords)
print("Number of stop words :", len(stop_words))

File : stop_word_freq_min_100_ratio_marg_0.1.txt
Number of stop words : 630


# Load tfidf 

In [7]:
# tf-idf directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'tfidf'))

In [8]:
# tf-idf weights file, looked up in the current working directory
filename_tfidf = 'tfidf.txt'

# map every word to its [tf, idf] pair
tfidf = {}
with open(filename_tfidf, 'r', encoding='utf-8-sig') as f:
    # the first line holds the column headers
    next(f)
    for line in f:
        # columns: word, tf, idf (whitespace separated)
        fields = line.strip().split()
        word, tf, idf = fields[0], float(fields[1]), float(fields[2])
        tfidf[word] = [tf, idf]

# Define tweet vector method

In [None]:
# code directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'code', 'tom'))

In [None]:
# method to build tweet vector ("mean" and "tfidf" are the values named here);
# the list is forwarded unchanged as the last argument of build_tweet_vector
# (presumably "mean" averages word vectors and "tfidf" weights them -- verify
# in tweet_processings)
method = ["mean"]

# Build tweet vectors TRAIN
Use the full tweet collection for building the final classifier.

In [None]:
# full training-set directory, derived from the project root `my_path` defined
# in the Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'twitter_datasets_epfl', 'full'))

In [None]:
# one feature vector per positive training tweet
X_pos = []
with open('train_pos_processed.txt') as pos_file:
    for raw_tweet in pos_file:
        tokens = raw_tweet.lstrip().split()
        vec = build_tweet_vector(tokens, glove, tfidf, stop_words, method)
        # tweets for which an empty vector comes back are left out
        if len(vec) == 0:
            continue
        X_pos.append(vec)

# stack the collected vectors into one array
X_pos = np.array(X_pos)

In [None]:
# one feature vector per negative training tweet
X_neg = []
with open('train_neg_processed.txt') as neg_file:
    for raw_tweet in neg_file:
        tokens = raw_tweet.lstrip().split()
        vec = build_tweet_vector(tokens, glove, tfidf, stop_words, method)
        # tweets for which an empty vector comes back are left out
        if len(vec) == 0:
            continue
        X_neg.append(vec)

# stack the collected vectors into one array
X_neg = np.array(X_neg)

In [None]:
# targets: +1 for every positive row, -1 for every negative row
y_pos = np.ones(len(X_pos))
y_neg = np.ones(len(X_neg)) * -1

In [None]:
# number of training samples kept per class.
# None keeps every sample; the previous value -1 was used as a slice stop and
# therefore silently dropped the last positive and the last negative tweet.
N_samples_train = None

# cut samples
X_pos_cut = X_pos[:N_samples_train,:]
X_neg_cut = X_neg[:N_samples_train,:]

# cut targets
y_pos_cut = y_pos[:N_samples_train]
y_neg_cut = y_neg[:N_samples_train]

# concatenate (positives first, then negatives, same order for X and y)
X_pos_neg = np.concatenate([X_pos_cut, X_neg_cut])
y_pos_neg = np.concatenate([y_pos_cut, y_neg_cut])

# Build tweet vectors TEST

In [None]:
import os
# test-set directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'twitter_datasets_epfl', 'short'))

In [None]:
# Build one feature row per test tweet.
# Unlike the training sets, no tweet may be dropped here: the submission ids
# are assigned sequentially, so a skipped tweet would shift every later
# prediction onto the wrong id.
X_test = []
n_empty = 0

with open('test_data_no_id_processed.txt') as f:
    for line in f:
        tweet = line.lstrip().split()
        tweet_vector = build_tweet_vector(tweet, glove, tfidf, stop_words, method)
        if len(tweet_vector):
            X_test.append(tweet_vector)
        else:
            # placeholder, replaced by a zero vector once the dimension is known
            X_test.append(None)
            n_empty += 1

if n_empty:
    # take the feature dimension from the first tweet that produced a vector
    dim = next((len(v) for v in X_test if v is not None), None)
    if dim is None:
        raise ValueError("no test tweet produced a feature vector")
    X_test = [np.zeros(dim) if v is None else v for v in X_test]
    print("Test tweets without a feature vector (zero-filled) :", n_empty)

X_test = np.array(X_test)
print(X_test.shape)

# Standardization

In [None]:
# set to "True" to standardize: the flag is read by the standardization cell,
# which then fits a StandardScaler on X_pos_neg and applies it to both
# X_pos_neg and X_test
ifStandardize = False

In [None]:
from sklearn.preprocessing import StandardScaler

# optional feature scaling, switched on by the ifStandardize flag
if ifStandardize:
    # the scaler is fitted on the training matrix only and then reused as-is
    # for the test matrix
    scaler = StandardScaler().fit(X_pos_neg)
    X_pos_neg, X_test = scaler.transform(X_pos_neg), scaler.transform(X_test)

# K-nearest neighbors model

In [None]:
# number of neighbours given to the classifier
K = 100

# create the classifier and train it on the labelled tweet vectors
clf = KNeighborsClassifier(n_neighbors=K).fit(X_pos_neg, y_pos_neg)

# label the test tweets
y_pred = clf.predict(X_test)

# Submission

In [None]:
# submissions directory, derived from the project root `my_path` defined in the
# Setup cell so the root only has to be adapted in one place
os.chdir(os.path.join(my_path, 'data', 'submissions'))

In [None]:
# output file name: sub_<day>_<month>_<year>_<hour>h_<minute>min
# (a single clock read, so the date and time parts cannot disagree)
now = datetime.datetime.now()
name = "sub_" + now.strftime("%d_%m_%Y") +  "_%sh_%smin" % (now.hour, now.minute)

# one id per prediction, numbered from 1.
# The previous code read `test_arrays`, a name that is never defined in this
# notebook, and failed with NameError on a fresh kernel.
ids_test = range(1, len(y_pred) + 1)

# write submission file
create_csv_submission(ids_test, y_pred, name)