# Experimentation for NLP HW1

In [2]:
%load_ext autoreload
%autoreload 2
from preprocessing import *
from speech import *
import numpy as np
from tqdm import tqdm
from supervised_experiments import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
fname = "speech.tar.gz"

### General submission code

In [8]:
# Fit the submission model and write the Kaggle prediction file.
# The vectorizer and the SVD are handed to Data.preprocess, which is defined in
# the project modules (not visible in this notebook).
# NOTE(review): neither TruncatedSVD nor LogisticRegression is given a
# random_state here, so scores may vary slightly between runs — confirm whether
# a fixed seed is wanted.
data = Data(fname)
data.preprocess(CountVectorizer(tokenizer = LemmaTokenizer(),
                                stop_words = "english", min_df = 2),
                svd = TruncatedSVD(n_components = 2500))
clf = LogisticRegression(solver = "saga", penalty = 'l2')
clf.fit(data.train_x, data.train_y)

# Train performance (previously computed but never displayed)
train_acc = accuracy_score(data.train_y, clf.predict(data.train_x))
ic(train_acc)

# Val performance
preds = clf.predict(data.val_x)
val_acc = accuracy_score(data.val_y, preds)
ic(val_acc)

ic("Reading unlabeled data")
unlabeled = read_unlabeled("data/" + fname, data)
print("Writing pred file")
write_pred_kaggle_file(unlabeled, clf, "speech-pred.csv", data)

# The calls below are kept for reference only: they cannot be run without the
# true labels of the unlabeled set.
# ic("Writing gold file")
# write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv")
# write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv")

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414
ic| val_acc: 0.4251207729468599
ic| 'Reading unlabeled data'
ic| unlabeled.X.shape: (43342, 2500)


Writing pred file


## Supervised Experiments

### Initial exploration

In [37]:
fname = "speech.tar.gz"

# Candidate vectorizers, in order: raw counts, + English stop-word removal,
# + lemmatizing tokenizer, then the same configuration with min_df = 2 .. 10.
preprocessors = [
    CountVectorizer(),
    CountVectorizer(stop_words="english"),
    CountVectorizer(stop_words="english", tokenizer=LemmaTokenizer()),
]
preprocessors.extend(
    CountVectorizer(stop_words="english", tokenizer=LemmaTokenizer(), min_df=min_count)
    for min_count in range(2, 11)
)

# One result per vectorizer, in the same order as `preprocessors`
# (the recorded output is a list of 12 integers — presumably feature counts).
d = dimensionality_exploration(fname, preprocessors)

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414


In [38]:
d

[7916, 7645, 7426, 3641, 2561, 2001, 1654, 1418, 1247, 1125, 993, 895]

In [3]:
fname = "speech.tar.gz"

# Named feature extractors for the ablation study. Keys encode the recipe:
# cv / tfidf = vectorizer class, _lemma = LemmaTokenizer, _stopw = English stop words.
# Insertion order is kept, so results come back in this order.
feat_list = dict(
    cv=CountVectorizer(),
    cv_lemma=CountVectorizer(tokenizer=LemmaTokenizer()),
    cv_stopw=CountVectorizer(stop_words="english"),
    cv_lemma_stopw=CountVectorizer(stop_words="english", tokenizer=LemmaTokenizer()),
    tfidf=TfidfVectorizer(),
    tfidf_lemma=TfidfVectorizer(tokenizer=LemmaTokenizer()),
    tfidf_stopw=TfidfVectorizer(stop_words="english"),
    tfidf_lemma_stopw=TfidfVectorizer(stop_words="english", tokenizer=LemmaTokenizer()),
)

### Feature ablation

In [8]:
# Run the feature ablation over every extractor in feat_list.
# The recorded result is a dict keyed by the feat_list names with one float per
# entry — presumably validation accuracy; confirm in supervised_experiments.
a1 = feature_ablation(fname, feat_list)

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or sca

In [9]:
a1

{'cv': 0.41304347826086957,
 'cv_lemma': 0.43719806763285024,
 'cv_stopw': 0.39855072463768115,
 'cv_lemma_stopw': 0.4227053140096618,
 'tfidf': 0.3743961352657005,
 'tfidf_lemma': 0.38164251207729466,
 'tfidf_stopw': 0.38164251207729466,
 'tfidf_lemma_stopw': 0.3743961352657005}

### Dimensionality Ablation

In [15]:
# Dimensionality ablation on lemmatized counts with norm = True.
# NOTE(review): what `norm` does is defined in Data.preprocess, not shown here.
data = Data(fname)
data.preprocess(feat_list["cv_lemma"], norm = True)
print(data.train_x.shape, data.val_x.shape, sep="\n")

# Sizes handed to dimensionality_ablation (presumably numbers of components — confirm).
comp_list = [100, 500, 1000, 1500, 2000, 2500, 3000, 4000, 5000]
a2 = dimensionality_ablation(data, comp_list)

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414


(4370, 7689)
(414, 7689)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [16]:
a2

{100: 0.2777777777777778,
 500: 0.32608695652173914,
 1000: 0.3357487922705314,
 1500: 0.33816425120772947,
 2000: 0.33816425120772947,
 2500: 0.33816425120772947,
 3000: 0.34299516908212563,
 4000: 0.34057971014492755,
 5000: 0.33816425120772947}

In [17]:
# Dimensionality ablation on lemmatized counts with norm = False.
# NOTE(review): what `norm` does is defined in Data.preprocess, not shown here.
data = Data(fname)
data.preprocess(feat_list["cv_lemma"], norm = False)
print(data.train_x.shape, data.val_x.shape, sep="\n")

# Sizes handed to dimensionality_ablation (presumably numbers of components — confirm).
comp_list = [100, 500, 1000, 1500, 2000, 2500, 3000, 4000, 5000]
a2_2 = dimensionality_ablation(data, comp_list)

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414


(4370, 7689)
(414, 7689)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
a2_2

{100: 0.30434782608695654,
 500: 0.37922705314009664,
 1000: 0.4106280193236715,
 1500: 0.4227053140096618,
 2000: 0.43719806763285024,
 2500: 0.4444444444444444,
 3000: 0.4396135265700483,
 4000: 0.4323671497584541,
 5000: 0.43719806763285024}

### Model ablation

In [1]:
# Solver x penalty grid for logistic regression on 2500 SVD components.
# This cell relies on names from the import cell at the top of the notebook
# (Data, TruncatedSVD, ...) and on `fname` / `feat_list`; running it on a fresh
# kernel without those cells raises NameError.
solvers = ["lbfgs", "liblinear", "saga", "newton-cg", "sag"]
penalties = ["l1", "l2", 'none']

data = Data(fname)
data.preprocess(feat_list["cv_lemma"], norm = False)

# Fit the projection on the training split only, then apply it to validation.
svd = TruncatedSVD(n_components = 2500)
data.train_x = svd.fit_transform(data.train_x)
data.val_x = svd.transform(data.val_x)

a3 = solver_pen_ablation(
    data.train_x, data.train_y,
    data.val_x, data.val_y,
    solvers, penalties,
)

NameError: name 'Data' is not defined

In [29]:
a3

{'l1': {'lbfgs': '-',
  'liblinear': 0.42995169082125606,
  'saga': 0.43478260869565216,
  'newton-cg': '-',
  'sag': '-'},
 'l2': {'lbfgs': 0.43478260869565216,
  'liblinear': 0.4323671497584541,
  'saga': 0.4420289855072464,
  'newton-cg': 0.4420289855072464,
  'sag': 0.4420289855072464},
 'none': {'lbfgs': 0.4057971014492754,
  'liblinear': '-',
  'saga': 0.45652173913043476,
  'newton-cg': 0.40096618357487923,
  'sag': 0.4396135265700483}}

In [15]:
# Reload the dataset into `speech` and call Data.preprocess() with no arguments,
# i.e. with whatever defaults the project's Data class defines.
# The commented-out block that follows counted the characters '-', '.', ',' and
# ';' per sentence and appended them as four extra feature columns.
ic("Reading data")
tarfname = "speech.tar.gz"
speech = Data(tarfname)
speech.preprocess()

# train_extra_data = np.zeros((len(speech.train_data), 4))
# val_extra_data = np.zeros((len(speech.val_data), 4))
# lkp_dict = {"-":0,
#             ".":1,
#            ",":2,
#            ";":3,
#            }

# for i, sentence in tqdm(enumerate(speech.train_data)):
#     for char in sentence.decode("utf-8"):
#         if char in lkp_dict:
#             train_extra_data[i,lkp_dict[char]] += 1

# print("Validation")
# for i, sentence in tqdm(enumerate(speech.val_data)):
#     for char in sentence.decode("utf-8"):
#         if char in lkp_dict:
#             val_extra_data[i,lkp_dict[char]] += 1
            
# print(speech.train_x.shape)

# speech.train_x = scipy.sparse.hstack((speech.train_x, csr_matrix(train_extra_data)))
# speech.val_x = scipy.sparse.hstack((speech.val_x, csr_matrix(val_extra_data)))
# print(speech.train_x.shape)
# print(speech.val_x.shape)


ic| 'Reading data'
ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414


In [16]:
print(speech.train_x.shape)

(4370, 7426)


In [None]:
# Persist val_extra_data to val_extra_data.pkl in the working directory.
# NOTE(review): in the visible notebook val_extra_data is only assigned inside a
# commented-out block, so this cell raises NameError unless that block is
# re-enabled and run first.
import pickle as pkl

with open("val_extra_data.pkl", "wb") as file:
    pkl.dump(val_extra_data, file)

In [None]:
# Without stopwords: plot the count of every vocabulary entry against its
# position in vocabulary_sorted.
# NOTE(review): vocabulary_sorted is assigned in a cell further down, so this
# cell fails on a fresh top-to-bottom run; an identical plotting cell appears
# after that definition.
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
plt.plot(range(len(vocabulary_sorted)), [item[1] for item in vocabulary_sorted])


In [None]:
# Rank the training vocabulary (English stop words removed) by total count.
speech.count_vect = CountVectorizer(stop_words = "english")#, tokenizer = LemmaTokenizer())
matrix = speech.count_vect.fit_transform(speech.train_data)
print(matrix.shape)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement when
# it exists and fall back to the old name on older installations.
if hasattr(speech.count_vect, "get_feature_names_out"):
    feature_names = speech.count_vect.get_feature_names_out()
else:
    feature_names = speech.count_vect.get_feature_names()

# Column sums of the document-term matrix give one total count per term.
freqs = zip(feature_names, matrix.sum(axis=0).tolist()[0])
# sort from largest to smallest
vocabulary_sorted = sorted(freqs, key=lambda x: -x[1])
print(type(vocabulary_sorted))

In [None]:
# Count how many distinct entries remain once every vocabulary term is lemmatized.
# The class is imported under its own name so that the `lemmatizer` instance no
# longer shadows the class it was created from.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
voc_l = [lemmatizer.lemmatize(term) for term, _count in vocabulary_sorted]
print(len(set(voc_l)))


In [None]:
# Without stopwords: total count of each vocabulary term, ordered from the most
# to the least frequent (vocabulary_sorted is already sorted that way).
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(range(len(vocabulary_sorted)), [count for _term, count in vocabulary_sorted])
ax.set(
    title="Term counts by rank (English stop words removed)",
    xlabel="Vocabulary rank",
    ylabel="Total count in training data",
)
plt.show()


## Unsupervised experiments

In [3]:
# Load the unlabeled sentences from the tarball under data/.
# This rebinds `data`, which earlier cells used for a Data(...) object.
# NOTE(review): the second argument is None here, while the submission cell
# passes a Data instance — confirm what read_unlabeled does in that case.
data = read_unlabeled("data/" + 
                      fname, None)

In [7]:
print(len(data.data))
print(data.data[0])

43342
b'must always seek to protect our national security by aggressively gathering intelligence in accordance with proven methods. Yet we cannot'


In [44]:
# Vectorizer used only as a source of a tokenizer function: sentence_parser
# calls build_tokenizer() on it and never fits it.
# NOTE(review): build_tokenizer() presumably returns just the token-splitting
# function, in which case stop_words = "english" has no effect on the parsed
# sentences — confirm against the scikit-learn documentation.
tokenizer = CountVectorizer(stop_words = "english",
#                             tokenizer = LemmaTokenizer(),
                           )
                            
def sentence_parser(sentences,tokenizer, lemmatizer):
    """Tokenize, lowercase and lemmatize every sentence.

    Args:
        sentences: iterable of sentences, each either UTF-8 encoded bytes or str.
        tokenizer: object exposing ``build_tokenizer()`` that returns a callable
            mapping a string to a sequence of tokens (a CountVectorizer in this
            notebook).
        lemmatizer: object exposing ``lemmatize(word)``.

    Returns:
        A list holding, for each input sentence, the list of its lowercased,
        lemmatized tokens.
    """
    # Keep the vectorizer and the function it builds under separate names.
    split_into_tokens = tokenizer.build_tokenizer()

    res_sentences = []
    for s in tqdm(sentences):
        # Accept already-decoded text as well as raw bytes.
        text = s.decode('utf-8') if isinstance(s, (bytes, bytearray)) else s
        # Lowercase BEFORE lemmatizing: a case-sensitive lemmatizer would
        # otherwise return capitalized tokens (e.g. at sentence start) unchanged.
        res_sentences.append(
            [lemmatizer.lemmatize(w.lower()) for w in split_into_tokens(text)])
    return res_sentences
    

In [45]:
data.parsed_unlabeled_data = sentence_parser(data.data, tokenizer, WordNetLemmatizer())

100%|██████████| 43342/43342 [00:04<00:00, 9368.89it/s]


In [46]:
labeled_data = Data(fname)
data.parsed_labeled_data = sentence_parser(labeled_data.train_data, 
                                           tokenizer, 
                                           WordNetLemmatizer())
                                           

ic| '-- train data'
ic| member.name: 'train.tsv'
ic| len(self.train_data): 4370
ic| '-- val data'
ic| member.name: 'dev.tsv'
ic| len(self.val_data): 414
100%|██████████| 4370/4370 [00:00<00:00, 10393.85it/s]


### Vectorize input in preparation for Word2Vec

In [None]:
# Run gensim's glove2word2vec script helper, reading 'glove.txt' and writing
# 'word2vec.txt' (both relative to the working directory).
# NOTE(review): recent gensim releases mark this helper as deprecated in favour
# of KeyedVectors.load_word2vec_format(..., no_header=True) — confirm against
# the installed version.
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.txt'
word2vec_output_file = 'word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

In [1]:
import gensim.downloader as api
import gensim

In [3]:
# wv = api.load('word2vec-google-news-300')

In [5]:
wv['dog'].shape

(300,)