## Construction des features et Word embedding

In [2]:
import time
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### Lecture des données

In [3]:
data_train = pd.read_csv("../tweet_data/train.csv").fillna("")
data_train_clean_stem = pd.read_csv("../tweet_data/train_clean_stem.csv").fillna("")

In [4]:
# Colonne "text" en matrices pour l'extraction de features
text_train_array = data_train_clean_stem["text_clean_lem"].values
label_train_array = data_train_clean_stem["sentiment"].values

### TF IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(ngram_range=(1,1), norm = False)
data_train_TFIDF = vec.fit_transform(text_train_array)

In [6]:
# vocabulaire extrait des features
vocabulary = vec.get_feature_names()
N_vocabulary = len(vocabulary)
N_vocabulary

26283

In [7]:
text_train_array

array(['spent entire morning meeting w vendor bos happy w lot fun plan morning',
       'oh good idea putting ice cream',
       'say good say bad afternoon httpplurkcompwxpdj', ...,
       'playing sudoku mommy make breakfast amp lunch',
       'see u bye see u love hot', 'ha ha game like game'], dtype=object)

### Features with tensorflow tokenizer

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(text_train_array)
word_index = tokenizer.word_index
print(len(word_index))

26308


In [10]:
sequences = tokenizer.texts_to_sequences(text_train_array)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)

[ 587 1083   42  502  189 9273 1162   17  189  122   48  363   42    0
    0    0    0    0    0    0    0    0    0]
(27486, 23)


In [11]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(label_train_array)
label_word_index = label_tokenizer.word_index
label_seq = label_tokenizer.texts_to_sequences(label_train_array)
print(label_seq)
print(label_word_index)

[[1], [2], [1], [3], [2], [3], [2], [2], [2], [1], [1], [1], [3], [2], [1], [3], [3], [3], [3], [1], [1], [3], [3], [2], [2], [1], [2], [1], [1], [2], [1], [1], [2], [1], [2], [3], [3], [2], [3], [1], [2], [2], [1], [3], [1], [3], [1], [2], [3], [2], [1], [2], [3], [1], [2], [1], [3], [3], [3], [3], [3], [1], [2], [3], [1], [3], [3], [2], [1], [2], [1], [1], [1], [3], [1], [1], [1], [1], [3], [1], [2], [1], [2], [1], [2], [2], [1], [2], [2], [2], [1], [1], [1], [2], [1], [1], [3], [2], [1], [1], [3], [1], [3], [1], [2], [1], [2], [2], [1], [2], [1], [1], [3], [1], [2], [3], [3], [1], [3], [1], [1], [1], [3], [3], [1], [1], [2], [2], [2], [3], [1], [3], [1], [2], [1], [1], [3], [1], [1], [1], [1], [2], [3], [2], [1], [3], [3], [2], [3], [2], [2], [1], [1], [2], [2], [1], [3], [2], [2], [1], [2], [1], [3], [2], [2], [1], [1], [2], [1], [2], [1], [2], [1], [1], [1], [1], [3], [1], [1], [2], [1], [1], [3], [2], [2], [1], [1], [1], [2], [2], [1], [1], [2], [3], [2], [3], [3], [3], [3], [2],