In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Read the tab-separated training file into a DataFrame.
data = pd.read_csv(
    'task1/train.tsv',
    sep='\t',
)

In [3]:
# Show summary statistics for the frame as a quick sanity check of the load.
data.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [4]:
# Extract the phrase text column and the sentiment label column as NumPy arrays.
phrases, y = (data[column].values for column in ('Phrase', 'Sentiment'))

In [5]:
def build_vocab(phrases):
    """Collect the distinct whitespace-separated tokens that occur in ``phrases``.

    Parameters
    ----------
    phrases : iterable of str
        Texts to scan; each one is tokenised with ``str.split()``.

    Returns
    -------
    list of str
        Every distinct token, in sorted order. Sorting makes a word's position
        (and therefore the column order of any vectors built from this
        vocabulary) identical on every run, instead of depending on set
        iteration order, which can change between interpreter sessions.
    """
    vocab = set()
    for phrase in phrases:
        vocab.update(phrase.split())
    return sorted(vocab)

# Vocabulary over all training phrases; a word's position in this list is used as its feature index.
vocab = build_vocab(phrases)

In [6]:
def text_2_vector(text, vocab):
    """Turn ``text`` into a bag-of-words count vector over ``vocab``.

    Parameters
    ----------
    text : str
        Phrase to encode; tokenised with ``str.split()``.
    vocab : sequence of str or dict of str to int
        Either the vocabulary list (a word's position is its index) or a
        ready-made ``{word: index}`` dict whose indices lie in
        ``range(len(vocab))``. Passing the dict lets a caller build the
        lookup once and reuse it for many texts.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)`` holding the count of each word.
        Words that are not in ``vocab`` are ignored rather than raising, so
        text containing unseen words can still be encoded.
    """
    if isinstance(vocab, dict):
        word_to_index = vocab
    else:
        # One pass to build a hash lookup replaces a linear list scan per token.
        word_to_index = {}
        for position, word in enumerate(vocab):
            # setdefault keeps the first position, matching list.index().
            word_to_index.setdefault(word, position)

    vector = np.zeros(len(vocab))
    for word in text.split():
        position = word_to_index.get(word)
        if position is not None:
            vector[position] += 1
    return vector

In [7]:
# Build the word -> column lookup once. Searching the vocabulary list with
# list.index() for every token is linear in the vocabulary size, and the
# original run of this cell was interrupted before it finished.
word_to_column = {word: column for column, word in enumerate(vocab)}

# Fill one preallocated (n_phrases, n_words) count matrix instead of stacking
# a Python list of per-phrase vectors (which briefly holds the data twice).
X = np.zeros((len(phrases), len(vocab)))
for row, phrase in enumerate(phrases):
    for word in phrase.split():
        X[row, word_to_column[word]] += 1

KeyboardInterrupt: 

In [None]:
# Append a constant column of ones so the matching weight row serves as the bias term.
bias_column = np.ones((X.shape[0], 1))
X_with_bias = np.concatenate((X, bias_column), axis=1)

In [None]:
def ont_hot(y):
    """One-hot encode an array of integer class labels.

    The number of classes is ``y.max() - y.min() + 1`` and each label is
    mapped to column ``label - y.min()``, so labels that do not start at 0
    (for example 1..5, or negative labels) are encoded correctly instead of
    indexing past, or wrapping around, the identity matrix.

    Parameters
    ----------
    y : array-like of int
        Class labels.

    Returns
    -------
    numpy.ndarray
        Float array with one extra trailing axis of size ``num_classes``;
        each entry along it is 1.0 at the label's column and 0.0 elsewhere.

    Raises
    ------
    ValueError
        If ``y`` is empty (the label range cannot be determined).
    """
    y = np.asarray(y)
    if y.size == 0:
        raise ValueError("ont_hot() needs at least one label")
    lowest = y.min()
    num_classes = y.max() - lowest + 1
    # Shift so the smallest label selects row 0 of the identity matrix.
    return np.eye(num_classes)[y - lowest]

In [None]:
# One-hot encode the sentiment labels so they can be compared with per-class probabilities.
ont_hot_y = ont_hot(y)

In [None]:
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    z : numpy.ndarray
        Scores laid out as (rows, classes); normalisation is over axis 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum does not change the result mathematically
    # but keeps np.exp from overflowing to inf (and inf / inf = nan) on large scores.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / exp_z.sum(axis=1, keepdims=True)

In [None]:
def propagation(X,y,W):
    """Softmax-regression forward pass plus gradient for one batch.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        One-hot targets with the same shape as ``np.dot(X, W)``.
    W : numpy.ndarray
        Weight matrix.

    Returns
    -------
    loss : float
        Cross-entropy averaged over the samples in the batch. This is the
        quantity whose gradient ``dw`` is; averaging over every matrix element
        instead would shrink the reported loss by the number of classes.
    dw : numpy.ndarray
        Gradient of ``loss`` with respect to ``W`` (same shape as ``W``).

    Raises
    ------
    ZeroDivisionError
        If ``X`` has no rows.
    """
    n = X.shape[0]
    scale = 1 / n

    z = np.dot(X,W)
    # Log-softmax via log-sum-exp: shifting by the row maximum prevents
    # overflow in exp, and working with log-probabilities directly avoids
    # log(0) = -inf (and 0 * -inf = nan) when a probability underflows.
    z = z - z.max(axis=1, keepdims=True)
    log_a = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    a = np.exp(log_a)

    loss = -np.sum(y * log_a) * scale
    dz = a - y
    dw = scale * np.dot(X.T, dz)
    return loss, dw

In [None]:
def train(X, y, epochs=1000, batch_size=10, lr=0.001, return_history=False):
    """Fit softmax-regression weights with mini-batch gradient descent.

    Weights are drawn from ``np.random.rand`` and the rows are reshuffled with
    ``np.random.permutation`` every epoch, so call ``np.random.seed(...)``
    beforehand for a repeatable run.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        One-hot targets, one row per sample; its second dimension sets the
        number of weight columns (previously hard-coded to 5).
    epochs : int
        Number of passes over the data.
    batch_size : int
        Number of rows per gradient step.
    lr : float
        Step size applied to each gradient.
    return_history : bool
        When True, return the list of every batch loss in place of only the
        last one. Defaults to False, which keeps the original return value.

    Returns
    -------
    w : numpy.ndarray
        Learned weights of shape ``(X.shape[1], y.shape[1])``.
    loss : float or None, or list of float
        Loss of the final batch (``None`` if no batch was processed), or the
        full per-batch loss history when ``return_history`` is True.
    """
    n_samples = X.shape[0]
    w = np.random.rand(X.shape[1], y.shape[1])
    loss_history = []
    # Predefine so epochs == 0 or an empty X returns cleanly instead of
    # failing on an unbound name.
    loss = None
    for _ in tqdm(range(epochs)):
        index = np.random.permutation(n_samples)
        for start in range(0, n_samples, batch_size):
            batch = index[start:start + batch_size]
            loss, dw = propagation(X[batch], y[batch], w)
            loss_history.append(loss)
            w = w - lr * dw
    if return_history:
        return w, loss_history
    return w, loss

In [14]:
# Inspect the design matrix; its final column is the appended constant bias feature.
X_with_bias

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

: 

In [15]:
# Train with batch_size=100000, leaving epochs and lr at train()'s defaults; the returned tuple is the cell output.
train(X_with_bias, ont_hot_y, batch_size=100000)

  0%|          | 0/1000 [00:00<?, ?it/s]