In [1]:
import os
import torch
import collections
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

classes = ['World', 'Sports', 'Business', 'Sci/Tech']


In [2]:
df_train = pd.read_csv("/content/train (1).csv",header = None)
df_test = pd.read_csv("/content/test (1).csv" , header = None)

In [3]:
df_train.head(10)

Unnamed: 0,0,1,2
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
5,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...
6,3,Money Funds Fell in Latest Week (AP),AP - Assets of the nation's retail money marke...
7,3,Fed minutes show dissent over inflation (USATO...,USATODAY.com - Retail sales bounced back a bit...
8,3,Safety Net (Forbes.com),Forbes.com - After earning a PH.D. in Sociolog...
9,3,Wall St. Bears Claw Back Into the Black,"NEW YORK (Reuters) - Short-sellers, Wall Stre..."


In [4]:
df_train = df_train.rename(columns={0: 'class', 1: 'text' , 2:"text1"})
df_test = df_test.rename(columns={0: 'class', 1: 'text',2:"text1"})

In [5]:
df_train.head(10)

Unnamed: 0,class,text,text1
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
5,3,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...
6,3,Money Funds Fell in Latest Week (AP),AP - Assets of the nation's retail money marke...
7,3,Fed minutes show dissent over inflation (USATO...,USATODAY.com - Retail sales bounced back a bit...
8,3,Safety Net (Forbes.com),Forbes.com - After earning a PH.D. in Sociolog...
9,3,Wall St. Bears Claw Back Into the Black,"NEW YORK (Reuters) - Short-sellers, Wall Stre..."


In [6]:
df_train["texxt"] = df_train["text"] + df_train["text1"]
df_test["texxt"] = df_test["text"] + df_test["text1"]

In [7]:
df_train = df_train.drop(columns=["text","text1"])
df_test = df_test.drop(columns=["text","text1"])

In [8]:
train_dataset = df_train.to_numpy()
test_dataset = df_test.to_numpy()

In [9]:
train_dataset[0]

array([3,
       "Wall St. Bears Claw Back Into the Black (Reuters)Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."],
      dtype=object)

In [10]:
from torchtext.data.utils import get_tokenizer
import torchtext

In [11]:
tokenizer  = get_tokenizer("basic_english")
tokenizer("hi hello")

['hi', 'hello']

In [12]:
counter = {}
for label , line in train_dataset:
  for word in tokenizer(line):
    if word not in counter:
      counter[word] = 1
    counter[word] +=1

print(len(counter))


161728


In [13]:
vocab = torchtext.vocab.vocab(counter , min_freq = 1)
vocab_size = len(vocab)

print(f"vocab size is {vocab_size}")

vocab size is 161728


In [14]:
stoi = vocab.get_stoi() #stoi create index for each and every word in the vocab
def encoder(x):

  st = []

  for i in tokenizer(x):
    try:
      st.append(stoi[i])
    except:
      pass
  return st

print(encoder("i love you baby girl"))

[606, 3312, 321, 2301, 6089]


In [15]:
def to_bow(text):
  x = torch.zeros(vocab_size , dtype=torch.float32)
  for i in encoder(text):
    if i<vocab_size:
      x[i]+=1
  return x

to_bow("i love you baby girl")

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

corpus = ['I like hot dogs.',
        'The dog ran fast.',
        'Its hot outside.',]

X = vectorizer.fit_transform(corpus)
vectorizer.transform(["My dog is hot"]).toarray()

array([[1, 0, 0, 1, 0, 0, 0, 0, 0]])

In [17]:
from torch.utils.data import DataLoader

def bowify(x):
  return (
      torch.LongTensor([t[0]-1 for t in x]),
      torch.stack([to_bow(t[1]) for t in x])
  )

train_d = DataLoader(train_dataset , batch_size=16 , collate_fn = bowify , shuffle = True)
test_d = DataLoader(test_dataset , batch_size = 16 , collate_fn = bowify , shuffle = True)

In [18]:
train_d.dataset[:9]

array([[3,
        "Wall St. Bears Claw Back Into the Black (Reuters)Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."],
       [3,
        'Carlyle Looks Toward Commercial Aerospace (Reuters)Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'],
       [3,
        "Oil and Economy Cloud Stocks' Outlook (Reuters)Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."],
       [3,
        'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters)Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'],


In [19]:
model = torch.nn.Sequential(
    torch.nn.Linear(vocab_size ,  4),
    torch.nn.LogSoftmax(dim=1)
)
lr = 1e-5
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters() , lr = lr)

In [20]:
def cook(model,epoch = 1500):
  model.train()

  total_loss , count , i =0,0,0
  acc =0
  ac =[]
  for label , features in train_d:

    pred = model(features)
    loss = loss_fn(pred,label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss
    _,pred = torch.max(pred,1)
    acc +=(pred == label).sum()

    count +=len(label)


    if i%200 == 0:
      ac.append(acc.item()/count)

    if i>epoch:
      break

    i+=1

  return total_loss.item()/count , acc.item()/count , ac


In [24]:
loss , acc , ac= cook(model,epoch = 1000)
print(loss)
print("\n")
print(acc)
print("\n")
print(ac)

0.08251594497772034


0.8165543912175649


[0.875, 0.8121890547263682, 0.8128117206982544, 0.8121880199667221, 0.8154650436953808, 0.8164335664335665]
