# Text Classification Model with PyTorch

### 1. Import Related Libraries

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
from collections import Counter
import urllib
import tarfile


# Fix the seed of PyTorch's random number generator (intended for run-to-run reproducibility)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f270004ed30>

### 2. Download Dataset

In [2]:
def download_imdb(data_dir):
    """Download and unpack the IMDB review archive into ``data_dir``.

    Each step is skipped when its result is already on disk: the archive is
    fetched only if ``aclImdb_v1.tar.gz`` is missing, and it is extracted only
    if the ``aclImdb`` directory is missing.

    Args:
        data_dir: Directory that receives the archive and the extracted
            ``aclImdb`` folder. It is created if it does not exist.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Create data directory
    os.makedirs(data_dir, exist_ok=True)

    url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    file_path = os.path.join(data_dir, "aclImdb_v1.tar.gz")

    # Download the IMDB archive
    if not os.path.exists(file_path):
        print("Downloading IMDB dataset ...")
        # Fetch under a temporary name and rename on success, so that an
        # interrupted download is never mistaken for a complete archive.
        partial_path = file_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
        print("Downoload Complete!")

    # Decompress the archive
    if not os.path.exists(os.path.join(data_dir, "aclImdb")):
        print("Extracting the dataset!")
        with tarfile.open(file_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would escape ``data_dir`` (absolute
                # paths, ``..`` components, links pointing outside).
                tar.extractall(path=data_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(path=data_dir)
        print("Extraction Complete!")
        
 #Download Data
# Populate the local dataset directory before the later cells read from it
data_dir = "data_dir/imdb"
download_imdb(data_dir)
 
        
    
    
                          

### 3. Load data

In [3]:
def load_imdb_data(data_dir, split='train'):
    """Read the labelled reviews of one dataset split from disk.

    Args:
        data_dir: Directory containing the extracted ``aclImdb`` folder.
        split: Name of the sub-folder of ``aclImdb`` to read. It must contain
            ``pos`` and ``neg`` directories. Defaults to ``'train'``.

    Returns:
        A ``(reviews, labels)`` pair of equal-length lists: the text of every
        ``.txt`` file, and ``1`` for files under ``pos`` or ``0`` for files
        under ``neg``. All ``pos`` reviews come before the ``neg`` reviews.

    Raises:
        FileNotFoundError: If a ``pos``/``neg`` directory does not exist.
    """
    reviews = []
    labels = []

    for label_type in ('pos', 'neg'):
        dir_name = os.path.join(data_dir, 'aclImdb', split, label_type)
        label = 1 if label_type == 'pos' else 0
        # os.listdir() yields entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split of it) reproducible.
        for fname in sorted(os.listdir(dir_name)):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as f:
                    reviews.append(f.read())
                labels.append(label)
    return reviews, labels

### 4. Preprocess data and build Vocabulary

In [4]:
# Preprocess text
def preprocess_text(text):
    """Remove punctuation from ``text`` and lower-case it.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; whitespace is left untouched.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r'[^\w\s]', '', text)
    # Return the string itself; the previous ``text()`` tried to call a str.
    return text.lower()

# Build vocabulary
def build_vocab(reviews, max_vocab_size=10000):
    """Build a word-to-index mapping from the most frequent words.

    Args:
        reviews: Iterable of strings; each is split on whitespace.
        max_vocab_size: Maximum number of distinct words to keep.

    Returns:
        A dict mapping each kept word to an index starting at 1, ordered by
        descending frequency, plus ``'<PAD>'`` mapped to 0.
    """
    word_counter = Counter()
    # Iterate the ``reviews`` argument (the old code read an undefined name).
    for review in reviews:
        word_counter.update(review.split())

    most_frequent = word_counter.most_common(max_vocab_size)
    # Indices start at 1 because 0 is reserved for the padding token.
    vocab = {word: index for index, (word, _) in enumerate(most_frequent, start=1)}
    vocab['<PAD>'] = 0  # padding token
    return vocab
# Map every whitespace-separated token of a text to its vocabulary index
def text_to_sequence(text,vocab):
    """Translate ``text`` into a list of vocabulary indices.

    Args:
        text: String to encode; it is split on whitespace.
        vocab: Mapping from word to integer index.

    Returns:
        One index per token, in order. Tokens missing from ``vocab`` are
        encoded as 0.
    """
    indices = []
    for token in text.split():
        indices.append(vocab.get(token, 0))
    return indices
# Bring a sequence to a fixed length by truncating or zero-padding it
def pad_sequence(seq,max_len):
    """Return ``seq`` cut or right-padded with zeros to ``max_len`` items.

    Args:
        seq: List of indices.
        max_len: Target length.

    Returns:
        The first ``max_len`` items when ``seq`` is long enough, otherwise
        ``seq`` followed by as many zeros as are missing.
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        return seq + [0] * shortfall
    return seq[:max_len]
   

### 5. Define the dataset

In [5]:
from torch.utils.data import Dataset

class IMBDataset(Dataset):
    """Dataset pairing each input sequence with its label.

    Args:
        x: Indexable collection of samples.
        y: Indexable collection of labels, one per sample.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths.
    """

    def __init__(self, x, y):
        # Fail fast: a mismatch would otherwise surface only as an IndexError
        # in the middle of an epoch, or silently drop trailing labels.
        if len(x) != len(y):
            raise ValueError(
                f"x and y must have the same length, got {len(x)} and {len(y)}"
            )
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

### 6. Define the TextCNN Model

In [6]:
class TextCNN(nn.Module):
    """Convolutional text classifier with several parallel kernel heights.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits.
        kernel_sizes: Heights (in tokens) of the convolution kernels; one
            convolution branch is created per entry.
        num_filters: Output channels of every convolution branch.
    """

    # The default is a tuple so it cannot be mutated and shared across instances.
    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=(3, 4, 5), num_filters=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Each kernel spans the full embedding width, so it slides only along
        # the token axis.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Compute class logits for a batch of index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``. ``seq_len`` must
                be at least the largest kernel size.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised
            scores.
        """
        x = self.embedding(x)   # (batch, seq_len, embed_dim)
        x = x.unsqueeze(1)      # add the single input channel Conv2d expects
        # The width axis collapses to 1 after each convolution; squeeze it
        # away (the old ``seqqueeze`` was a typo that raised AttributeError).
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max-pool over the whole remaining token axis: one value per filter.
        x = [torch.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        x = self.fc(x)
        return x
    

### 7. Define the model training, test, and prediction functions

In [11]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` and print the mean batch loss after every epoch.

    Args:
        model: Module to optimise; it is moved to ``device`` and put in
            training mode.
        train_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        criterion: Callable computing the loss from ``(outputs, batch_y)``.
        optimizer: Optimizer updating the parameters of ``model``.
        device: Device on which the batches and the model are placed.
        num_epochs: Number of passes over ``train_loader``.
    """
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        # Count batches while iterating so that loaders without ``__len__``
        # work and an empty loader cannot trigger a division by zero.
        num_batches = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            # ``loss.item()`` converts the scalar tensor to a Python float.
            total_loss += loss.item()
            num_batches += 1

        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Train-Epoch[{epoch+1}/{num_epochs}],Loss:{mean_loss:.4f}')
        
# Model Test Function
def test_model(model,test_loader, device):
    """Evaluate ``model`` on ``test_loader`` and print its accuracy.

    Args:
        model: Module to evaluate; it is moved to ``device`` and put in
            evaluation mode.
        test_loader: Iterable yielding ``(batch_x, batch_y)`` tensor pairs.
        device: Device on which the batches and the model are placed.
    """
    # Same placement as train_model, so this also works when called on its own.
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            # The predicted class is the index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        print('Test Accuracy:N/A (no samples)')
        return

    print(f'Test Accuracy:{100*correct/total:.2f}%')
    
#Sentiment Prediction Function
def predict_sentiment(text,model, vocab, max_len=200, device=None):
    """Classify a single piece of text as positive or negative.

    Args:
        text: Raw text to classify.
        model: Trained module returning one logit per class; it is switched
            to evaluation mode.
        vocab: Word-to-index mapping used to encode ``text``.
        max_len: Length to which the encoded sequence is cut or padded.
        device: Device for the input tensor. When ``None``, the device of the
            model's first parameter is used (CPU if it has no parameters).

    Returns:
        ``"pos"`` if the predicted class index is 1, otherwise ``"neg"``.
    """
    if device is None:
        # Follow the model instead of relying on a global ``device`` name.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    text = preprocess_text(text)
    # Encode the words first; this step was missing, so ``sequence`` was
    # read before it had been assigned.
    sequence = text_to_sequence(text, vocab)
    sequence = pad_sequence(sequence, max_len)
    sequence = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device)

    # Disable dropout so that the prediction is deterministic.
    model.eval()
    with torch.no_grad():
        output = model(sequence)
        _, predicted = torch.max(output, 1)
    return "pos" if predicted.item()==1 else "neg"
        


### 8. Train the model in the main program, test its accuracy on the test set, and save the model

In [None]:
#Main Program
if __name__ =="__main__":
    # Download and load
    data_dir="data_dir/imdb"
    download_imdb(data_dir)
    reviews, labels = load_imdb_data(data_dir)

    # Preprocess the data (loop variable fixed: it was misspelled ``reviw``)
    reviews = [preprocess_text(review) for review in reviews]
    vocab = build_vocab(reviews)
    # Name fixed: it was assigned as ``squences`` but read as ``sequences``
    sequences = [text_to_sequence(review, vocab) for review in reviews]

    # Pad the sequences and split the dataset
    max_len = 200
    x = [pad_sequence(seq, max_len) for seq in sequences]
    x = torch.tensor(x, dtype=torch.long)
    y = torch.tensor(labels, dtype=torch.long)

    # Hold out 20% of the samples for testing; the fixed seed keeps the split stable
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    
    