This cell mounts Google Drive in the Colab notebook, which gives us access to the files and data stored there.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Create a symlink /mydrive that points at the Drive folder.
# NOTE(review): the link target is "My Drive" (with a space) while the later cells use
# /content/gdrive/MyDrive, and /mydrive is not referenced by any later cell — confirm it is still needed.
!ln -s /content/gdrive/My\ Drive/ /mydrive

Mounted at /content/gdrive


In [None]:
## Change the working directory to the project folder on Drive.
## Relative paths used by later cells (e.g. 'model.h5') are resolved against this directory.
cd /content/gdrive/MyDrive/NLP_projects

/content/gdrive/MyDrive/NLP_projects


In [None]:
# List the contents of the current (project) directory
!ls

Dataset  Text_classification.ipynb  Uzbek_News_Dataset.zip


### **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os

### **Unzipping Dataset**

In [None]:
import zipfile
import os

# Location of the dataset archive on the mounted Drive
zip_file_name = '/content/gdrive/MyDrive/NLP_projects/Uzbek_News_Dataset.zip'

# Extract next to the archive using an absolute path, so the result does not depend on
# the current working directory and lines up with the absolute dataset path used for loading
extract_dir = os.path.join(os.path.dirname(zip_file_name), 'Uzb_News_Dataset')
os.makedirs(extract_dir, exist_ok=True)

# Unzip the file (the context manager closes the archive even if extraction fails)
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f'Files have been extracted to: {extract_dir}')


### **Loading Dataset**

In [None]:
def load_data_to_dataframe(base_folder):
    """Read a folder-per-class text corpus into a DataFrame.

    Every sub-directory of ``base_folder`` is treated as one class; its name
    becomes the label of each ``.txt`` file found directly inside it. Entries
    of ``base_folder`` that are not directories, and files without a ``.txt``
    suffix, are ignored.

    Args:
        base_folder: Path of the directory that contains one folder per class.

    Returns:
        pandas.DataFrame with the columns ``number_of_samples`` (1-based running
        index), ``text`` (file content, stripped of surrounding whitespace) and
        ``class_labels`` (name of the containing folder).
    """
    data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore any seeded split of the frame) identical between runs.
    for class_label in sorted(os.listdir(base_folder)):
        class_folder_path = os.path.join(base_folder, class_label)
        if not os.path.isdir(class_folder_path):
            continue

        for filename in sorted(os.listdir(class_folder_path)):
            # Ensure we're working with a text file
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(class_folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read().strip()
            # Row layout: (running sample number, text, class label)
            data.append((len(data) + 1, text, class_label))

    return pd.DataFrame(data, columns=['number_of_samples', 'text', 'class_labels'])

In [None]:
# Build the DataFrame from the extracted dataset folder and preview its first rows
base_folder = '/content/gdrive/MyDrive/NLP_projects/Uzb_News_Dataset/Uzbek_News_Dataset'
df = load_data_to_dataframe(base_folder)
df.head()

Unnamed: 0,number_of_samples,text,class_labels
0,1,Tesla barcha elektromobillarining narxini oshi...,Avto
1,2,UzAuto Motors — o‘rindiq isitish modeli va mag...,Avto
2,3,Lego’ning 358 mingdan ortiq detali yordamida t...,Avto
3,4,Namanganda “Moskvich” ariqqa ag‘darilib ketish...,Avto
4,5,Qo‘qonda YPX xodimini mashina ustida sudrab ke...,Avto


In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_class_labels(df):
    """Attach an ``encoded_labels`` column derived from ``class_labels``.

    A fresh LabelEncoder is fitted on the ``class_labels`` column and its
    output is stored in ``df['encoded_labels']``. The frame passed in is
    modified in place and is also returned.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df['class_labels'])
    df['encoded_labels'] = encoded
    return df

# Encode the labels of the loaded dataset and preview the result
df = encode_class_labels(df)
df.head()

Unnamed: 0,number_of_samples,text,class_labels,encoded_labels
0,1,Tesla barcha elektromobillarining narxini oshi...,Avto,0
1,2,UzAuto Motors — o‘rindiq isitish modeli va mag...,Avto,0
2,3,Lego’ning 358 mingdan ortiq detali yordamida t...,Avto,0
3,4,Namanganda “Moskvich” ariqqa ag‘darilib ketish...,Avto,0
4,5,Qo‘qonda YPX xodimini mashina ustida sudrab ke...,Avto,0


In [None]:
df.encoded_labels.unique() ### lists the distinct encoded class ids; the three ids (0, 1, 2) show that this dataset has 3 classes

array([0, 1, 2])

In [None]:
### Next, we remove apostrophes and non-alphanumeric characters from the text and convert it to lower case.

import re
def clean_txt(text):
    """Normalize a document for tokenization.

    Apostrophes are removed, every run of non-word characters is collapsed
    into a single space, and the result is lower-cased.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned, lower-cased string. A leading or trailing run of non-word
        characters is left as a single space (no stripping is performed).
    """
    # Drop apostrophes so that a word such as "o'rindiq" stays in one piece.
    # Besides the ASCII apostrophe this also covers the typographic variants
    # (U+2018, U+2019, U+02BB, U+02BC and the backtick) that appear in the
    # corpus; matching only "'" left those words split in two ("o rindiq").
    text = re.sub("['\u2018\u2019\u02bb\u02bc`]", "", text)
    # Replace each run of non-word characters with one space
    text = re.sub("(\\W)+", " ", text)
    text = text.lower()
    return text

# Clean every document (the 'text' column is overwritten) and preview the result
df['text']=df['text'].apply(clean_txt)
df.head()

Unnamed: 0,number_of_samples,text,class_labels,encoded_labels
0,1,tesla barcha elektromobillarining narxini oshi...,Avto,0
1,2,uzauto motors o rindiq isitish modeli va magni...,Avto,0
2,3,lego ning 358 mingdan ortiq detali yordamida t...,Avto,0
3,4,namanganda moskvich ariqqa ag darilib ketishi ...,Avto,0
4,5,qo qonda ypx xodimini mashina ustida sudrab ke...,Avto,0


In [None]:
### Splitting dataset into train, validation and test sets
from sklearn.model_selection import train_test_split
# One seed shared by both splits
split_seed = 35

# First cut: 60% of the rows become the training set, 40% are held out in `valid`
train, valid = train_test_split(df, random_state=split_seed, test_size=0.4)
# Second cut: the held-out rows are halved into the test set and the validation set
test, val = train_test_split(valid, random_state=split_seed, test_size=0.5)

print(f'Number of training samples: {len(train)}')
print(f'Number of validation samples: {len(val)}')
print(f'Number of testing samples: {len(test)}')

Number of training samples: 44321
Number of validation samples: 14774
Number of testing samples: 14774


### **Fine-Tuning BERT Model**

In [None]:
# Install/import what is needed to load the BERT encoder from TensorFlow Hub.
# NOTE(review): `tokenization` is installed and imported here but is not referenced by any
# later cell (tokenizing is done with transformers' BertTokenizer) — confirm it can be dropped.
!pip install tokenization
import tensorflow_hub as hub
import tokenization
# TF-Hub handle of the encoder (bert_en_uncased, L-12 / H-768 / A-12, version 2).
# NOTE(review): the handle names an English uncased checkpoint while the corpus is Uzbek news — confirm this is intended.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# trainable=True so the encoder weights are updated during training
bert_layer = hub.KerasLayer(module_url, trainable=True)



In [None]:
### Building model

def build_model(bert_layer, max_len=512, num_classes=3, learning_rate=1e-5):
    """Build and compile a BERT-based text classifier.

    The three integer inputs (word ids, mask, segment ids) are fed to
    ``bert_layer``; the first position of its sequence output is passed
    through two Dense+Dropout stages and a softmax output layer.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Length of each of the three input sequences.
        num_classes: Number of units of the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model using sparse categorical cross-entropy
        loss (integer class labels) and the accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the sequence output is used; the pooled output is discarded
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation at position 0 of every sequence (where the encoder places [CLS])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras optimizers.
    model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
# NOTE(review): presumably 'bert-base-uncased' shares its vocabulary with the TF-Hub encoder loaded earlier — confirm
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into the three fixed-length integer inputs of the model.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row; must be at least 2 so that the
            two special tokens fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With fewer than 2 slots the [CLS]/[SEP] pair alone overflows the row
        raise ValueError(f"max_len must be at least 2, got {max_len}")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens
        tokens = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + tokens + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit dtype and reshape keep the result a 2-D integer array
        # even when `texts` is empty.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
### Encode the three splits with a common sequence length
max_len = 150
train_input, val_input, test_input = (
    bert_encode(split['text'], tokenizer, max_len=max_len)
    for split in (train, val, test)
)

In [None]:
# Build the classifier with the same sequence length that was used for encoding, then print its layers
model = build_model(bert_layer, max_len=max_len)
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_word_ids (InputLayer  [(None, 150)]                0         []                            
 )                                                                                                
                                                                                                  
 input_mask (InputLayer)     [(None, 150)]                0         []                            
                                                                                                  
 segment_ids (InputLayer)    [(None, 150)]                0         []                            
                                                                                                  
 keras_layer (KerasLayer)    [(None, 768),                1094822   ['input_word_ids[0][0]',  

### **Train the model**

In [None]:
# Write the model to 'model.h5' only when validation accuracy improves (save_best_only=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
# NOTE(review): patience=5 is larger than epochs=3 below, so early stopping presumably never
# triggers in this run — confirm the intended values.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# Train on the training split, validating on the validation split after every epoch
train_history = model.fit(train_input, train['encoded_labels'], validation_data=(val_input, val["encoded_labels"]), epochs=3,
                          callbacks=[checkpoint, earlystopping], batch_size=32, verbose=1)

Epoch 1/3
Epoch 1: val_accuracy improved from -inf to 0.88608, saving model to model.h5


  saving_api.save_model(


Epoch 2/3
Epoch 2: val_accuracy did not improve from 0.88608
Epoch 3/3
Epoch 3: val_accuracy did not improve from 0.88608


### **Model Evaluation**

In [None]:
# Reuse the test encoding produced together with the train/validation encodings;
# tokenizing the test split a second time would give the same arrays and only costs time.
test_fin_input = test_input
# Restore the checkpoint with the best validation accuracy before predicting
model.load_weights('model.h5')
test_pred = model.predict(test_input)



In [None]:
%%time
# Evaluate the model on the test split; the first returned value (the loss) is discarded
# and the second is kept as the accuracy.

_, test_acc = model.evaluate(test_input, test['encoded_labels'])

print("test acc: ", test_acc)

test acc:  0.8786381483078003
CPU times: user 14.3 s, sys: 7.39 s, total: 21.7 s
Wall time: 3min 22s


In [None]:
%%time
# Reload the saved checkpoint and report the accuracy on all three splits
model.load_weights("model.h5")

# For each split the first returned value (the loss) is discarded
_, train_acc = model.evaluate(train_input, train['encoded_labels'])
_, val_acc = model.evaluate(val_input, val['encoded_labels'])
_, test_acc = model.evaluate(test_input, test['encoded_labels'])

print("train acc: ", train_acc)
print("validation acc: ", val_acc)
print("test acc: ", test_acc)

train acc:  0.882110059261322
validation acc:  0.8860836625099182
test acc:  0.8786381483078003
CPU times: user 1min 14s, sys: 33.8 s, total: 1min 48s
Wall time: 14min 28s


# **Thank you for your attention**