<a href="https://colab.research.google.com/github/Niroth36/Text_Classification_Natural_Language_Processing/blob/main/Text_Classification_Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

from keras import layers
from keras import losses
from keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Importing dataset

Download the CSV file

In [54]:
url = "https://raw.githubusercontent.com/Niroth36/Text_Classification_Natural_Language_Processing/main/dataset.csv.gz"
filename = "dataset.csv.gz"

path = tf.keras.utils.get_file(filename, url)

In [55]:
# load data with pandas
df = pd.read_csv(path, compression='gzip')

In [56]:
df.isnull().sum()

Unnamed: 0            0
Release Year          0
Title                 0
Origin/Ethnicity      0
Director              0
Cast                486
Wiki Page             0
Plot                  0
Genre                 0
dtype: int64

In [57]:
df= df.drop("Cast", axis='columns')

I decided to drop Cast column because it has missing values and it has no point in categorizing the movies according to their genre.

In [58]:
def preprocess_text(text):
    # convert text to lowercase
    text = text.lower()

    # remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

Extract the file and read the CSV file using pandas

The first line creates a dictionary genre2id that maps each unique genre in the Genre column of df to a unique integer ID. The genres are sorted alphabetically using the sorted() function and then enumerated using the enumerate() function, starting from 0. The resulting dictionary genre2id has keys that are the unique genre names and values that are the corresponding integer IDs.
The second line maps the genre names in the Genre column of df to the corresponding integer IDs using the map() method. The resulting integer IDs are added as a new column label in df.

The integer IDs are useful for training machine learning models, as many models require integer labels instead of string labels.

This function should load the data from the CSV file at file_path, preprocess it as described, and return three tf.data.Dataset objects for training, validation, and testing, respectively. The batch_size, p_train, and p_val parameters control the size of the batches and the proportion of the data used for training, validation, and testing, respectively. The function also returns the number of labels in the dataset as n_labels.

In [101]:
import numpy as np
import pandas as pd
import tensorflow as tf

def load_data_wiki(file_path, batch_size=32, p_train=.65, p_val=0.15):
    # load data with pandas
    df = pd.read_csv(file_path, compression='gzip')

    # randomize the data, (shuffle rows)
    df = df.sample(frac=1, random_state=42)

    # preprocess text data
    df['Plot'] = df['Plot'].apply(preprocess_text)
    df['Title'] = df['Title'].apply(preprocess_text)

    # convert Origin/Ethnicity to one-hot encoding
    ethnicity_onehot = pd.get_dummies(df['Origin/Ethnicity'], prefix='Ethnicity')

    # concatenate the data and one-hot encoded ethnicity
    df = pd.concat([df, ethnicity_onehot], axis=1)

    # one-hot encode the genre labels
    genre_onehot = pd.get_dummies(df['Genre'], prefix='Genre')

    # concatenate the data and one-hot encoded genre
    df = pd.concat([df, genre_onehot], axis=1)

    # split data into train, val, and test
    train_size = int(p_train * len(df))
    val_size = int(p_val * len(df))
    test_size = len(df) - train_size - val_size

    train_df = df[:train_size]
    val_df = df[train_size:train_size + val_size]
    test_df = df[train_size + val_size:]
        
    # convert dataframes to dictionaries
    ethnicity_cols = [col for col in df.columns if col.startswith('Ethnicity_')]
    genre_cols = [col for col in df.columns if col.startswith('Genre_')]
    
    train_title = np.array(train_df['Title'])
    train_plot = np.array(train_df['Plot'])
    train_ethnicity = np.array(train_df[ethnicity_cols])
    train_genre = np.array(train_df[genre_cols])

    val_title = np.array(val_df['Title'])
    val_plot = np.array(val_df['Plot'])
    val_ethnicity = np.array(val_df[ethnicity_cols])
    val_genre = np.array(val_df[genre_cols])

    test_title = np.array(test_df['Title'])
    test_plot = np.array(test_df['Plot'])
    test_ethnicity = np.array(test_df[ethnicity_cols])
    test_genre = np.array(test_df[genre_cols])
    
    train_data = {'Title': train_title, 'Plot': train_plot, 'Ethnicity': train_ethnicity, 'Genre': train_genre}
    val_data = {'Title': val_title, 'Plot': val_plot, 'Ethnicity': val_ethnicity, 'Genre': val_genre}
    test_data = {'Title': test_title, 'Plot': test_plot, 'Ethnicity': test_ethnicity, 'Genre': test_genre}

    # create tf.data.Dataset objects
    train_ds = tf.data.Dataset.from_tensor_slices(train_data)
    train_ds = train_ds.shuffle(buffer_size=train_size)
    train_ds = train_ds.batch(batch_size)

    val_ds = tf.data.Dataset.from_tensor_slices(val_data)
    val_ds = val_ds.shuffle(buffer_size=val_size)
    val_ds = val_ds.batch(batch_size)

    test_ds = tf.data.Dataset.from_tensor_slices(test_data)
    test_ds = test_ds.batch(batch_size)

    # return datasets and number of labels
    return train_ds, val_ds, test_ds, len(genre_onehot.columns)

In [102]:
raw_train_ds, raw_val_ds, raw_test_ds, n_labels = load_data_wiki(path)

In [103]:
print(n_labels)

20


In [114]:
# Peek at the first batch of the training dataset
for batch in raw_train_ds.take(1):
    print(batch)

{'Title': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'the flame of new orleans', b'the next of kin',
       b'the old barn dance', b'die laughing', b'tiger cage',
       b'velugu needalu', b'shout at the devil', b'downhill racer',
       b'aadu', b'a price above rubies', b'jungle goddess',
       b'the girl next door', b'gunga din', b'veeranna', b'parugu',
       b'stanley  iris', b'night crossing', b'trail of the vigilantes',
       b'under the hawthorn tree', b'the dark past', b'lets get harry',
       b'derby day', b'gumrah', b'where is parsifal',
       b'simon king of the witches', b'shanghai story',
       b'what just happened', b'darkness falls', b'wrong is right',
       b'the pink panther 2', b'aadu magaadra bujji',
       b'billy elliot the musical live'], dtype=object)>, 'Plot': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'the legend of claire of new orleans is born after two fishermen find a wedding dress floating around on the mississippi river one day

In [119]:
# Define max number of words
max_features = 500

# Create vectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='binary',
    pad_to_max_tokens=True)

# Adapt vectorization layer to training set
train_titles = raw_train_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(train_titles)

# Adapt vectorization layer to validation set
val_titles = raw_val_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(val_titles)

# Adapt vectorization layer to test set
test_titles = raw_test_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(test_titles)

# Define function to vectorize text
def vectorize_text(text):
  return vectorize_layer(tf.expand_dims(text, -1))

# Vectorize titles in datasets and create new datasets
train_ds = raw_train_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))
val_ds = raw_val_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))
test_ds = raw_test_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))

# Define the model
model = tf.keras.Sequential([
    layers.Dense(20, activation='sigmoid', input_shape=(max_features,))
])

# Compile the model
model.compile(
    loss=losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train the model
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs
)

# Evaluate the model
loss, accuracy = model.evaluate(test_ds)
print("Accuracy: ", accuracy)
print("Loss: ", loss)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy:  0.313721239566803
Loss:  0.15691672265529633


In [120]:
# Define max number of words
max_features = 10000

# Create vectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='binary',
    pad_to_max_tokens=True)

# Adapt vectorization layer to training set
train_titles = raw_train_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(train_titles)

# Adapt vectorization layer to validation set
val_titles = raw_val_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(val_titles)

# Adapt vectorization layer to test set
test_titles = raw_test_ds.map(lambda x: x['Title'])
vectorize_layer.adapt(test_titles)

# Define function to vectorize text
def vectorize_text(text):
  return vectorize_layer(tf.expand_dims(text, -1))

# Vectorize titles in datasets and create new datasets
train_ds = raw_train_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))
val_ds = raw_val_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))
test_ds = raw_test_ds.map(lambda x: (vectorize_text(x['Title']), x['Genre']))

# Define the model
model = tf.keras.Sequential([
    layers.Dense(20, activation='sigmoid', input_shape=(max_features,))
])

# Compile the model
model.compile(
    loss=losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train the model
epochs = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs
)

# Evaluate the model
loss, accuracy = model.evaluate(test_ds)
print("Accuracy: ", accuracy)
print("Loss: ", loss)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy:  0.35692575573921204
Loss:  0.15941157937049866
