<a href="https://colab.research.google.com/github/Niroth36/Text_Classification_Natural_Language_Processing/blob/main/Text_Classification_Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

from keras import layers
from keras import losses
from keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Importing dataset

Download the CSV file

In [2]:
url = "https://raw.githubusercontent.com/Niroth36/Text_Classification_Natural_Language_Processing/main/dataset.csv.gz"
filename = "dataset.csv.gz"

path = tf.keras.utils.get_file(filename, url)

Downloading data from https://raw.githubusercontent.com/Niroth36/Text_Classification_Natural_Language_Processing/main/dataset.csv.gz


In [4]:
# load data with pandas
df = pd.read_csv(path, compression='gzip')

In [5]:
df.isnull().sum()

Unnamed: 0            0
Release Year          0
Title                 0
Origin/Ethnicity      0
Director              0
Cast                486
Wiki Page             0
Plot                  0
Genre                 0
dtype: int64

In [7]:
df= df.drop("Cast", axis='columns')

I decided to drop Cast column because it has missing values and it has no point in categorizing the movies according to their genre.

In [6]:
import re
import string

def preprocess_text(text):
    # convert text to lowercase
    text = text.lower()

    # remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

Extract the file and read the CSV file using pandas

The first line creates a dictionary genre2id that maps each unique genre in the Genre column of df to a unique integer ID. The genres are sorted alphabetically using the sorted() function and then enumerated using the enumerate() function, starting from 0. The resulting dictionary genre2id has keys that are the unique genre names and values that are the corresponding integer IDs.
The second line maps the genre names in the Genre column of df to the corresponding integer IDs using the map() method. The resulting integer IDs are added as a new column label in df.

The integer IDs are useful for training machine learning models, as many models require integer labels instead of string labels.

This function should load the data from the CSV file at file_path, preprocess it as described, and return three tf.data.Dataset objects for training, validation, and testing, respectively. The batch_size, p_train, and p_val parameters control the size of the batches and the proportion of the data used for training, validation, and testing, respectively. The function also returns the number of labels in the dataset as n_labels.

In [7]:
def load_data_wiki(file_path, batch_size=32, p_train=.65, p_val=0.15):
    # load data with pandas
    df = pd.read_csv(file_path, compression='gzip')

    # drop cast column
    df= df.drop("Cast", axis='columns')

    # randomize the data, (shuffle rows)
    df = df.sample(frac=1, random_state=42)

    # preprocess text data
    df['Plot'] = df['Plot'].apply(preprocess_text)

    # convert Origin/Ethnicity to one-hot encoding
    ethnicity_onehot = pd.get_dummies(df['Origin/Ethnicity'], prefix='Ethnicity')

    # concatenate the data and one-hot encoded ethnicity
    df = pd.concat([df, ethnicity_onehot], axis=1)

    # convert the genre labels to integer IDs
    genre2id = {g: i for i, g in enumerate(sorted(df['Genre'].unique()))}
    df['label'] = df['Genre'].map(genre2id)

    # split data into train, val, and test
    train_size = int(p_train * len(df))
    val_size = int(p_val * len(df))
    test_size = len(df) - train_size - val_size

    train_df = df[:train_size]
    val_df = df[train_size:train_size + val_size]
    test_df = df[train_size + val_size:]
        
    # convert dataframes to dictionaries
    ethnicity_cols = [col for col in df.columns if col.startswith('Ethnicity_')]
    train_data = dict(train_df[['Title', 'Plot'] + ethnicity_cols].items())
    train_data['label'] = np.array(train_df['label'].tolist())

    val_data = dict(val_df[['Title', 'Plot'] + ethnicity_cols].items())
    val_data['label'] = np.array(val_df['label'].tolist())

    test_data = dict(test_df[['Title', 'Plot'] + ethnicity_cols].items())
    test_data['label'] = np.array(test_df['label'].tolist())

    # create tf.data.Dataset objects
    train_ds = tf.data.Dataset.from_tensor_slices(train_data)
    train_ds = train_ds.shuffle(buffer_size=train_size)
    train_ds = train_ds.batch(batch_size)

    val_ds = tf.data.Dataset.from_tensor_slices(val_data)
    val_ds = val_ds.shuffle(buffer_size=val_size)
    val_ds = val_ds.batch(batch_size)

    test_ds = tf.data.Dataset.from_tensor_slices(test_data)
    test_ds = test_ds.batch(batch_size)

    # return datasets and number of labels
    return train_ds, val_ds, test_ds, len(genre2id)


In [13]:
raw_train_ds, raw_val_ds, raw_test_ds, n_labels = load_data_wiki(path)

In [14]:
print(n_labels)

20


In [9]:
for element in raw_train_ds.take(1):
    print(element)


{'Title': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Roudram', b'In the Land of Women', b'The Spoilers',
       b'The Reckoning', b'Angel on My Shoulder', b'Carry On Teacher',
       b'Spellcaster', b'The Seeker', b'Pink Floyd \xe2\x80\x93 The Wall',
       b'Purple People Eater', b'Green Fire', b'Assassin', b'Iyer IPS',
       b'Silambattam', b'Mudhoney', b'Knight Without Armour', b'Billa',
       b'The Fighting Marshal', b'The She-Creature',
       b'Storm over Wyoming', b'Noor Jahaan', b'Pecker', b'Hungry Hill',
       b' Janky Promoters', b'Detachment', b'Raid on Rommel',
       b'Thunder Over the Plains', b'Anchors Aweigh',
       b'The Affairs of Martha', b'Big Trouble', b'What About Bob?',
       b'KL 10 Patthu'], dtype=object)>, 'Plot': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'the film starts in the 1980s a young shiva jiiva is trained in the ancient indian martial art kalarippayattu by his grandfather prakash raj to be bold and brave and fight against

In [24]:
train_df = pd.DataFrame.from_dict(dict(raw_train_ds))
val_df = pd.DataFrame.from_dict(dict(raw_val_ds))
test_df = pd.DataFrame.from_dict(dict(raw_test_ds))

ValueError: ignored

In [15]:
# define parameters
max_features = 500
batch_size = 32
epochs = 10
learning_rate = 0.001

# load data
train_ds, val_ds, test_ds, n_labels = load_data_wiki(path)

# create text vectorization layer
text_vectorizer = TextVectorization(max_tokens=max_features, output_mode='binary')
train_text = train_ds.pop('Title')
text_vectorizer.adapt(train_text)

# define the model
model = tf.keras.Sequential([
    text_vectorizer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_labels, activation='softmax')
])

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# train the model
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds, verbose=2)

# evaluate the model on test data
test_loss, test_acc = model.evaluate(test_ds)
print('Test accuracy:', test_acc)


AttributeError: ignored

The dictionary can now be used with tf.data.Dataset.from_tensor_slices to create the training dataset. The same process can be followed for the validation and test sets. Be sure to print intermediate results (dataframes, datasets) to ensure that the datasets are in the expected format before using them for training.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load the movie plots dataset
df = pd.read_csv(path, compression='gzip')

# Preprocess the titles column
def preprocess_titles(title):
    # Remove parentheses and their contents
    title = re.sub(r'\([^)]*\)', '', title)
    # Remove any remaining punctuation
    title = re.sub(r'[^\w\s]', '', title)
    # Convert to lowercase
    title = title.lower()
    return title

df['Title'] = df['Title'].apply(preprocess_titles)

# Convert Origin/Ethnicity and Genre to one-hot encoded variables
df = pd.get_dummies(df, columns=['Origin/Ethnicity', 'Genre'])

# Split the dataset into train, validation, and test sets
train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))])

# Convert the dataframes to dictionaries
train_dict = {'Title': train_df['Title'].values, 'Origin/Ethnicity': train_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': train_df['Plot'].values, 'Genre': train_df.filter(like='Genre').values.astype(np.float64)}
val_dict = {'Title': val_df['Title'].values, 'Origin/Ethnicity': val_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': val_df['Plot'].values, 'Genre': val_df.filter(like='Genre').values.astype(np.float64)}
test_dict = {'Title': test_df['Title'].values, 'Origin/Ethnicity': test_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': test_df['Plot'].values, 'Genre': test_df.filter(like='Genre').values.astype(np.float64)}

# Create tf.data.Dataset objects from the dictionaries
train_dataset = tf.data.Dataset.from_tensor_slices(train_dict)
val_dataset = tf.data.Dataset.from_tensor_slices(val_dict)
test_dataset = tf.data.Dataset.from_tensor_slices(test_dict)

# Print the first few entries in the training dataset
for elem in train_dataset.take(5):
    print(elem)


{'Title': <tf.Tensor: shape=(), dtype=string, numpy=b'creature with the atom brain'>, 'Origin/Ethnicity': <tf.Tensor: shape=(24,), dtype=float64, numpy=
array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])>, 'Plot': <tf.Tensor: shape=(), dtype=string, numpy=b"A hulking zombie breaks into a mansion and kills a gangster named Hennesy. The blood stains left behind at the crime scene are radioactive, and the fingerprints of the killer are of a man who had died days before the murder; the police are baffled.\r\nGangster boss Frank Buchanan, who had been forced to flee the United States before he was deported, was betrayed by members of his own underworld gang. While traveling in Europe, he finds ex-Nazi scientist Wilhelm Steigg (Gaye), who is trying to reanimate the dead in order to provide a menial labor pool that is easily exploited. Buchanan funds the research and brings the scientist to America with the unstated goal of sending S

In this example, preprocess_titles() is a function that extracts the movie titles from the text data, and y is the target variable for classification (e.g., 1 for action movies, 0 for romance movies). The number of epochs is set to 50, but this value can be adjusted based on the performance of the model on the training data.

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Load the movie plots dataset
df = pd.read_csv(path, compression='gzip')

# Preprocess the titles column
def preprocess_titles(title):
    # Remove parentheses and their contents
    title = re.sub(r'\([^)]*\)', '', title)
    # Remove any remaining punctuation
    title = re.sub(r'[^\w\s]', '', title)
    # Convert to lowercase
    title = title.lower()
    return title

df['Title'] = df['Title'].apply(preprocess_titles)

# Convert Origin/Ethnicity and Genre to one-hot encoded variables
df = pd.get_dummies(df, columns=['Origin/Ethnicity', 'Genre'])

# Split the dataset into train, validation, and test sets
train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))])

# Convert the dataframes to dictionaries
train_dict = {'Title': train_df['Title'].values, 'Origin/Ethnicity': train_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': train_df['Plot'].values, 'Genre': train_df.filter(like='Genre').values.astype(np.float64)}
val_dict = {'Title': val_df['Title'].values, 'Origin/Ethnicity': val_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': val_df['Plot'].values, 'Genre': val_df.filter(like='Genre').values.astype(np.float64)}
test_dict = {'Title': test_df['Title'].values, 'Origin/Ethnicity': test_df.filter(like='Origin/Ethnicity').values.astype(np.float64), 'Plot': test_df['Plot'].values, 'Genre': test_df.filter(like='Genre').values.astype(np.float64)}

# Create tf.data.Dataset objects from the dictionaries
train_dataset = tf.data.Dataset.from_tensor_slices(train_dict)
val_dataset = tf.data.Dataset.from_tensor_slices(val_dict)
test_dataset = tf.data.Dataset.from_tensor_slices(test_dict)

# Print the first few entries in the training dataset
for elem in train_dataset.take(5):
    print(elem)


{'Title': <tf.Tensor: shape=(), dtype=string, numpy=b'creature with the atom brain'>, 'Origin/Ethnicity': <tf.Tensor: shape=(24,), dtype=float64, numpy=
array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])>, 'Plot': <tf.Tensor: shape=(), dtype=string, numpy=b"A hulking zombie breaks into a mansion and kills a gangster named Hennesy. The blood stains left behind at the crime scene are radioactive, and the fingerprints of the killer are of a man who had died days before the murder; the police are baffled.\r\nGangster boss Frank Buchanan, who had been forced to flee the United States before he was deported, was betrayed by members of his own underworld gang. While traveling in Europe, he finds ex-Nazi scientist Wilhelm Steigg (Gaye), who is trying to reanimate the dead in order to provide a menial labor pool that is easily exploited. Buchanan funds the research and brings the scientist to America with the unstated goal of sending S

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.feature_extraction.text import CountVectorizer

# Preprocess the text data to extract only the movie title
titles = preprocess_titles("The Shawshank Redemption")

# Convert the titles into a binary bag-of-words representation with a vocabulary of 500 words
vectorizer = CountVectorizer(binary=True, max_features=500)
X = vectorizer.fit_transform(titles)

# Create a linear classification model with a single output layer
model = Sequential()
model.add(Dense(1, input_dim=X.shape[1], activation='sigmoid'))

# Compile the model using the Adam optimizer with a learning rate of 0.001 and the binary cross-entropy loss function
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the fit method of the Keras model
model.fit(X, y, epochs=50, batch_size=32)
