In [19]:
import pandas as pd
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
import nltk

# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sabharjan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
def tokenize_overview(mydata, overview_col):
    # Function to clean show overview
    # Return: each row as a list of tokens

    # removes punctuation
    tokenizer = RegexpTokenizer(r"\w+")
    stop_words = stopwords.words("English")

    # split text
    tokens = mydata[overview_col].map(lambda x: tokenizer.tokenize(str(x)))

    # strip white spaces & lower case
    tokens = tokens.map(lambda x: [i.lower().strip("_") for i in x])

    # remove stop words
    tokens = tokens.map(lambda x: [i for i in x if i not in stop_words])

    # remove empty strings
    tokens = tokens.map(lambda x: [i for i in x if i != ''])

    return tokens

In [21]:
def preprocess_train_data(train_file_path):
    data = []
    try:
        with open(train_file_path, 'r', encoding='utf-8') as file:
            for line in file:
                split_line = line.strip().split(' ::: ')
                data.append(split_line)
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}")

    columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
    rows = data[1:]
    train_data = pd.DataFrame(rows, columns=columns)

    description_column_name = 'DESCRIPTION'
    if description_column_name in train_data.columns:
        train_data['TOKENIZED'] = tokenize_overview(train_data, description_column_name)
        train_data['CLEANED_DESCRIPTION'] = train_data['TOKENIZED'].map(lambda x: ' '.join(x))
    else:
        print(f"The specified column '{description_column_name}' does not exist in the dataset.")
    
    return train_data

In [22]:
def preprocess_test_data(test_file_path):
    data = []
    try:
        with open(test_file_path, 'r', encoding='utf-8') as file:
            for line in file:
                split_line = line.strip().split(' ::: ')
                data.append(split_line)
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}")

    columns = ['ID', 'TITLE', 'DESCRIPTION']
    rows = data[1:]
    test_data = pd.DataFrame(rows, columns=columns)

    description_column_name = 'DESCRIPTION'
    if description_column_name in test_data.columns:
        test_data['TOKENIZED'] = tokenize_overview(test_data, description_column_name)
        test_data['CLEANED_DESCRIPTION'] = test_data['TOKENIZED'].map(lambda x: ' '.join(x))
    else:
        print(f"The specified column '{description_column_name}' does not exist in the dataset.")
    
    return test_data

In [23]:
if __name__ == "__main__":
    train_file_path = 'dataset/train_data.txt'
    test_file_path = 'dataset/test_data.txt'

    processed_train_data = preprocess_train_data(train_file_path)
    processed_test_data = preprocess_test_data(test_file_path)

    processed_train_data.to_csv('dataset/processed_train_data.csv', index=False)
    processed_test_data.to_csv('dataset/processed_test_data.csv', index=False)

    print("Data preprocessing completed.")

Data preprocessing completed.


In [24]:
# Manually set the column names
columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
rows = data[1:]  # The rest are data rows
mydata = pd.DataFrame(rows, columns=columns)

# Print the first few rows and columns to debug
print("Data Columns:", mydata.columns)
print("First few rows of data:")
print(mydata.head())

Data Columns: Index(['ID', 'TITLE', 'GENRE', 'DESCRIPTION'], dtype='object')
First few rows of data:
  ID                             TITLE        GENRE  \
0  2                      Cupid (1997)     thriller   
1  3  Young, Wild and Wonderful (1980)        adult   
2  4             The Secret Sin (1915)        drama   
3  5            The Unrecovered (2007)        drama   
4  6            Quality Control (2011)  documentary   

                                         DESCRIPTION  
0  A brother and sister with a past incestuous re...  
1  As the bus empties the students for their fiel...  
2  To help their unemployed father make ends meet...  
3  The film's title refers not only to the un-rec...  
4  Quality Control consists of a series of 16mm s...  


In [18]:

tokenized_overviews = tokenize_overview(mydata, description_column_name)
print(tokenized_overviews.head())


0    [brother, sister, past, incestuous, relationsh...
1    [bus, empties, students, field, trip, museum, ...
2    [help, unemployed, father, make, ends, meet, e...
3    [film, title, refers, un, recovered, bodies, g...
4    [quality, control, consists, series, 16mm, sin...
Name: DESCRIPTION, dtype: object
