In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
def clean_review(review):
    '''
    Input:
        review: a string containing a review.
    Output:
        review_cleaned: a processed review. 
    '''
    review = review[0] if type(review) != str else review
    if type(review) == np.ndarray:
        print(review, review[0])
        review = review[0]
        
    lower_string = review.lower()
    no_url_str = re.sub(r'https?:\/\/\.*','', lower_string)
    clean_str = re.sub(r'[^a-zA-Z]', ' ', no_url_str)  # removing special characters, numbers, punctuations
#     print(clean_str)
    
    stop_words_set = set(stopwords.words('english'))
    stemmer_object = PorterStemmer()  # SnowballStemmer(language='english')
#     lemma_obj = WordNetLemmatizer()
  
    review_str_tokens = word_tokenize(clean_str)
    clean_word_list = [stemmer_object.stem(a_token) for a_token in review_str_tokens if not a_token.lower() in stop_words_set]
    
    clean_review = ' '.join(clean_word_list)
    return clean_review




In [3]:
# Verifying
clean_review('https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html BuckyBarnes@123 \!@#$%^&*\(\)_+=-[]{}\|\';:\"/?.,><`~')

'nlp stanford edu ir book html htmledit stem lemmat html buckybarn'

In [4]:
import os

In [5]:
cwd = os.getcwd()

description = []
genres = []

with open(rf'{cwd}\Raw_Data\Genre Classification Dataset\train_data.txt', 'r', encoding='utf-8') as train_data_file2:
    
    for line in train_data_file2.readlines():
        _id,title,genre,desc = line.split(' ::: ')
        description.append(f"{desc.lower().strip()}")
        genres.append(f"{genre.lower().strip()}")
        

In [6]:
description[0]

'listening in to a conversation between his doctor and parents, 10-year-old oscar learns what nobody has the courage to tell him. he only has a few weeks to live. furious, he refuses to speak to anyone except straight-talking rose, the lady in pink he meets on the hospital stairs. as christmas approaches, rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow oscar to live life and love to the full, in the company of his friends pop corn, einstein, bacon and childhood sweetheart peggy blue.'

In [7]:
genres[0]

'drama'

In [8]:
import pandas as pd

In [9]:
df = pd.DataFrame(list(zip(description, genres)), columns =['Description', 'Genre'])

In [10]:
df['Description'] = df['Description'].apply(clean_review)
df

Unnamed: 0,Description,Genre
0,listen convers doctor parent year old oscar le...,drama
1,brother sister past incestu relationship curre...,thriller
2,bu empti student field trip museum natur histo...,adult
3,help unemploy father make end meet edith twin ...,drama
4,film titl refer un recov bodi ground zero also...,drama
...,...,...
54209,short live nbc live sitcom center bonino world...,comedy
54210,next gener exploit sister kapa bay soror hous ...,horror
54211,ze bestaan echt stand comedi grow face fear fr...,documentary
54212,walter vivian live countri difficult time keep...,comedy


In [11]:
df.to_csv('Train_Data.csv', index=False)

In [12]:
vocab = {x for y in df['Description'] for x in y.strip().split()}

In [13]:
len(vocab)

89311

In [14]:
# maximum (word) length of a review.
max([len(x.split()) for x in df['Description']])

1614

In [15]:
cwd = os.getcwd()

description = []
genres = []

with open(rf'{cwd}\Raw_Data\Genre Classification Dataset\test_data_solution.txt', 'r', encoding='utf-8') as test_data_solution_file:
    
    for line in test_data_solution_file.readlines():
        _id,title,genre,desc = line.split(' ::: ')
        description.append(f"{desc.lower().strip()}")
        genres.append(f"{genre.lower().strip()}")


In [16]:
df = pd.DataFrame(list(zip(description, genres)), columns =['Description', 'Genre'])

In [17]:
df['Description'] = df['Description'].apply(clean_review)
df

Unnamed: 0,Description,Genre
0,l r brane love life car apart job especi girlf...,thriller
1,spain march quico naughti child three belong w...,comedy
2,one year life albin famili shepherd north tran...,documentary
3,father die spoken brother year seriou cancer d...,drama
4,known intern martial art superstar bruce lee a...,drama
...,...,...
54195,cover multipl genr tale light dark antholog we...,horror
54196,alic cora munro attempt find father british of...,western
54197,movi year make oliv twist art dodger fagin lov...,adult
54198,popular mysteri rock j mike mallard askew broa...,drama


In [18]:
df.to_csv('Test_Data.csv', index=False)