# IMPORTING LIBRARIES

In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.tag import pos_tag
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# IMPORTING DATA

In [2]:
data = pd.read_csv("Data/cleaned_data.csv")
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.head(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...
1,1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_...,A white girl (Florence Lawrence) rejects a pro...
2,1908,The Fight for Freedom,American,D. W. Griffith,"Florence Auer, John G. Adolfi",western,https://en.wikipedia.org/wiki/The_Fight_for_Fr...,The film opens in a town on the Mexican border...
3,1912,Dr. Jekyll and Mr. Hyde,American,Lucius Henderson,James Cruze,horror,https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,White-haired Dr. Jekyll has secretly locked hi...
4,1913,Dr. Jekyll and Mr. Hyde,American,Herbert Brenon and Carl Laemmle,King Baggot,horror,https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,Dr. Henry Jekyll (King Baggot) sends a note to...


# PREPROCESSING

In [3]:
x = data['Plot']
y = data['Genre']

## Encoding Categorical Data

In [4]:
le =LabelEncoder()
y = le.fit_transform(y)

## Removing Stopwords

In [5]:
stop_words = set(stopwords.words('english'))
for i in range(len(x)):
    tokens = word_tokenize(x[i])
    filtered = [word for word in tokens if word not in stop_words]
    x[i] = ' '.join(filtered)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i] = ' '.join(filtered)


## Getting POS tag and lemmatization

In [6]:
def get_wordnet_pos(treebank_tag:str):
    if treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

class LemmTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        tokens_tags = pos_tag(tokens)
        return [self.wnl.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tokens_tags]

## Train Test Split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x,y, shuffle=True, stratify=y, random_state=1, test_size=0.2)

## Vectorization

In [8]:
vectorizer = TfidfVectorizer(tokenizer=LemmTokenizer(), sublinear_tf=True)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)



In [11]:
type(x_train), type(x_test), type(y_train), type(y_test)

(scipy.sparse._csr.csr_matrix,
 scipy.sparse._csr.csr_matrix,
 numpy.ndarray,
 numpy.ndarray)

# SAVING DATA

In [None]:
from scipy import sparse

sparse.save_npz("Data/xtrain.npz", x_train)
sparse.save_npz("Data/xtest.npz", x_test)
np.save("Data/ytrain.npy", y_train)
np.save("Data/ytest.npy", y_test)