# Imports and data loading

In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
#from src.features import clean_text
from src.data.text_processing import clean_text, filter_rare, replace_labels, lemming, remove_stopwords

In [2]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('corpora/wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Error loading corpora/wordnet: Package 'corpora/wordnet'
[nltk_data]     not found in index


False

In [3]:
df = pd.read_csv('../data/raw/wiki_movie_plots_deduped.csv')

# Initial analysis

In [4]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [47]:
df.shape

(34886, 8)

In [48]:
df.dtypes

Release Year         int64
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
dtype: object

In [49]:
df['Release Year'] = df['Release Year'].astype(int)

In [50]:
df.isna().sum()

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

In [51]:
df['Genre'].value_counts()

unknown                          6083
drama                            5964
comedy                           4379
horror                           1167
action                           1098
                                 ... 
cbc-tv miniseries                   1
bio-drama                           1
national film board docudrama       1
cult drama                          1
horror romantic comedy              1
Name: Genre, Length: 2265, dtype: int64

It seems like there are quite a lot of very rare classes. and many of them are a combination of other like horror romantic comedy

In [52]:
len(df['Genre'].unique())/len(df)

0.06492575818379866

In [53]:
df['Genre'].value_counts()[df['Genre'].value_counts() > 10].sum()/len(df)

0.8980679928911312

10% of dataset has very rare classes.

In [54]:
df['Genre'].value_counts()[df['Genre'].value_counts() < 10]

adventure, fantasy               9
action-adventure, fantasy        9
political drama                  9
musical, comedy                  9
comedy, drama, romance           9
                                ..
cbc-tv miniseries                1
bio-drama                        1
national film board docudrama    1
cult drama                       1
horror romantic comedy           1
Name: Genre, Length: 2093, dtype: int64

# Preprocessing and cleaning

In [56]:
df['Genre'] = df['Genre'].apply(clean_text)

In [57]:
df['Genre'] = df['Genre'].apply(lambda x: 'unknown' if x == '' else x)

In [58]:
df['Genre'].value_counts()[df['Genre'].value_counts() < 10]

biographical drama                                  9
action adventure fantasy                            9
science fiction thriller                            9
thriller romance                                    9
animation comedy                                    9
                                                   ..
historical biodrama on the early years of hitler    1
short black comedy                                  1
dance film horror from the novel by bram stoker     1
epic drama set years ago in the canadian arctic     1
adapted from the play by alexandre goyette          1
Name: Genre, Length: 1635, dtype: int64

There are already much fewer unfrequent classes - 1635 vs 2093

In [59]:
df['Genre'] = df['Genre'].apply(lambda x: x.split())

Perhaps I could somehow choose some numer of most frequent classes and assign those classes to the rare classes that contain the frequent ones. But I don't know and for now I'll just drop the rare ones.

In [60]:
df['Genre'].value_counts()[df['Genre'].value_counts() > 10]

[unknown]                    6116
[drama]                      5991
[comedy]                     4398
[horror]                     1172
[action]                     1121
                             ... 
[fantasy, romance]             11
[disaster, film]               11
[horror, romance, comedy]      11
[comedy, sports]               11
[screwball, comedy]            11
Name: Genre, Length: 144, dtype: int64

In [61]:
df['Genre'][df['Genre'].apply(lambda x: len(x)>1)].value_counts()

[comedy, drama]                                         556
[crime, drama]                                          499
[romantic, comedy]                                      479
[science, fiction]                                      433
[film, noir]                                            345
                                                       ... 
[tv, miniseries, docudrama]                               1
[religious, drama]                                        1
[drama, based, on, the, novel, by, rohinton, mistry]      1
[drama, based, on, the, novel, by, russell, banks]        1
[adventure, romance, fantasy, film]                       1
Name: Genre, Length: 1651, dtype: int64

In [62]:
df['Genre'].value_counts()[df['Genre'].value_counts() >= 10].sum()/len(df)

0.9203405377515336

In [63]:
df['Genre'].value_counts()[df['Genre'].value_counts() < 10].sum()/len(df)

0.07965946224846643

## Filtering data to get rid of 'unknown' and rare classes

In [66]:
df_filtered = filter_rare(df)

In [67]:
df_filtered['Genre'] = df_filtered['Genre'].apply(lambda x: ' '.join(x))

In [67]:
df_filtered = df_filtered[df_filtered['Genre'] != 'unknown']
df_filtered = df_filtered.reset_index()

In [68]:
df_filtered['Genre'].value_counts()

drama                     5991
comedy                    4398
horror                    1172
action                    1121
thriller                   984
                          ... 
crime action                10
adventure fantasy           10
comedy science fiction      10
epic                        10
disaster                    10
Name: Genre, Length: 150, dtype: int64

In [70]:
replacements = [
    ('animated', 'animation'),
    ('biography', 'biographical'),
    ('biopic', 'biographical'),
    ('com', 'com'),
    ('com', 'comedy'),
    ('docudrama', 'documentary drama'),
    ('dramedy', 'drama comedy'),
    ('sci fi', 'sci_fi'),
    ('science fiction', 'sci_fi'),
    ('film', ''),
    ('world war ii', 'world_ii war'),
    ('rom ', 'romantic '),
    ('romance', 'romantic'),
    ('comedyedy', 'comedy')
]

In [73]:
df_filtered['Genre'] = df_filtered['Genre'].apply(lambda x: replace_labels(x, replacements))

In [None]:
df_filtered.to_csv('../data/interim/data_intermidiate')

# Text processing

For all the models other then LLM I'll perform lemmatizing and removing stop words.

In [74]:
en_stopwords = stopwords.words('english')

In [None]:
df_filtered_processed =df_filtered.copy()

In [76]:
df_filtered_processed['Plot'] = df_filtered_processed['Plot'].apply(clean_text)
df_filtered_processed['Plot'] = df_filtered_processed['Plot'].apply(lambda X: word_tokenize(X))
df_filtered_processed['Plot']=df_filtered_processed['Plot'].apply(lambda x: remove_stopwords(x, en_stopwords))


In [77]:
df_filtered_processed.to_csv('../data/processed/data_processed_no_lemming.csv')

In [None]:
df_filtered_processed['Plot']=df_filtered_processed['Plot'].apply(lemming)
df_filtered_processed.to_csv('../data/processed/data_processed.csv')

In [None]:
labels = df_filtered.Genre.unique()
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [None]:
label2id

{'western': '0',
 'comedy': '1',
 'short': '2',
 'biographical': '3',
 'drama': '4',
 'adventure': '5',
 'horror': '6',
 'crime': '7',
 'drama horror': '8',
 'historical drama': '9',
 'fantasy': '10',
 'epic': '11',
 'historical': '12',
 'comedy short': '13',
 'comedy western': '14',
 'action adventure': '15',
 'romantic drama': '16',
 'mystery': '17',
 'crime drama': '18',
 'romantic': '19',
 'comedy drama': '20',
 'war drama': '21',
 'spy': '22',
 'romantic comedy': '23',
 'propaganda': '24',
 'drama romantic': '25',
 'melodrama': '26',
 'period drama': '27',
 'swashbuckler': '28',
 'drama adventure': '29',
 'crime comedy': '30',
 'documentary': '31',
 'war': '32',
 'fantasy adventure': '33',
 'thriller': '34',
 'mystery thriller': '35',
 'crime thriller': '36',
 'fantasy romantic': '37',
 'comedy romantic': '38',
 'musical': '39',
 'musical comedy': '40',
 'comedy mystery': '41',
 'drama war': '42',
 'horror comedy': '43',
 'drama crime': '44',
 'costume drama': '45',
 'action': '46

There are some classes that are very simmilar - animated and animation, 'biographical', 'biography', 'biopic'; com and comedy, docudrama can be split into documentary and drama, dramedy - to drama and comedy, sci and fi sjhould be merged to sci-fi, as well as science fiction, 'rom', 'romance', 'romantic' should be merged to romantic, world and ii should be merged to 'world war ii' as this is the only case thay appear.