## Feature Engineering

In [32]:
import pandas as pd
import nltk

In [33]:
df = pd.read_pickle('clean_dataset.pkl')
df.head(3)

Unnamed: 0,imdbID,title,year,genre,director,cast,imdbRating,fullplot,language,country,type
2008,35423,Kate & Leopold,2001.0,"Comedy, Fantasy, Romance",James Mangold,"Meg Ryan, Hugh Jackman, Liev Schreiber, Brecki...",6.3,Kate and her actor brother live in N.Y. in the...,"English, French",USA,movie
7011,64994,Larks on a String,1990.0,"Comedy, Drama, Romance",Jir� Menzel,"Rudolf Hrus�nsk�, Vlastimil Brodsk�, V�clav Ne...",7.6,Prague in the early 1950's. Bourgeois elements...,Czech,Czechoslovakia,movie
10044,81145,Me and the Kid,1993.0,"Comedy, Crime, Drama",Dan Curtis,"Danny Aiello, Alex Zuckerman, Joe Pantoliano, ...",5.3,"Two ex-cons, Harry (Aiello) and Roy (Pantolian...",English,USA,movie


In [34]:
#convert the year column into decade 

df['year'] = pd.to_numeric(df['year'], errors='coerce')
df.dropna(subset=['year'], inplace=True)
df['decade'] = df['year'].apply(lambda x: x - x % 10)

# Drop the 'year' column
df.drop(['year'], axis=1, inplace=True)

In [35]:
# converting all the column values into list
def split_strip(row):
    new_row = []
    row = str(row)
    row = row.split(',')
    for i in row:
        i = i.strip()
        i = i.lower()
        new_row.append(i)

    return new_row


In [36]:
for i in df.columns:
    if i not in ['imdbID','title']:
        
        df[i] = df[i].apply(split_strip)

In [37]:
df.head(3)

Unnamed: 0,imdbID,title,genre,director,cast,imdbRating,fullplot,language,country,type,decade
2008,35423,Kate & Leopold,"[comedy, fantasy, romance]",[james mangold],"[meg ryan, hugh jackman, liev schreiber, breck...",[6.3],[kate and her actor brother live in n.y. in th...,"[english, french]",[usa],[movie],[2000.0]
7011,64994,Larks on a String,"[comedy, drama, romance]",[jir� menzel],"[rudolf hrus�nsk�, vlastimil brodsk�, v�clav n...",[7.6],[prague in the early 1950's. bourgeois element...,[czech],[czechoslovakia],[movie],[1990.0]
10044,81145,Me and the Kid,"[comedy, crime, drama]",[dan curtis],"[danny aiello, alex zuckerman, joe pantoliano,...",[5.3],"[two ex-cons, harry (aiello) and roy (pantolia...",[english],[usa],[movie],[1990.0]


In [38]:
## removing the inbetween spacas from director and cast names
def space_remover(row):
    new_row = []
    for i in row:
        i = i.replace(' ','')
        new_row.append(i)
    return new_row

df['director'] = df['director'].apply(space_remover)
df['cast'] = df['cast'].apply(space_remover)

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string




def preprocess_sentences(sentences):
    
    sentences = [''.join([char for char in sentence if char not in string.punctuation]) for sentence in sentences]
    
    
    tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]
    
    
    english_stopwords = set(stopwords.words('english'))
    
    
    processed_sentences = [[word for word in words if word not in english_stopwords ] for words in tokenized_sentences]
    
    
    stemmer = PorterStemmer()
    processed_sentences = [[stemmer.stem(word) for word in words] for words in processed_sentences]
    
    return processed_sentences

df['fullplot'] = df['fullplot'].apply(preprocess_sentences)
df['fullplot'] = df['fullplot'].apply(lambda x: x[0])

In [41]:
df.head(3)

Unnamed: 0,imdbID,title,genre,director,cast,imdbRating,fullplot,language,country,type,decade
2008,35423,Kate & Leopold,"[comedy, fantasy, romance]",[jamesmangold],"[megryan, hughjackman, lievschreiber, breckinm...",[6.3],"[kate, actor, brother, live, ny, 21st, centuri...","[english, french]",[usa],[movie],[2000.0]
7011,64994,Larks on a String,"[comedy, drama, romance]",[jir�menzel],"[rudolfhrus�nsk�, vlastimilbrodsk�, v�clavneck...",[7.6],"[pragu, earli, 1950, bourgeoi, element, reeduc...",[czech],[czechoslovakia],[movie],[1990.0]
10044,81145,Me and the Kid,"[comedy, crime, drama]",[dancurtis],"[dannyaiello, alexzuckerman, joepantoliano, ca...",[5.3],"[two, excon]",[english],[usa],[movie],[1990.0]


In [42]:
## Joining the column together 

df['meta_tags'] = df['genre'] + df['director'] + df['cast'] + df['imdbRating'] + df['language'] + df['country'] + df['type'] +df['decade'] 
df['all tags']  = df['fullplot'] + df['meta_tags']


In [43]:

df.drop(['genre','director','cast','imdbRating','language','country','type','decade'],axis=1,inplace=True)

In [44]:
df['fullplot'] = df['fullplot'].apply(lambda x: ' '.join(x))
df['meta_tags'] = df['meta_tags'].apply(lambda x: ' '.join(x))
df['all tags'] = df['all tags'].apply(lambda x: ' '.join(x))

In [45]:
df.head(3)

Unnamed: 0,imdbID,title,fullplot,meta_tags,all tags
2008,35423,Kate & Leopold,kate actor brother live ny 21st centuri exboyf...,comedy fantasy romance jamesmangold megryan hu...,kate actor brother live ny 21st centuri exboyf...
7011,64994,Larks on a String,pragu earli 1950 bourgeoi element reeduc work ...,comedy drama romance jir�menzel rudolfhrus�nsk...,pragu earli 1950 bourgeoi element reeduc work ...
10044,81145,Me and the Kid,two excon,comedy crime drama dancurtis dannyaiello alexz...,two excon comedy crime drama dancurtis dannyai...


In [46]:
df.to_pickle("preprocessed_dataset.pkl")