# A Machine Learning Model That Can Predict The Genre Of A Movie
###  [Datasets🗂📝](https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb)

In [194]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn import set_config
set_config(display="diagram")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV

In [196]:
train_df = pd.read_csv("data/train_data.txt",sep=":::",header=None,engine="python")

In [198]:
train_df = train_df.drop(columns=[0],axis=1)
train_df = train_df.head(20)

In [199]:
train_df.head()

Unnamed: 0,1,2,3
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [201]:
train_df.rename(columns={1:"movie",2:"genre",3:"overview"},inplace=True)

In [202]:
train_df.describe()

Unnamed: 0,movie,genre,overview
count,20,20,20
unique,20,10,20
top,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
freq,1,6,1


In [206]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie     20 non-null     object
 1   genre     20 non-null     object
 2   overview  20 non-null     object
dtypes: object(3)
memory usage: 612.0+ bytes


In [210]:
train_df.isna().sum()

movie       0
genre       0
overview    0
dtype: int64

In [212]:
train_df.duplicated().sum()

0

In [215]:
train_df.genre = train_df.genre.apply(lambda x:x.lower())
train_df.overview = train_df.overview.apply(lambda x:x.lower())

### Remove WhiteSpace

In [219]:
def remove_whitespace(text):
    return text.str.strip()

In [221]:
train_df = train_df.apply(remove_whitespace)

In [226]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...
1,Cupid (1997),thriller,a brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,to help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,the film's title refers not only to the un-rec...


### Remove Punctuation

In [229]:
def remove_punctuation(text):
    return text.str.replace('[^\w\s]', '', regex=True)


In [231]:
train_df = train_df.apply(remove_punctuation)

### Word Tokenize

In [235]:
def word_token(text):
    return text.apply(lambda x: x.split())

In [237]:
train_df = train_df.apply(word_token)

In [241]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,"[Oscar, et, la, dame, rose, 2009]",[drama],"[listening, in, to, a, conversation, between, ..."
1,"[Cupid, 1997]",[thriller],"[a, brother, and, sister, with, a, past, inces..."
2,"[Young, Wild, and, Wonderful, 1980]",[adult],"[as, the, bus, empties, the, students, for, th..."
3,"[The, Secret, Sin, 1915]",[drama],"[to, help, their, unemployed, father, make, en..."
4,"[The, Unrecovered, 2007]",[drama],"[the, films, title, refers, not, only, to, the..."


In [243]:
train_df["text"] = train_df.overview + train_df.movie


In [245]:
train_df = train_df[["genre","text"]]
train_df.head()

Unnamed: 0,genre,text
0,[drama],"[listening, in, to, a, conversation, between, ..."
1,[thriller],"[a, brother, and, sister, with, a, past, inces..."
2,[adult],"[as, the, bus, empties, the, students, for, th..."
3,[drama],"[to, help, their, unemployed, father, make, en..."
4,[drama],"[the, films, title, refers, not, only, to, the..."


### Remove StopWords

In [248]:
def remove_stopwords(text):
    return text.apply(lambda x: [word for word in x if word not in stopwords.words("english")])

In [250]:
train_df = train_df.apply(remove_stopwords)

In [266]:
lb = LabelEncoder()
genre_df  = train_df.genre
genre_df= genre_df.apply(lambda x: ' '.join(x))

genre_df = lb.fit_transform(genre_df)

In [268]:
genre_df

array([5, 9, 0, 5, 5, 4, 2, 3, 7, 6, 4, 5, 4, 9, 5, 5, 2, 4, 8, 1])

In [270]:
ps = PorterStemmer()

In [272]:
def stemming(text):
    return text.apply(lambda x: [ps.stem(word) for word in x ]) 

In [274]:
train_df = train_df.apply(stemming)

In [296]:
df = pd.concat([train_df,pd.DataFrame(genre_df)],axis=1)
df.rename(columns={0:"genre_lb"},inplace=True)

Unnamed: 0,genre,text,genre_lb
0,[drama],"[listen, convers, doctor, parent, 10yearold, o...",5
1,[thriller],"[brother, sister, past, incestu, relationship,...",9
2,[adult],"[bu, empti, student, field, trip, museum, natu...",0
3,[drama],"[help, unemploy, father, make, end, meet, edit...",5
4,[drama],"[film, titl, refer, unrecov, bodi, ground, zer...",5


In [300]:
df['text'] = df['text'].apply(lambda x: ' '.join(x))
df['genre'] = df['genre'].apply(lambda x: ' '.join(x))

In [302]:
df.head()

Unnamed: 0,genre,text,genre_lb
0,drama,listen convers doctor parent 10yearold oscar l...,5
1,thriller,brother sister past incestu relationship curre...,9
2,adult,bu empti student field trip museum natur histo...,0
3,drama,help unemploy father make end meet edith twin ...,5
4,drama,film titl refer unrecov bodi ground zero also ...,5


### Export Clean Data

In [172]:
train_df.to_csv("data/clean_train_df.csv")

## Import Clean Data

In [175]:
new_train_df = pd.read_csv("data/clean_train_df.csv")

In [176]:
del new_train_df["Unnamed: 0"]

In [31]:
new_train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,listen convers doctor parent 10yearold oscar l...
1,Cupid (1997),24,brother sister past incestu relationship curre...
2,"Young, Wild and Wonderful (1980)",1,bu empti student field trip museum natur histo...
3,The Secret Sin (1915),8,help unemploy father make end meet edith twin ...
4,The Unrecovered (2007),8,film titl refer unrecov bodi ground zero also ...


### Vector Transformation

In [298]:
tfidf = TfidfVectorizer(max_features=6000)

In [304]:
x = tfidf.fit_transform(df.text).toarray()
x

array([[0.        , 0.12430076, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.10293862, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.26239318],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [308]:
y = df.genre_lb.values
y

array([5, 9, 0, 5, 5, 4, 2, 3, 7, 6, 4, 5, 4, 9, 5, 5, 2, 4, 8, 1])

In [310]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [312]:
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [314]:
y_pred = mnb.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred,average="macro")
print(f"accuracy {accuracy} , precision {precision}")
       

accuracy 0.5 , precision 0.16666666666666666


  _warn_prf(average, modifier, msg_start, len(result))


In [316]:
x = train_df.drop(columns=["genre","genre_lb"],axis=1)
y = train_df.genre_lb

KeyError: "['genre_lb'] not found in axis"

In [74]:
x.head()

Unnamed: 0,movie,overview
0,Oscar et la dame rose (2009),Listening in to a conversation between his do...
1,Cupid (1997),A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...
3,The Secret Sin (1915),To help their unemployed father make ends mee...
4,The Unrecovered (2007),The film's title refers not only to the un-re...


In [76]:
def remove_whitespace(text):
    return text.str.strip()

def remove_punctuation(text):
    return text.str.replace('[^\w\s]', '', regex=True)

def word_token(text):
    return text.apply(lambda x: x.split())
    
def remove_stopwords(text):
    return text.apply(lambda x: [word for word in x if word not in stopwords.words("english")])


ps = PorterStemmer()
def stemming(text):
    return text.apply(lambda x: [ps.stem(word) for word in x ]) 

def sparse_to_array(sparse_matrix):
    return sparse_matrix.toarray()

In [78]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)
y_train

14500    24
32204    17
27379     8
7238      7
42659     5
         ..
11284     5
44732     8
38158     8
860       8
15795     7
Name: genre_lb, Length: 40660, dtype: int32

## PipeLine

In [81]:
text_processing = Pipeline([
    ("remove_whitespace", FunctionTransformer(remove_whitespace, validate=False)),
    ('remove_punctuation', FunctionTransformer(remove_punctuation, validate=False)),
    ("word_token", FunctionTransformer(word_token, validate=False)),
    ("stopwords", FunctionTransformer(remove_stopwords, validate=False)),
    ("stemming", FunctionTransformer(stemming, validate=False)),
    ("TfidfVectorizer", TfidfVectorizer(max_features=6000)),
    ('toarray', FunctionTransformer(sparse_to_array, validate=False))  
])

tranform  = ColumnTransformer([
    ("vector", text_processing, "overview")
])

model = MultinomialNB()

# Apply the transformation to the data

In [83]:
pipe = make_pipeline(tranform, model)

In [85]:
pipe.fit(x_train,y_train)

AttributeError: 'list' object has no attribute 'lower'

In [232]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB

# Assume train_df is your DataFrame
# train_df = pd.read_csv('your_dataset.csv')

# Define your custom functions
def remove_punctuation(text):
    return text.str.replace('[^\w\s]', '', regex=True)

def word_token(text):
    return text.apply(lambda x: x.split())

def remove_stopwords(text):
    print()
    stopwords_list = set(['the', 'is', 'in', 'and', 'to', 'with'])
    return text.apply(lambda x: [word for word in x if word not in stopwords_list])

def stemming(text):
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    return text.apply(lambda x: [stemmer.stem(word) for word in x])

def sparse_to_array(sparse_matrix):
    return sparse_matrix.toarray()

# Create individual pipelines for each text processing step
text_processing_pipeline = Pipeline([
    ('remove_punctuation', FunctionTransformer(remove_punctuation, validate=False)),
    ('word_token', FunctionTransformer(word_token, validate=False)),
    ('remove_stopwords', FunctionTransformer(remove_stopwords, validate=False)),
    ('stemming', FunctionTransformer(stemming, validate=False)),
])

# Combine the text processing steps and apply TF-IDF
full_pipeline = Pipeline([
    ('text_processing', text_processing_pipeline),
    ('tfidf', TfidfVectorizer(max_features=6000, tokenizer=lambda x: x, preprocessor=lambda x: x)),
    ('toarray', FunctionTransformer(sparse_to_array, validate=False))  # Convert sparse matrix to array
])

# Define the ColumnTransformer to apply the full pipeline to the text column
preprocessor = ColumnTransformer([
    ('tfidf_pipeline', full_pipeline, 'overview')
])

# Combine the preprocessor with the classifier
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB())
])

# Fit the pipeline to the training data
model_pipeline.fit(train_df[['overview']], train_df['genre'])


47289    [In, Brétema, on, the, Atlantic, coast, there,...
11145    [A, secret, recipe, book, for, making, great, ...
446      [SADDAM, is, the, story, of, two, men, Antonio...
11315    [Traveling, is, the, coming, of, adult, age, s...
12851    [A, popular, teenage, girl, Holly, struggles, ...
11483    [Most, of, the, trade, reviewers, of, the, tim...
25764    [Cherry, Tobacco, is, about, a, smalltown, gir...
47961    [With, raven, gone, and, the, horse, thief, un...
28192    [Gálfy, left, remarkable, traces, behind, wher...
25621    [The, third, and, final, installment, in, the,...
50447    [Rosa, 23, years, old, works, as, a, maid, for...
23203    [On, The, Hush, Films, brings, the, most, cont...
5948     [Marcus, doesnt, trust, what, Lucien, said, ab...
48391    [This, is, docufiction, the, story, of, Indian...
53857    [Last, Saturday, HannoverLinden, presented, hi...
51008    [Amber, Tamra, and, Dave, make, up, the, indep...
44006    [In, an, homage, to, Alfred, Hitchcocks, Verti.

