# A Machine Learning Model That Can Predict The Genre Of A Movie
###  [Datasets🗂📝](https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb)

In [493]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn import set_config
set_config(display="diagram")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [495]:
train_df = pd.read_csv("data/train_data.txt",sep=":::",header=None,engine="python")

In [496]:
train_df = train_df.drop(columns=[0],axis=1)

In [497]:
train_df.head()

Unnamed: 0,1,2,3
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [498]:
train_df.rename(columns={1:"movie",2:"genre",3:"overview"},inplace=True)

In [502]:
train_df.describe()

Unnamed: 0,movie,genre,overview
count,54214,54214,54214
unique,54214,27,54086
top,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [507]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie     54214 non-null  object
 1   genre     54214 non-null  object
 2   overview  54214 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [509]:
train_df.isna().sum()

movie       0
genre       0
overview    0
dtype: int64

In [10]:
train_df.duplicated().sum()

0

In [511]:
train_df.genre = train_df.genre.apply(lambda x:x.lower())
train_df.overview = train_df.overview.apply(lambda x:x.lower())

In [515]:
train_df = df_pipe
train_df

Unnamed: 0,genre,text
0,drama,Listening in to a conversation between his do...
3,drama,To help their unemployed father make ends mee...
4,drama,The film's title refers not only to the un-re...
11,drama,"Danny, dying of Aids, returns home for his la..."
14,drama,Amanda Wingfield dominates her children with ...
...,...,...
54189,romance,Mary-Kate and Ashley have boy trouble: they d...
54203,fantasy,100 CUPBOARDS chronicles the fantastical jour...
54206,music,"In November 2011, The Devin Townsend Project ..."
54208,family,"First story, ""Temptation"": Two kids find a wa..."


### Remove WhiteSpace

In [518]:
def remove_whitespace(text):
    return text.str.strip()

In [520]:
train_df = train_df.apply(remove_whitespace)

In [522]:
train_df.head(5)

Unnamed: 0,genre,text
0,drama,Listening in to a conversation between his doc...
3,drama,To help their unemployed father make ends meet...
4,drama,The film's title refers not only to the un-rec...
11,drama,"Danny, dying of Aids, returns home for his las..."
14,drama,Amanda Wingfield dominates her children with h...


### Remove Punctuation

In [525]:
def remove_punctuation(text):
    return text.str.replace('[^\w\s]', '', regex=True)


In [527]:
train_df = train_df.apply(remove_punctuation)

### Word Tokenize

In [530]:
def word_token(text):
    return text.apply(lambda x: x.split())

In [532]:
train_df = train_df.apply(word_token)

In [534]:
train_df.head(5)

Unnamed: 0,genre,text
0,[drama],"[Listening, in, to, a, conversation, between, ..."
3,[drama],"[To, help, their, unemployed, father, make, en..."
4,[drama],"[The, films, title, refers, not, only, to, the..."
11,[drama],"[Danny, dying, of, Aids, returns, home, for, h..."
14,[drama],"[Amanda, Wingfield, dominates, her, children, ..."


In [536]:
train_df["text"] = train_df.overview + train_df.movie


AttributeError: 'DataFrame' object has no attribute 'overview'

In [24]:
train_df = train_df[["genre","text"]]
train_df.head()

Unnamed: 0,genre,text
0,[drama],"[listening, in, to, a, conversation, between, ..."
1,[thriller],"[a, brother, and, sister, with, a, past, inces..."
2,[adult],"[as, the, bus, empties, the, students, for, th..."
3,[drama],"[to, help, their, unemployed, father, make, en..."
4,[drama],"[the, films, title, refers, not, only, to, the..."


### Remove StopWords

In [538]:
def remove_stopwords(text):
    return text.apply(lambda x: [word for word in x if word not in stopwords.words("english")])

In [540]:
train_df = train_df.apply(remove_stopwords)

KeyboardInterrupt: 

In [None]:
lb = LabelEncoder()
genre_df  = train_df.genre
genre_df= genre_df.apply(lambda x: ' '.join(x))

genre_df = lb.fit_transform(genre_df)

In [None]:
genre_df

In [None]:
ps = PorterStemmer()

### Stemming

In [None]:
def stemming(text):
    return text.apply(lambda x: [ps.stem(word) for word in x ]) 

In [None]:
train_df = train_df.apply(stemming)

In [None]:
df = pd.concat([train_df,pd.DataFrame(genre_df)],axis=1)
df.rename(columns={0:"genre_lb"},inplace=True)

In [34]:
df['text'] = df['text'].apply(lambda x: ' '.join(x))
df['genre'] = df['genre'].apply(lambda x: ' '.join(x))

In [35]:
df.head()

Unnamed: 0,genre,text,genre_lb
0,drama,listen convers doctor parent 10yearold oscar l...,8
1,thriller,brother sister past incestu relationship curre...,24
2,adult,bu empti student field trip museum natur histo...,1
3,drama,help unemploy father make end meet edith twin ...,8
4,drama,film titl refer unrecov bodi ground zero also ...,8


### Export Clean Data

In [50]:
# df.to_csv("data/clean_train_df.csv")

## Import Clean Data

In [244]:
new_train_df = pd.read_csv("data/clean_train_df.csv")

In [245]:
del new_train_df["Unnamed: 0"]

In [248]:
new_train_df.head(5)

Unnamed: 0,genre,text,genre_lb
0,drama,listen convers doctor parent 10yearold oscar l...,8
1,thriller,brother sister past incestu relationship curre...,24
2,adult,bu empti student field trip museum natur histo...,1
3,drama,help unemploy father make end meet edith twin ...,8
4,drama,film titl refer unrecov bodi ground zero also ...,8


### Vector Transformation

In [251]:
tfidf = TfidfVectorizer(max_features=6000)

In [253]:
x = tfidf.fit_transform(new_train_df.text).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [263]:
y = new_train_df.genre_lb.values
y

array([ 8, 24,  1, ...,  7,  5, 12], dtype=int64)

In [265]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [266]:
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [268]:
y_pred = mnb.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred,average="macro")
print(f"accuracy {accuracy} , precision {precision}")
       

accuracy 0.49884718251406435 , precision 0.44762048726543713


  _warn_prf(average, modifier, msg_start, len(result))


In [107]:
df_pipe = pd.read_csv("data/train_data.txt",sep=":::",header=None,engine="python")

In [108]:
df_pipe.rename(columns={1:"movie",2:"genre",3:"overview"},inplace=True)

In [109]:
del df_pipe[0]

In [110]:
df_pipe["text"] = df_pipe.overview + df_pipe.movie
df_pipe=df_pipe[["genre","text"]]

In [459]:
drama = df_pipe[df_pipe.genre == " drama "][:500]
documentary =  df_pipe[df_pipe.genre ==" documentary "][:500]
comedy = df_pipe[df_pipe.genre ==" comedy "][:500]
short = df_pipe[df_pipe.genre ==" short "][:500]
thriller = df_pipe[df_pipe.genre ==" thriller "][:500]
action = df_pipe[df_pipe.genre ==" action "][:500]
horror = df_pipe[df_pipe.genre ==" horror "][:500]
western = df_pipe[df_pipe.genre ==" western "][:400]

In [461]:
df_ =df_pipe[~df_pipe.genre.isin([" horror "," drama ", " documentary ", " comedy ", " short ", " thriller ", " action ", " western "])]

In [463]:
df_pipe = pd.concat([drama,documentary,comedy,short,thriller,action,horror,western,df_])

In [465]:
x_ = df_pipe.drop(columns=["genre"])
y_ = df_pipe.genre

In [467]:
x_.head()

Unnamed: 0,text
0,Listening in to a conversation between his do...
3,To help their unemployed father make ends mee...
4,The film's title refers not only to the un-re...
11,"Danny, dying of Aids, returns home for his la..."
14,Amanda Wingfield dominates her children with ...


In [469]:
from scipy.sparse import csr_matrix

def sparse_to_csr(sparse_matrix):
    if isinstance(sparse_matrix, csr_matrix):
        return sparse_matrix
    return csr_matrix(sparse_matrix)

In [473]:
x_train,x_test,y_train,y_test = train_test_split(x_,y_,test_size=0.1,random_state=4)
x_test.shape,x_train.shape

((1275, 1), (11468, 1))

## PipeLine

In [476]:
def string_fun(text):
    return text.astype(str).agg(''.join, axis=0)

In [478]:
text_processing = Pipeline([
    ("remove_whitespace", FunctionTransformer(remove_whitespace, validate=False)),
    ('remove_punctuation', FunctionTransformer(remove_punctuation, validate=False)),
    ("word_token", FunctionTransformer(word_token, validate=False)),
    ("stopwords", FunctionTransformer(remove_stopwords, validate=False)),
    ("stemming", FunctionTransformer(stemming, validate=False)),
    ("string",FunctionTransformer(string_fun,validate=False)),
    ("TfidfVectorizer", TfidfVectorizer()),
    ('toarray', FunctionTransformer(sparse_to_csr, validate=False))  
])

tranform  = ColumnTransformer([
    ("vector", text_processing, "text")
])

model = MultinomialNB()

# Apply the transformation to the data

In [480]:
pipe = make_pipeline(tranform, model)

In [482]:
pipe.fit(x_train,y_train)

In [557]:
import pickle
import dill

In [559]:
#pickle.dump(pipe,open("pipe.pkl","wb"))
dill.dump(pipe,open("pipe1.pkl","wb"))


In [485]:
accuracy_score(y_test, pipe.predict(x_test))

0.30823529411764705

In [491]:
precision_score(y_test, pipe.predict(x_test),average="micro")

0.30823529411764705

In [None]:
y_test.value_counts()