# A Machine Learning Model That Can Predict The Genre Of A Movie
###  [Datasets🗂📝](https://www.kaggle.com/datasets/hijest/genre-classification-dataset-imdb)

In [61]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,precision_score

In [109]:
train_df = pd.read_csv("data/train_data.txt",sep=":::",header=None,engine="python")

In [110]:
test_df = pd.read_csv("data/test_data.txt",sep=":::",header=None,engine="python")

In [111]:
train_df = train_df.drop(columns=[0],axis=1)
train_df

Unnamed: 0,1,2,3
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [112]:
test_df = test_df.drop(columns=[0],axis=1)
test_df

Unnamed: 0,1,2
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [114]:
train_df.rename(columns={1:"movie",2:"genre",3:"overview"},inplace=True)
test_df.rename(columns={1:"movie",2:"overview"},inplace=True)

In [119]:
train_df.describe()

Unnamed: 0,movie,genre,overview
count,54214,54214,54214
unique,54214,27,54086
top,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [121]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie     54214 non-null  object
 1   genre     54214 non-null  object
 2   overview  54214 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [123]:
train_df.isna().sum()

movie       0
genre       0
overview    0
dtype: int64

In [125]:
train_df.duplicated().sum()

0

In [127]:
train_df.genre[1]

' thriller '

### Remove WhiteSpace

In [130]:
def remove_whitespace(col):
    return col.strip()

In [132]:
train_df.genre = train_df.genre.apply(remove_whitespace)
train_df.movie = train_df.movie.apply(remove_whitespace)
train_df.overview = train_df.overview.apply(remove_whitespace)

In [134]:
test_df.movie = test_df.movie.apply(remove_whitespace)
test_df.overview = test_df.overview.apply(remove_whitespace)

In [140]:
lb = LabelEncoder()

In [142]:
train_df.genre = lb.fit_transform(train_df.genre)
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,Listening in to a conversation between his doc...
1,Cupid (1997),24,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",1,As the bus empties the students for their fiel...
3,The Secret Sin (1915),8,To help their unemployed father make ends meet...
4,The Unrecovered (2007),8,The film's title refers not only to the un-rec...


### Remove Punctuation

In [149]:
def remove_punctuation(col):

    text = "".joi[word for word in col if word not in string.punctuation]
    return text
    col = col.lower()
    for i in string.punctuation:
        col = col.replace(i,"")
    return col

In [151]:
train_df.overview.apply(remove_punctuation)

0        [L, i, s, t, e, n, i, n, g,  , i, n,  , t, o, ...
1        [A,  , b, r, o, t, h, e, r,  , a, n, d,  , s, ...
2        [A, s,  , t, h, e,  , b, u, s,  , e, m, p, t, ...
3        [T, o,  , h, e, l, p,  , t, h, e, i, r,  , u, ...
4        [T, h, e,  , f, i, l, m, s,  , t, i, t, l, e, ...
                               ...                        
54209    [T, h, i, s,  , s, h, o, r, t, l, i, v, e, d, ...
54210    [T, h, e,  , N, E, X, T,  , G, e, n, e, r, a, ...
54211    [Z, e,  , b, e, s, t, a, a, n,  , e, c, h, t, ...
54212    [W, a, l, t, e, r,  , a, n, d,  , V, i, v, i, ...
54213    [O, n,  , L, a, b, o, r,  , D, a, y,  , W, e, ...
Name: overview, Length: 54214, dtype: object

In [28]:
train_df.overview[1]

'a brother and sister with a past incestuous relationship have a current murderous relationship he murders the women who reject him and she murders the women who get too close to him'

In [29]:
test_df.overview = test_df.overview.apply(remove_punctuation)

# word tokenize

In [31]:
from nltk.tokenize import word_tokenize

In [32]:
def word_token(col):
    return word_tokenize(col)

In [33]:
train_df.overview = train_df.overview.apply(word_token)

In [34]:
test_df.overview = test_df.overview.apply(word_token)

In [35]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,"[listening, in, to, a, conversation, between, ..."
1,Cupid (1997),24,"[a, brother, and, sister, with, a, past, inces..."
2,"Young, Wild and Wonderful (1980)",1,"[as, the, bus, empties, the, students, for, th..."
3,The Secret Sin (1915),8,"[to, help, their, unemployed, father, make, en..."
4,The Unrecovered (2007),8,"[the, films, title, refers, not, only, to, the..."


In [36]:
def remove_stopwords(col):
    l = []
    for i in col:
        if i not in stopwords.words("english"):
          l.append(i)
    return l

In [37]:
train_df.overview = train_df.overview.apply(remove_stopwords)

In [38]:
test_df.overview = test_df.overview.apply(remove_stopwords)


In [39]:
from nltk.stem import PorterStemmer
ps= PorterStemmer()

In [44]:
def stemming(col):
    l =[]
    for i in col:
        l.append(ps.stem(i))
    return " ".join(l)

In [46]:
train_df.head()

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,"[listening, conversation, doctor, parents, 10y..."
1,Cupid (1997),24,"[brother, sister, past, incestuous, relationsh..."
2,"Young, Wild and Wonderful (1980)",1,"[bus, empties, students, field, trip, museum, ..."
3,The Secret Sin (1915),8,"[help, unemployed, father, make, ends, meet, e..."
4,The Unrecovered (2007),8,"[films, title, refers, unrecovered, bodies, gr..."


In [48]:
train_df.overview = train_df.overview.apply(stemming)
test_df.overview = test_df.overview.apply(stemming)



In [49]:
#train_df.to_csv("data/train_df.csv")

In [50]:
train_df

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,listen convers doctor parent 10yearold oscar l...
1,Cupid (1997),24,brother sister past incestu relationship curre...
2,"Young, Wild and Wonderful (1980)",1,bu empti student field trip museum natur histo...
3,The Secret Sin (1915),8,help unemploy father make end meet edith twin ...
4,The Unrecovered (2007),8,film titl refer unrecov bodi ground zero also ...
...,...,...,...
54209,"""Bonino"" (1953)",5,shortliv nbc live sitcom center bonino worldfa...
54210,Dead Girls Don't Cry (????),13,next gener exploit sister kapa bay soror hous ...
54211,Ronald Goedemondt: Ze bestaan echt (2008),7,ze bestaan echt standup comedi grow face fear ...
54212,Make Your Own Bed (1944),5,walter vivian live countri difficult time keep...


In [51]:
test_df.to_csv("data/clean_test_df.csv")

In [52]:
train_df.to_csv("data/clean_train_df.csv")

In [54]:
train_df = pd.read_csv("data/clean_train_df.csv")
test_df= pd.read_csv("data/clean_test_df.csv")

In [55]:
del train_df["Unnamed: 0"]

In [56]:
del test_df["Unnamed: 0"]

In [57]:
train_df.head(5)

Unnamed: 0,movie,genre,overview
0,Oscar et la dame rose (2009),8,listen convers doctor parent 10yearold oscar l...
1,Cupid (1997),24,brother sister past incestu relationship curre...
2,"Young, Wild and Wonderful (1980)",1,bu empti student field trip museum natur histo...
3,The Secret Sin (1915),8,help unemploy father make end meet edith twin ...
4,The Unrecovered (2007),8,film titl refer unrecov bodi ground zero also ...


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,precision_score

In [62]:
tfidf = TfidfVectorizer(max_features=6000)

In [63]:
x = tfidf.fit_transform(train_df.overview).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
y = train_df.genre.values
y

array([ 8, 24,  1, ...,  7,  5, 12], dtype=int64)

In [65]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [66]:
mnb = MultinomialNB()
gnb = GaussianNB()

In [67]:
mnb.fit(x_train,y_train)
gnb.fit(x_train,y_train)

In [68]:
y_pred1 = mnb.predict(x_test)
y_pred2 = gnb.predict(x_test)

In [69]:
accuracy1 = accuracy_score(y_test,y_pred1)
accuracy2 = accuracy_score(y_test,y_pred2)
print(accuracy1)
print(accuracy2)

0.5160011067047865
0.22004980171539243


In [70]:
p1 = precision_score(y_test,y_pred1,average="macro")
p2 = precision_score(y_test,y_pred2,average="macro")
print(p1)
print(p2)

0.3378251299731216
0.14439466436477957


  _warn_prf(average, modifier, msg_start, len(result))
