In [338]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [214]:
train_df=pd.read_csv("train_data.txt", sep=":::", engine="python", names=["id","title","genre","description"])
test_df=pd.read_csv("train_data.txt", sep=":::", engine="python", names=["id","title","genre","description"])

In [215]:
test_df

Unnamed: 0,id,title,genre,description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [216]:
train_df.isnull().sum()

id             0
title          0
genre          0
description    0
dtype: int64

In [217]:
train_df.dtypes

id              int64
title          object
genre          object
description    object
dtype: object

In [218]:
train_df["movie_input"]=train_df["title"]+" "+train_df["description"]
test_df["movie_input"]=test_df["title"]+" "+test_df["description"]

In [219]:
def text_lower(movie_input):
    movie_input=movie_input.lower()
    movie_input=re.sub(r'[^a-zA-Z0-9\s]'," ", movie_input)
    
    return movie_input.strip()
train_df["movie_input"]=train_df["movie_input"].apply(text_lower)
test_df["movie_input"]=test_df["movie_input"].apply(text_lower)

In [220]:
test_df

Unnamed: 0,id,title,genre,description,movie_input
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,oscar et la dame rose 2009 listening in to...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,cupid 1997 a brother and sister with a pas...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,young wild and wonderful 1980 as the bus ...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,the secret sin 1915 to help their unemploy...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the unrecovered 2007 the film s title refe...
...,...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...,bonino 1953 this short lived nbc live sit...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...,dead girls don t cry the next generat...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g...",ronald goedemondt ze bestaan echt 2008 ze...
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...,make your own bed 1944 walter and vivian l...


In [221]:
train_df=train_df.drop(columns=["id","title","description"])
test_df=test_df.drop(columns=["id","title","description"])

In [224]:
test_df

Unnamed: 0,genre,movie_input
0,drama,oscar et la dame rose 2009 listening in to...
1,thriller,cupid 1997 a brother and sister with a pas...
2,adult,young wild and wonderful 1980 as the bus ...
3,drama,the secret sin 1915 to help their unemploy...
4,drama,the unrecovered 2007 the film s title refe...
...,...,...
54209,comedy,bonino 1953 this short lived nbc live sit...
54210,horror,dead girls don t cry the next generat...
54211,documentary,ronald goedemondt ze bestaan echt 2008 ze...
54212,comedy,make your own bed 1944 walter and vivian l...


In [228]:
train_df["genre"].nunique()

27

In [229]:
encoder=LabelEncoder()

In [256]:
y_train=encoder.fit_transform(train_df["genre"])
y_test=encoder.transform(test_df["genre"])

In [258]:
test_df

Unnamed: 0,genre,movie_input,label_encoded
0,drama,oscar et la dame rose 2009 listening in to...,8
1,thriller,cupid 1997 a brother and sister with a pas...,24
2,adult,young wild and wonderful 1980 as the bus ...,1
3,drama,the secret sin 1915 to help their unemploy...,8
4,drama,the unrecovered 2007 the film s title refe...,8
...,...,...,...
54209,comedy,bonino 1953 this short lived nbc live sit...,5
54210,horror,dead girls don t cry the next generat...,13
54211,documentary,ronald goedemondt ze bestaan echt 2008 ze...,7
54212,comedy,make your own bed 1944 walter and vivian l...,5


In [260]:
train_df.drop(columns=["genre"])
test_df.drop(columns=["genre"])

Unnamed: 0,movie_input,label_encoded
0,oscar et la dame rose 2009 listening in to...,8
1,cupid 1997 a brother and sister with a pas...,24
2,young wild and wonderful 1980 as the bus ...,1
3,the secret sin 1915 to help their unemploy...,8
4,the unrecovered 2007 the film s title refe...,8
...,...,...
54209,bonino 1953 this short lived nbc live sit...,5
54210,dead girls don t cry the next generat...,13
54211,ronald goedemondt ze bestaan echt 2008 ze...,7
54212,make your own bed 1944 walter and vivian l...,5


In [262]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   genre          54214 non-null  object
 1   movie_input    54214 non-null  object
 2   label_encoded  54214 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 1.0+ MB


In [349]:
vectorizer= TfidfVectorizer(max_features=5000,  ngram_range=(1,2), stop_words='english')
X_train=vectorizer.fit_transform(train_df["movie_input"])
X_test=vectorizer.transform(test_df["movie_input"])

In [350]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((54214, 5000), (54214,), (54214, 5000), (54214,))

In [351]:
model=LinearSVC(class_weight="balanced")

In [352]:
model.fit(X_train, y_train)



In [353]:
y_pred=model.predict(X_test)

In [368]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.89      0.73      1315
           1       0.69      0.99      0.82       590
           2       0.62      0.97      0.75       775
           3       0.65      0.99      0.79       498
           4       0.58      0.99      0.73       265
           5       0.78      0.66      0.71      7447
           6       0.54      0.99      0.70       505
           7       0.88      0.80      0.84     13096
           8       0.85      0.61      0.71     13613
           9       0.52      0.95      0.68       784
          10       0.57      1.00      0.72       323
          11       0.89      1.00      0.94       194
          12       0.65      1.00      0.79       243
          13       0.77      0.94      0.85      2204
          14       0.71      0.99      0.83       731
          15       0.64      1.00      0.78       277
          16       0.62      1.00      0.77       319
          17       0.73    

In [378]:
import joblib
joblib.dump(model, "genre_model.pkl")

['genre_model.pkl']

In [380]:
joblib.dump(vectorizer, "movie_vectorizer.pkl")

['movie_vectorizer.pkl']