In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Load the training split. Each record is `id ::: title ::: genre ::: description`.
# The separator regex also swallows the whitespace surrounding `:::`, so titles and
# genre labels are not stored with padding (e.g. 'drama' rather than ' drama ').
# Whitespace at the very end of a line is not touched by the separator.
header = ['id', 'title', 'genre', 'description']
train_movie_data = pd.read_csv(
    '../Datasets/Genre Classification Dataset/train_data.txt',
    header=None,
    names=header,
    sep=r'\s*:::\s*',
    engine='python',  # regex separators need the python parser
).drop(columns='id')  # the id column is not used by the rest of the notebook
display(train_movie_data.head())
train_movie_data.describe()

                                title       genre  \
0       Oscar et la dame rose (2009)       drama    
1                       Cupid (1997)    thriller    
2   Young, Wild and Wonderful (1980)       adult    
3              The Secret Sin (1915)       drama    
4             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


Unnamed: 0,title,genre,description
count,54214,54214,54214
unique,54214,27,54086
top,Nature's Fury: Storm of the Century (2006),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [3]:
# Flag every row whose description occurs more than once (keep=False marks all
# members of a duplicate group), then count how often each repeated text appears.
repeated_description_mask = train_movie_data.duplicated(subset='description', keep=False)
train_movie_data.loc[repeated_description_mask, 'description'].value_counts()

description
Grammy - music award of the American academy of the audio recording, was founded by the American association of sound-recording companies of March, 14, 1958. Grammy is awarded annually as a result of voting by the authorized members of "Recording Academy".                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [4]:
# Keep only the first row for each distinct description (modifies the frame in place).
train_movie_data.drop_duplicates(subset='description', inplace=True)

# Sanity check: the same query as before should now return an empty Series.
remaining_duplicates = train_movie_data.duplicated(subset='description', keep=False)
train_movie_data.loc[remaining_duplicates, 'description'].value_counts()

Series([], Name: count, dtype: int64)

In [5]:
# Target: the genre label of each movie.
y_full = train_movie_data['genre']
# Features: whatever remains once the title and the target column are removed.
X_full = train_movie_data.drop(columns=['title', 'genre'])
print(X_full.head())
y_full.head()

                                         description
0   Listening in to a conversation between his do...
1   A brother and sister with a past incestuous r...
2   As the bus empties the students for their fie...
3   To help their unemployed father make ends mee...
4   The film's title refers not only to the un-re...


0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: genre, dtype: object

In [6]:
# Cache for the default stop-word set so the NLTK list is fetched once, not per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text: str, stop_words=None) -> str:
    """Normalise a free-text description for bag-of-words style vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (anything starting with ``http`` up to the next whitespace);
      3. remove e-mail addresses and ``@mentions``;
      4. replace every character that is not ``a``-``z`` or ``+`` with a space;
      5. split on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : container of str, optional
        Words to drop (anything supporting ``in``). Defaults to NLTK's English
        stop-word list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '', text)
    text = re.sub(r'@\S+', '', text)
    # '+' is inside the character class, so literal plus signs are kept.
    text = re.sub(r'[^a-z+]', ' ', text)
    # str.split() already collapses whitespace runs (newlines were turned into
    # spaces above), so no separate whitespace-normalisation pass is needed.
    return ' '.join(word for word in text.split() if word not in stop_words)


In [None]:
# Clean every description, then persist the result to 'cleaned_data.csv'
# (the frame's index is written as the first column).
cleaned_descriptions = X_full['description'].apply(clean_text)
X_full['description'] = cleaned_descriptions
X_full.to_csv('cleaned_data.csv')
X_full

In [12]:
# Reload the cleaned descriptions from disk; the first CSV column restores the
# original row index. keep_default_na=False stops pandas from converting empty
# fields (a description whose tokens were all removed) or tokens such as "null"
# into float NaN, so every entry in 'description' stays a string.
X_full = pd.read_csv('cleaned_data.csv', index_col=0, keep_default_na=False)
X_full.head()

Unnamed: 0,description
0,listening conversation doctor parents year old...
1,brother sister past incestuous relationship cu...
2,bus empties students field trip museum natural...
3,help unemployed father make ends meet edith tw...
4,film title refers un recovered bodies ground z...


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; random_state fixes the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X_full, y_full, test_size=0.1, random_state=3)
X_train, X_val, y_train, y_val = split_parts
(X_train, X_val, y_train, y_val)

(                                             description
 43932  islamic revolution iran brought twenty five hu...
 29122  deception dallas follows trail intrigue decept...
 41318  experienced perhaps greatest crimes humanity w...
 38106  maheen sana askari faces many challenges life ...
 45113  film made italian television series eight epis...
 ...                                                  ...
 25587  small rural town live akheela family consistin...
 48157  end anton krastev dop wife diana separated iro...
 11520  married couple seeking break hectic lives esca...
 1688   young people spend average hours day cell phon...
 5996   euphoria last relative safety life raft namele...
 
 [48677 rows x 1 columns],
                                              description
 53640  suspended teaching job horrendous behavior out...
 582    slightly used comedy takes place battle ground...
 9118   magician competes international magic competit...
 3817   mountains montenegro people lived s

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with max_features=10000. The vectorizer is fitted on the training
# descriptions only and then applied unchanged to the validation descriptions.
# NOTE: X_train / X_val are rebound from DataFrames to the transformed matrices,
# so this cell expects the DataFrames produced by the split cell as input.
tfidf = TfidfVectorizer(max_features=10000)
train_descriptions, val_descriptions = X_train['description'], X_val['description']
X_train = tfidf.fit_transform(train_descriptions)
X_val = tfidf.transform(val_descriptions)
(X_train, X_val)

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 2037400 stored elements and shape (48677, 10000)>,
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 224338 stored elements and shape (5409, 10000)>)

In [15]:
from sklearn.preprocessing import LabelEncoder

# Learn the genre -> integer mapping from the training labels, then encode both
# splits with that same mapping.
le = LabelEncoder()
le.fit(y_train)
y_train, y_val = le.transform(y_train), le.transform(y_val)
(y_train, y_val)

(array([21,  8,  7, ...,  5,  7,  7]), array([ 5,  5,  5, ...,  8, 21,  8]))

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn's default settings, trained on the
# TF-IDF matrix and the integer-encoded genre labels.
svc = SVC()
svc.fit(X=X_train, y=y_train)