# Task: MOVIE GENRE CLASSIFICATION
    Create a machine learning model that can predict the genre of a
    movie based on its plot summary or other textual information. You
    can use techniques like TF-IDF or word embeddings with classifiers
    such as Naive Bayes, Logistic Regression, or Support Vector
    Machines

## Methodologies

###    1. Data Collection
###    2. Data Cleaning and Preprocessing
###    3. Data Visualization
###    4. Feature Engineering
###    5. Model Selection
###    6. Model Training and Evaluation

## Data Collection: Data was collected from https://www.kaggle.com/code/dhruvtibarewal/movie-genre-classification

## Data Cleaning and Preprocessing

In [3]:
# libraries
import pandas as pd

In [4]:
train_data = pd.read_csv("C:/Users/susha/Downloads/archive (7)/Genre Classification Dataset/train_data.txt", delimiter=':::', names = ['Sno', 'Name', 'Genre', 'Description'] ,engine='python')
test_data = pd.read_csv("C:/Users/susha/Downloads/archive (7)/Genre Classification Dataset/test_data.txt", delimiter = ':::', names = ['Sno', 'Name', 'Description'], engine='python')
test_data_solution = pd.read_csv("C:/Users/susha/Downloads/archive (7)/Genre Classification Dataset/test_data_solution.txt", delimiter=':::', names = ['Sno', 'Name', 'Genre', 'Description'] ,engine='python')

In [5]:
train_data.head()
test_data.head()
test_data_solution.tail()

Unnamed: 0,Sno,Name,Genre,Description
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."
54199,54200,Curitiba Zero Grau (2010),drama,"Curitiba is a city in movement, with rhythms ..."


#### Looking for null values

In [6]:
#looking for null values

train_data.info()
print('\n')
test_data.info()
print('\n')
test_data_solution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Sno          54214 non-null  int64 
 1   Name         54214 non-null  object
 2   Genre        54214 non-null  object
 3   Description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Sno          54200 non-null  int64 
 1   Name         54200 non-null  object
 2   Description  54200 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Sno          54200 non-null  int64

In [7]:
train_data.isna().sum()

test_data.isna().sum()

test_data_solution.isna().sum()

Sno            0
Name           0
Genre          0
Description    0
dtype: int64

#### Looking for duplicates

In [8]:
train_data.duplicated().sum()
test_data.duplicated().sum()
test_data_solution.duplicated().sum()

0

#### Data Cleaning is done.

## Futher preprocessing
#### The dataset here is almost 50% training  and 50% testing. 
#### This is far from the optimal ratio that will yield in a better working model. 
#### So we will be splitting the dataset in the format 70-15-15 for training, validation and testing respectively.

In [9]:
#code below adds the first 37700 data from test_data_solution into the train datasets and removes those respective data from itself and test_data

last_sno = train_data['Sno'].max()
print(last_sno)

rows_to_append = test_data_solution.head(37700).copy()  # Make a copy to avoid modifying the original DataFrame
rows_to_append.loc[:, 'Sno'] += last_sno + 1  # Use .loc to modify the DataFrame safely

print(rows_to_append)


train_data = train_data.append(rows_to_append)

54214
         Sno                                    Name          Genre  \
0      54216                   Edgar's Lunch (1998)       thriller    
1      54217               La guerra de papá (1977)         comedy    
2      54218            Off the Beaten Track (2010)    documentary    
3      54219                 Meu Amigo Hindu (2015)          drama    
4      54220                      Er nu zhai (1955)          drama    
...      ...                                     ...            ...   
37695  91911                    Fully Loaded (2011)         comedy    
37696  91912                    Tenebrae Lux (2014)         sci-fi    
37697  91913                   Mexican Dance (1898)          short    
37698  91914   Das Lied von den zwei Pferden (2009)    documentary    
37699  91915                  Doin' It Again (2012)    documentary    

                                             Description  
0       L.R. Brane loves his life - his car, his apar...  
1       Spain, March 19

  train_data = train_data.append(rows_to_append)


In [10]:
test_data = test_data.drop(rows_to_append.index)
test_data_solution = test_data_solution.drop(rows_to_append.index)

In [11]:
train_data

Unnamed: 0,Sno,Name,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
37695,91911,Fully Loaded (2011),comedy,"On a rare evening out, two feisty single moms..."
37696,91912,Tenebrae Lux (2014),sci-fi,A lone traveler with the ability to cross bet...
37697,91913,Mexican Dance (1898),short,"""Another well-known dancer with a national re..."
37698,91914,Das Lied von den zwei Pferden (2009),documentary,"A promise, an old, destroyed horse head violi..."


In [12]:
test_data

Unnamed: 0,Sno,Name,Description
37700,37701,My Lips Betray (1933),"In a make-believe, mittleuropean kingdom, a v..."
37701,37702,The Koreas (2016),"At the end of World War II, Korea was divided..."
37702,37703,Come Together (2016),Colombia is coming out of a period in their h...
37703,37704,With Honors Denied (2003),Japanese bombs hit Pearl Harbor on a Sunday. ...
37704,37705,"""Connect with English"" (2007)",Connect with English is a series that brings ...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [13]:
test_data_solution

Unnamed: 0,Sno,Name,Genre,Description
37700,37701,My Lips Betray (1933),musical,"In a make-believe, mittleuropean kingdom, a v..."
37701,37702,The Koreas (2016),documentary,"At the end of World War II, Korea was divided..."
37702,37703,Come Together (2016),documentary,Colombia is coming out of a period in their h...
37703,37704,With Honors Denied (2003),short,Japanese bombs hit Pearl Harbor on a Sunday. ...
37704,37705,"""Connect with English"" (2007)",drama,Connect with English is a series that brings ...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [14]:
train_data.

SyntaxError: invalid syntax (1541890828.py, line 1)

In [15]:
train_data.iloc[54201]

Sno                                                        54202
Name                                        Singing Guns (1950) 
Genre                                                   western 
Description     Rhiannon, an outlaw who regularly robs gold f...
Name: 54201, dtype: object

In [16]:
test_data_solution.reset_index(drop=True, inplace=True)
test_data_solution.index += 1

In [17]:
test_data_solution.columns

Index(['Sno', 'Name', 'Genre', 'Description'], dtype='object')

In [18]:
test_data.reset_index(drop=True, inplace=True)
test_data.index += 1

In [19]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [20]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\susha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\susha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
def preprocess_text(text):
    
    # Remove special characters, punctuation, and symbols
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    # Join tokens back into a string
    preprocessed_text = ' '.join(stemmed_tokens)
    
    return preprocessed_text


# Apply preprocessing to 'Description' column
train_data['Description'] = train_data['Description'].apply(preprocess_text)
test_data['Description'] = test_data['Description'].apply(preprocess_text)
test_data_solution['Description'] = test_data_solution['Description'].apply(preprocess_text)


In [22]:
X_train = train_data['Description']
y_train = train_data['Genre']
X_test = test_data['Description']
y_test_solution = test_data_solution['Genre']

In [23]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_solution_encoded = label_encoder.transform(y_test_solution)

### Using TF-IDF vectorization

In [24]:
vectorizer = TfidfVectorizer(max_features=100)  
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [25]:
model = MLPClassifier()

# Train the model
model.fit(X_train_vectorized, y_train_encoded)

# Predict on test data
y_pred = model.predict(X_test_vectorized)

# Calculate accuracy
accuracy = accuracy_score(y_test_solution_encoded, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.42587878787878786




In [97]:
max_tokens = train_data['Description'].apply(lambda x: len(word_tokenize(x))).max()
max_tokens


1479

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
print("hello world")

In [102]:
vectorizer = TfidfVectorizer(max_features=1000) 

# Fit and transform the training and test text data
X_train_vectorized = vectorizer.fit_transform(train_data['Description'])
X_test_vectorized = vectorizer.transform(test_data['Description'])

# Convert sparse matrices to dense arrays
X_train_dense = X_train_vectorized.toarray()
X_test_dense = X_test_vectorized.toarray()

# Define maximum sequence length based on the maximum number of tokens
max_length = 800  

# Pad sequences to ensure uniform length
X_train_padded = pad_sequences(X_train_dense, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_dense, maxlen=max_length, padding='post')

In [106]:
model = Sequential([
    Embedding(input_dim=len(vectorizer.vocabulary_), output_dim=100, input_length=max_length),
    Bidirectional(LSTM(64)),
    Dense(len(np.unique(y_train_encoded)), activation='softmax')
])



In [107]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.1)


Epoch 1/10

### Using Word2Vec vectorization

In [30]:
from gensim.models import Word2Vec
import numpy as np

In [31]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=train_data['Description'], vector_size=100, window=5, min_count=1, workers=4)

# Function to calculate document vectors
def calculate_doc_vector(tokens):
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)


In [32]:
# Calculate document vectors for train and test data
X_train_vectors = np.array([calculate_doc_vector(tokens) for tokens in train_data['Description']])
X_test_vectors = np.array([calculate_doc_vector(tokens) for tokens in test_data['Description']])

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_data['Genre'])
y_test_solution_encoded = label_encoder.transform(test_data_solution['Genre'])


In [33]:
model = MLPClassifier()

# Train the model
model.fit(X_train_vectors, y_train_encoded)

# Predict on test data
y_pred = model.predict(X_test_vectors)



In [34]:
accuracy = accuracy_score(y_test_solution_encoded, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.3626666666666667
