In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset path used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Dataset

In [None]:
# Folder on the mounted Google Drive that holds the ':::'-separated dataset files.
path = '/content/drive/MyDrive/Genre Classification Dataset'

# Training rows carry four fields (id, title, genre, description). Only three names are given,
# so pandas uses the leading id field as the row index (see the 1..5 index in train_data.head()).
train_data = pd.read_csv(path + '/train_data.txt', sep=':::', names=['Title', 'Genre', 'Description'], engine='python')

# Test rows carry only three fields (id, title, description) -- there is no genre to read.
# Reusing the training names labelled the id as 'Title' and the title as 'Genre', so the columns
# are named for what they actually contain. 'Description' keeps its name, so downstream cells
# that read test_data['Description'] are unaffected.
test_data = pd.read_csv(path + '/test_data.txt', sep=':::', names=['Id', 'Title', 'Description'], engine='python')

In [None]:
# Summary of the text columns: row count, number of distinct values, most frequent value and its frequency.
train_data.describe()


Unnamed: 0,Title,Genre,Description
count,54214,54214,54214
unique,54214,27,54086
top,Oscar et la dame rose (2009),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [None]:
# Count missing values per column in train_data; all zeros means no rows need dropping or imputing.
train_data.isnull().sum()

Unnamed: 0,0
Title,0
Genre,0
Description,0


# EDA & Visualization

In [None]:
import plotly.express as px

# Number of training samples per genre (value_counts orders them from most to least frequent).
counts = train_data['Genre'].value_counts()

# Centered, top-anchored chart title with its own font settings.
title_style = dict(
    text='Distribution of Genres',
    x=0.5,
    xanchor='center',
    yanchor='top',
    font=dict(family='Georgia Black, sans-serif', size=20, color='black'),
)

# One bar per genre; bars are shaded by their count on the Viridis scale.
fig = px.bar(
    x=counts.index,
    y=counts.values,
    color=counts.values,
    color_continuous_scale='Viridis',
    labels={'x': 'Genre', 'y': 'Count'},
)

# Axis titles, vertical tick labels, base font, background, and no color bar
# (the color only mirrors bar height, so the legend would be redundant).
fig.update_layout(
    title=title_style,
    xaxis_title='Genre',
    yaxis_title='Count',
    xaxis=dict(tickangle=90),
    font=dict(size=14),
    plot_bgcolor='lightgrey',
    coloraxis_showscale=False,
)

fig.show()


# Data Preprocessing and Text Cleaning

In [None]:
# Fetch the NLTK resources this notebook downloads: the stopword corpus and the 'punkt_tab' data.
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)

# English stopwords as a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# Lancaster stemmer instance (not referenced by the cells visible here).
stemmer = LancasterStemmer()
# Define the clean_text function
def clean_text(text):
    """Normalize a raw description into a space-separated string of lowercase word tokens.

    Steps, in order: lowercase; drop @handles, http links and pic.<...> links; replace every
    character that is not a letter, '+' or apostrophe with a space; drop isolated single
    letters; strip remaining punctuation; tokenize; remove English stopwords and tokens of
    two characters or fewer.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. NaN from a missing field) yields ''.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; '' when nothing survives.
    """
    # Missing descriptions arrive as float NaN from pandas; treat them as empty text instead
    # of failing on .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Lowercase all characters
    text = re.sub(r'@\S+', '', text)  # Remove Twitter handles
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # The dot must be escaped: 'pic.' with a wildcard dot also matched ordinary words such as
    # "picture" or "picnic" and deleted them entirely.
    text = re.sub(r'pic\.\S+', '', text)  # Remove pic.<...> links
    text = re.sub(r"[^a-zA-Z+']", ' ', text)  # Keep only letters, '+' and apostrophes
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')  # Drop isolated single letters
    # Removing punctuation here glues contractions together ("hasn't" -> "hasnt").
    text = "".join(ch for ch in text if ch not in string.punctuation)
    words = nltk.word_tokenize(text)
    # Use the module-level stop_words set: same words as stopwords.words('english'), but the
    # corpus is not re-read on every call and membership checks are O(1).
    text = " ".join(word for word in words if word not in stop_words and len(word) > 2)
    text = re.sub(r"\s[\s]+", " ", text).strip()  # Remove repeated/leading/trailing spaces
    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Add a 'Text_cleaning' column to both splits by running clean_text over each 'Description'.
for frame in (train_data, test_data):
    frame['Text_cleaning'] = frame['Description'].apply(clean_text)

In [None]:
# Preview the first rows to compare the raw 'Description' with the new 'Text_cleaning' column.
train_data.head()

Unnamed: 0,Title,Genre,Description,Text_cleaning
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening conversation doctor parents year old...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,bus empties students field trip museum natural...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,films title refers recovered bodies ground zer...


# Text Vectorization Using TF-IDF

In [None]:
# Cleaned text for each split.
train_corpus = train_data['Text_cleaning']
test_corpus = test_data['Text_cleaning']

# TF-IDF vectorizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training text only, then encode it.
X_train = tfidf_vectorizer.fit_transform(train_corpus)

# Encode the test text with the already-fitted vectorizer (no refitting on test data).
X_test = tfidf_vectorizer.transform(test_corpus)

# Splitting Data in Train @ Validation


In [None]:
# Rebuild the full feature matrix from the fitted vectorizer instead of aliasing X_train.
# This cell rebinds X_train to the 70% split, so with `X = X_train` a second run would pair the
# already-shrunken matrix with the full label column and fail on mismatched lengths.
X = tfidf_vectorizer.transform(train_data['Text_cleaning'])
y = train_data['Genre']

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Builing

In [None]:
# Initialize and train a Multinomial Naive Bayes classifier on the training split.
# fit() is left as the cell's last expression so the notebook displays the fitted estimator.
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)

In [None]:
# Random forest on the same training split. random_state pins the bootstrap samples and
# per-split feature subsets so the reported scores are reproducible after a kernel restart;
# n_jobs=-1 only parallelizes tree building across cores and does not affect the result.
clf2 = RandomForestClassifier(n_estimators=150, max_depth=50, random_state=42, n_jobs=-1)
clf2.fit(X_train, y_train)

In [None]:
# Naive Bayes: predict the held-out validation rows and measure accuracy on them.
y_pred = clf1.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

# Accuracy on the rows the model was fitted on, for comparison with the validation figure.
train_score = clf1.score(X_train, y_train)

print("Validation Accuracy:", accuracy)
print("model score  ", train_score)

Validation Accuracy: 0.44340608668920994
model score   0.46799652164747424


In [None]:
# Random forest: predict the held-out validation rows and measure accuracy on them.
y_pred = clf2.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

# Accuracy on the rows the model was fitted on, for comparison with the validation figure.
train_score = clf2.score(X_train, y_train)

print("Validation Accuracy:", accuracy)
print("model score  ", train_score)

Validation Accuracy: 0.45059944666461726
model score   0.5912672270679069


# Make Predictions on the Test Data

In [None]:
# No visible cell defines `clf` (only clf1 and clf2 are trained), so the original line raised
# NameError on a fresh kernel. Bind `clf` to whichever trained model scores higher on the
# validation split and use it for the final predictions.
clf = max((clf1, clf2), key=lambda model: model.score(X_val, y_val))

# Use the selected model to make predictions on the test data
X_test_predictions = clf.predict(X_test)
test_data['Predicted_Genre'] = X_test_predictions

# Save the test_data DataFrame with predicted genres to a CSV file (row index omitted)
test_data.to_csv('predicted_genres.csv', index=False)


In [None]:
# Preview the test rows together with the newly added 'Predicted_Genre' column.
test_data.head()

Unnamed: 0,Title,Genre,Description,Text_cleaning,Predicted_Genre
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar...",brane loves life car apartment job especially ...,drama
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch...",spain march quico naughty child three belongin...,drama
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...,one year life albin family shepherds north tra...,documentary
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi...",father died hasnt spoken brother years serious...,drama
4,5,Er nu zhai (1955),Before he was known internationally as a mart...,known internationally martial arts superstar b...,drama
