<a href="https://colab.research.google.com/github/RiyaKhushiRadha/CodSoft-Internship-Projects/blob/main/MOVIE_GENRE_CLASSIFICATION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Ask the user to upload the dataset archive through Colab's file-upload helper;
# whatever files.upload() returns is kept in `uploaded`.
# NOTE(review): the recorded output shows the file being saved as
# "archive (2).zip", while the next cell extracts "archive.zip" — confirm that
# the archive being extracted is the one that was just uploaded.
uploaded = files.upload()  # Upload genre-classification-dataset-imdb.zip

Saving archive.zip to archive (2).zip


In [None]:
!unzip "archive.zip"

Archive:  archive.zip
replace Genre Classification Dataset/description.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Genre Classification Dataset/description.txt  
replace Genre Classification Dataset/test_data.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Genre Classification Dataset/test_data.txt  
replace Genre Classification Dataset/test_data_solution.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Genre Classification Dataset/test_data_solution.txt  
replace Genre Classification Dataset/train_data.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Genre Classification Dataset/train_data.txt  


In [None]:
import os

# Show what the archive extracted into the dataset folder.
dataset_files = os.listdir('Genre Classification Dataset')
print(dataset_files)

['description.txt', 'train_data.txt', 'test_data_solution.txt', 'test_data.txt']


In [None]:
# Load the training file with every line kept whole in a single 'raw' column;
# the ' ::: '-separated fields are pulled apart in a later cell.
# NOTE(review): using a tab separator only yields one column per line if the
# file contains no tab characters — presumably true for this dataset; verify.
train_df = pd.read_csv(
    "Genre Classification Dataset/train_data.txt",
    sep="\t",
    header=None,      # the file has no header row
    names=['raw'],
    encoding='utf-8',
)

# Quick look at the first rows and the resulting column index.
print(train_df.head())
print(train_df.columns)

                                                 raw
0  1 ::: Oscar et la dame rose (2009) ::: drama :...
1  2 ::: Cupid (1997) ::: thriller ::: A brother ...
2  3 ::: Young, Wild and Wonderful (1980) ::: adu...
3  4 ::: The Secret Sin (1915) ::: drama ::: To h...
4  5 ::: The Unrecovered (2007) ::: drama ::: The...
Index(['raw'], dtype='object')


In [None]:
# Split the raw column using ' ::: ' delimiter.
# n=3 caps the split at four fields, so a plot that itself contains ' ::: '
# stays inside the 'plot' column instead of producing extra columns and making
# the four-column assignment fail.
train_df[['id', 'title', 'genre', 'plot']] = train_df['raw'].str.split(' ::: ', n=3, expand=True)

# Drop the original raw column.
# This mutates train_df in place, so the cell can only be run once per load of
# train_df: a second run finds no 'raw' column.
train_df.drop(columns=['raw'], inplace=True)

# Optional: convert id to integer
train_df['id'] = train_df['id'].astype(int)

# Show first few rows
print(train_df.head())

   id                             title     genre  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                                plot  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  


In [None]:
# Map each genre string to an integer class id. The fitted encoder `le` is kept
# so that predicted ids can be turned back into genre names later.
le = LabelEncoder()
le.fit(train_df['genre'])
train_df['genre_encoded'] = le.transform(train_df['genre'])

In [None]:
import re

def clean_plot(text):
    """Normalise a plot summary to lowercase a-z words separated by single spaces.

    After lowercasing, every run of characters other than a-z (punctuation,
    digits, whitespace, accented letters) is replaced by one space. Replacing
    with a space rather than deleting keeps words that were joined only by
    punctuation ("sci-fi", "mother/daughter") as separate tokens instead of
    fusing them into one.

    Args:
        text: Raw plot description. Non-string values (for example None or NaN
            coming from a malformed row) are treated as empty text.

    Returns:
        The cleaned string, without leading or trailing spaces. Empty when the
        input holds no a-z letters or is not a string.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    # Substitute a space (not an empty string) so neighbouring words never merge.
    letters_only = re.sub(r'[^a-z]+', ' ', lowered)
    return letters_only.strip()

# Store the cleaned version of every plot next to the original text.
train_df['clean_plot'] = train_df['plot'].map(clean_plot)

In [None]:
# Targets and TF-IDF features for the classifiers.
# NOTE(review): the vectorizer is fitted on the raw 'plot' column, so the
# 'clean_plot' column built in the previous cell is not used — confirm that
# this is intended.
# NOTE(review): fitting happens on the full frame before the train/test split,
# so the vocabulary and IDF weights also see the rows that end up held out.
y = train_df['genre_encoded']

tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
X = tfidf.fit_transform(train_df['plot'])

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded genre and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the classes present in the training labels.
# NOTE(review): neither `class_weights` nor `model` is used by any later cell
# visible here; the estimator that gets fitted is `lr`, without class weighting.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

model = LogisticRegression(class_weight='balanced')

In [None]:
# Class-weighted logistic regression with a raised iteration limit.
# NOTE(review): `clf` is only constructed here; no visible cell fits or
# evaluates it.
clf = LogisticRegression(class_weight='balanced', max_iter=1000)

In [None]:
# Logistic regression baseline (no class weighting), scored by accuracy on the
# held-out split. `lr` is also the model used by predict_genre().
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(
    "Logistic Regression Accuracy: ",
    accuracy_score(y_true=y_test, y_pred=y_pred_lr),
)

Logistic Regression Accuracy:  0.5807433367149313


In [None]:
# Multinomial naive Bayes baseline on the same features, scored by accuracy on
# the held-out split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print(
    "Naive Bayes Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_nb),
)

Naive Bayes Accuracy: 0.5221802084294015


In [None]:
# Linear SVM baseline, scored by accuracy on the held-out split.
# random_state is pinned to the same seed as the train/test split: LinearSVC's
# solver can shuffle the data randomly, and a fixed seed keeps repeated runs of
# the notebook comparable.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.568016231670202


In [None]:
def predict_genre(plot_text):
    """Return the predicted genre label for a single plot description.

    The text is vectorised with the already-fitted `tfidf`, classified with the
    fitted logistic regression `lr`, and the predicted class id is mapped back
    to its genre name through the label encoder `le`.

    Args:
        plot_text: One plot summary as a string.

    Returns:
        The genre name that `le` associates with the predicted class.
    """
    features = tfidf.transform([plot_text])  # one-row matrix for the single input
    encoded_prediction = lr.predict(features)
    genre_names = le.inverse_transform(encoded_prediction)
    return genre_names[0]

# Example: run the classifier on two hand-written plot summaries.
for sample_plot in (
    "A young boy discovers he has magical powers and goes to a wizarding school.",
    "A team of astronauts is sent to colonize Mars but they discover unexpected lifeforms.",
):
    print(predict_genre(sample_plot))

animation
comedy
