<a href="https://colab.research.google.com/github/SanthanaNarayanan-git/Codsoft_intern_projects/blob/main/Movie_genre_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from tqdm import tqdm


In [2]:
try:
    with tqdm(total = 50, desc = "Loading Train Data") as pbar:
        train_data = pd.read_csv('train_data.txt',sep=":::",header=None, names = ['SerialNumber','movieName','genre','plot'],engine = 'python')
        pbar.update(50)
except Exception as e:
    print(f"Failed Loading train file: {e}")
    raise
print(train_data['genre'].head())

Loading Train Data: 100%|██████████| 50/50 [00:00<00:00, 83.71it/s]

0        drama 
1     thriller 
2        adult 
3        drama 
4        drama 
Name: genre, dtype: object





In [3]:
genre_list = train_data['genre'].unique()
fallback_genre = 'Unknown'
genre_list


array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [4]:
X_train = train_data['plot'].astype(str).apply(lambda doc:doc.lower())
genre_labels = [genre.split(', ') for genre in train_data["genre"]]
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(genre_labels)


In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)


In [6]:
with tqdm(total = 50, desc = "vecorizing training data") as pbar:
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    pbar.update(50)

with tqdm(total = 50, desc = "Training Model") as pbar:
    naive_bayes = MultinomialNB()
    multi_output_classifier = MultiOutputClassifier(naive_bayes)
    multi_output_classifier.fit(X_train_tfidf,y_train)
    pbar.update(50)

vecorizing training data: 100%|██████████| 50/50 [00:05<00:00,  8.70it/s]
Training Model: 100%|██████████| 50/50 [00:01<00:00, 44.55it/s]


In [7]:
try:
    with tqdm(total = 50, desc = "Loading Test Data") as pbar:
        test_data = pd.read_csv('test_data.txt',sep=":::",header=None, names = ['SerialNumber','movieName','plot'],engine = 'python')
        pbar.update(50)
except Exception as e:
    print(f"Failed Loading test file: {e}")
    raise

Loading Test Data: 100%|██████████| 50/50 [00:00<00:00, 121.67it/s]


In [8]:
X_test = test_data['plot'].astype(str).apply(lambda doc:doc.lower())
X_test

Unnamed: 0,plot
0,"l.r. brane loves his life - his car, his apar..."
1,"spain, march 1964: quico is a very naughty ch..."
2,one year in the life of albin and his family ...
3,"his father has died, he hasn't spoken with hi..."
4,before he was known internationally as a mart...
...,...
54195,"covering multiple genres, tales of light & da..."
54196,as alice and cora munro attempt to find their...
54197,a movie 169 years in the making. oliver twist...
54198,"popular, but mysterious rock d.j mike mallard..."


In [9]:
with tqdm(total = 50, desc = "vecorizing test data") as pbar:
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    pbar.update(50)

with tqdm(total = 50, desc = "Test Model") as pbar:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    pbar.update(50)

vecorizing test data: 100%|██████████| 50/50 [00:05<00:00,  9.65it/s]
Test Model: 100%|██████████| 50/50 [00:00<00:00, 92.46it/s]


In [11]:
test_movie_names = test_data['movieName']
PREDICTED_GENRES = mlb.inverse_transform(y_pred)
test_results = pd.DataFrame({'movieName':test_movie_names,"PREDICTED_GENRES":PREDICTED_GENRES})
test_results

Unnamed: 0,movieName,PREDICTED_GENRES
0,Edgar's Lunch (1998),()
1,La guerra de papá (1977),"( drama ,)"
2,Off the Beaten Track (2010),"( documentary ,)"
3,Meu Amigo Hindu (2015),"( drama ,)"
4,Er nu zhai (1955),()
...,...,...
54195,"""Tales of Light & Dark"" (2013)",()
54196,Der letzte Mohikaner (1965),()
54197,Oliver Twink (2007),()
54198,Slipstream (1973),()


In [12]:
with open("model_evalutiion.txt","w",encoding="utf-8") as output_file:
    for _, row in test_results.iterrows():
        movie_name = row["movieName"]
        genre_str = ', '.join(row["PREDICTED_GENRES"])
        output_file.write(f"{movie_name}:::{genre_str}\n")

In [14]:
y_train_pred = multi_output_classifier.predict(X_train_tfidf)
y_train_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
accuracy = accuracy_score(y_train,y_train_pred)
precision = precision_score(y_train, y_train_pred, average = 'micro')
recall = recall_score(y_train, y_train_pred, average='micro')
f1 = f1_score(y_train,y_train_pred,average = 'micro')
f1

0.4041386088642733

In [16]:
accuracy

0.2773453351532814

In [17]:
precision

0.7218294823741519

In [18]:
recall

0.28062861991367544

In [19]:
f1

0.4041386088642733