<a href="https://colab.research.google.com/github/SoniReddyMaram/CODSOFT/blob/main/Movie_Genre_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# All three files use ':::' as the field separator; column names are supplied
# explicitly through `names=`. A multi-character separator is only supported by
# the python parser engine, hence engine='python'.
labelled_columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
unlabelled_columns = ['ID', 'TITLE', 'DESCRIPTION']

# Training set (with genres).
df_train = pd.read_csv(
    "train_data.txt",
    sep=':::',
    names=labelled_columns,
    engine='python',
)
# Test set (genres withheld).
df_test = pd.read_csv(
    "test_data.txt",
    sep=':::',
    names=unlabelled_columns,
    engine='python',
)
# Test set with its ground-truth genres, used for final scoring.
df_test_solution = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    names=labelled_columns,
    engine='python',
)

In [3]:
# Sanity-check the training frame: row count, columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54214 non-null  int64 
 1   TITLE        54214 non-null  object
 2   GENRE        54214 non-null  object
 3   DESCRIPTION  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [4]:
# Sanity-check the unlabelled test frame (no GENRE column is loaded for it).
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54200 non-null  int64 
 1   TITLE        54200 non-null  object
 2   DESCRIPTION  54200 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [5]:
# Sanity-check the labelled test frame that provides the ground-truth genres.
df_test_solution.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54200 non-null  int64 
 1   TITLE        54200 non-null  object
 2   GENRE        54200 non-null  object
 3   DESCRIPTION  54200 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [6]:
# --- Label encoding, text features and train/validation split -----------------
# NOTE: this cell reassigns df_train / df_test and overwrites the GENRE columns
# with integer codes, so re-run the data-loading cell before re-running it.

# ID is not used as a feature.
df_train = df_train.drop(columns=['ID'])
df_test = df_test.drop(columns=['ID'])

# Learn the genre -> integer mapping from the training labels only.
le = LabelEncoder()
df_train['GENRE'] = le.fit_transform(df_train['GENRE'])

# Reuse the mapping learned on the training labels. Calling fit_transform here
# would refit the encoder on the test labels, so the integer codes would only
# match the training codes by coincidence and `le` would no longer describe the
# classes the models were trained on. transform() raises ValueError if the test
# labels contain a genre that was never seen during training.
df_test_solution['GENRE'] = le.transform(df_test_solution['GENRE'])

# Title and description are joined into a single text field per movie.
df_train['combined_text'] = df_train['TITLE'] + ' ' + df_train['DESCRIPTION']
df_test['combined_text'] = df_test['TITLE'] + ' ' + df_test['DESCRIPTION']

y_train = df_train['GENRE']
y_test = df_test_solution['GENRE']

# TF-IDF vocabulary and IDF weights are learned from the training text only and
# then applied unchanged to the test text.
# NOTE: the vectorizer is fitted before the validation split below, so the
# validation rows contribute to the IDF statistics.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(df_train['combined_text'])

X_train = tfidf_vectorizer.transform(df_train['combined_text'])
X_test = tfidf_vectorizer.transform(df_test['combined_text'])

# Hold out 10% of the training rows for validation. A fixed random_state makes
# the split, and therefore every reported metric, reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [9]:
# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
log_model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(x_train, y_train)

# Per-class metrics on the rows the model was trained on.
y_train_pred = log_model.predict(x_train)
train_report_log = classification_report(y_train, y_train_pred, zero_division=0)
print(train_report_log)

              precision    recall  f1-score   support

           0       0.82      0.44      0.57      1188
           1       0.94      0.40      0.57       535
           2       0.90      0.26      0.41       703
           3       0.95      0.12      0.21       446
           4       0.00      0.00      0.00       249
           5       0.72      0.80      0.76      6664
           6       0.79      0.05      0.10       446
           7       0.74      0.95      0.83     11819
           8       0.65      0.92      0.77     12286
           9       0.84      0.18      0.30       719
          10       0.85      0.04      0.07       287
          11       0.97      0.54      0.70       180
          12       0.00      0.00      0.00       220
          13       0.83      0.77      0.80      1977
          14       0.88      0.55      0.67       669
          15       0.90      0.04      0.07       249
          16       1.00      0.03      0.06       283
          17       0.88    

In [10]:
# Logistic regression: per-class metrics on the held-out validation split.
y_val_pred = log_model.predict(x_val)
val_report_log = classification_report(y_val, y_val_pred, zero_division=0)
print(val_report_log)

              precision    recall  f1-score   support

           0       0.56      0.21      0.31       127
           1       0.44      0.15      0.22        55
           2       0.38      0.04      0.07        72
           3       1.00      0.06      0.11        52
           4       0.00      0.00      0.00        16
           5       0.55      0.57      0.56       783
           6       1.00      0.02      0.03        59
           7       0.63      0.89      0.74      1277
           8       0.53      0.80      0.64      1327
           9       0.57      0.06      0.11        65
          10       0.00      0.00      0.00        36
          11       0.80      0.29      0.42        14
          12       0.00      0.00      0.00        23
          13       0.74      0.55      0.63       227
          14       0.64      0.45      0.53        62
          15       0.00      0.00      0.00        28
          16       0.00      0.00      0.00        36
          17       0.00    

In [12]:
# Logistic regression: per-class metrics on the test set, scored against the
# genres from the test solution file.
y_test_pred = log_model.predict(X_test)
test_report_log = classification_report(y_test, y_test_pred, zero_division=0)
print(test_report_log)

              precision    recall  f1-score   support

           0       0.54      0.25      0.34      1314
           1       0.62      0.17      0.27       590
           2       0.69      0.14      0.24       775
           3       0.67      0.01      0.02       498
           4       0.00      0.00      0.00       264
           5       0.55      0.58      0.56      7446
           6       0.50      0.02      0.03       505
           7       0.65      0.88      0.75     13096
           8       0.53      0.81      0.64     13612
           9       0.56      0.07      0.13       783
          10       1.00      0.01      0.02       322
          11       0.91      0.47      0.62       193
          12       0.00      0.00      0.00       243
          13       0.67      0.56      0.61      2204
          14       0.70      0.40      0.51       731
          15       0.50      0.00      0.01       276
          16       1.00      0.00      0.01       318
          17       0.73    

In [33]:
#SupportVector
svc_model=LinearSVC(dual=False)
svc_model.fit(x_train,y_train)

y_train_pred1=svc_model.predict(x_train)
print(classification_report(y_train,y_train_pred1, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1188
           1       1.00      1.00      1.00       535
           2       1.00      0.99      0.99       703
           3       1.00      0.99      0.99       446
           4       1.00      0.94      0.97       249
           5       0.99      0.98      0.99      6664
           6       1.00      0.98      0.99       446
           7       0.98      0.99      0.99     11819
           8       0.97      0.99      0.98     12286
           9       0.99      0.97      0.98       719
          10       1.00      0.99      0.99       287
          11       1.00      1.00      1.00       180
          12       1.00      0.93      0.96       220
          13       0.99      1.00      0.99      1977
          14       1.00      0.99      1.00       669
          15       1.00      0.98      0.99       249
          16       1.00      0.98      0.99       283
          17       1.00    

In [27]:
# Linear SVC: per-class metrics on the held-out validation split.
y_val_pred1 = svc_model.predict(x_val)
val_report_svc = classification_report(y_val, y_val_pred1, zero_division=0)
print(val_report_svc)

              precision    recall  f1-score   support

           0       0.40      0.28      0.33       127
           1       0.56      0.36      0.44        55
           2       0.48      0.18      0.26        72
           3       0.54      0.13      0.22        52
           4       0.00      0.00      0.00        16
           5       0.56      0.58      0.57       783
           6       0.50      0.07      0.12        59
           7       0.68      0.85      0.76      1277
           8       0.56      0.73      0.64      1327
           9       0.31      0.17      0.22        65
          10       0.29      0.06      0.09        36
          11       0.64      0.50      0.56        14
          12       0.00      0.00      0.00        23
          13       0.71      0.66      0.68       227
          14       0.56      0.65      0.60        62
          15       0.40      0.07      0.12        28
          16       0.40      0.06      0.10        36
          17       0.33    

In [28]:
# Linear SVC: per-class metrics on the test set, scored against the genres from
# the test solution file.
y_test_pred1 = svc_model.predict(X_test)
# zero_division=0 matches every other report in this notebook: classes with no
# predicted samples are reported as 0.0 without an UndefinedMetricWarning.
print(classification_report(y_test, y_test_pred1, zero_division=0))

              precision    recall  f1-score   support

           0       0.44      0.35      0.39      1314
           1       0.67      0.43      0.52       590
           2       0.52      0.24      0.33       775
           3       0.44      0.12      0.19       498
           4       0.00      0.00      0.00       264
           5       0.56      0.58      0.57      7446
           6       0.35      0.07      0.12       505
           7       0.69      0.84      0.75     13096
           8       0.57      0.72      0.64     13612
           9       0.42      0.16      0.24       783
          10       0.31      0.07      0.12       322
          11       0.77      0.62      0.69       193
          12       0.21      0.01      0.02       243
          13       0.62      0.64      0.63      2204
          14       0.62      0.56      0.59       731
          15       0.36      0.06      0.10       276
          16       0.38      0.06      0.10       318
          17       0.61    

In [37]:

def predict_genre(title, description, model, vectorizer, label_encoder):
    """Predict the genre label for a single movie.

    The title and description are joined with a single space, passed through
    ``vectorizer.transform``, classified with ``model.predict``, and the
    predicted code is mapped back to a label with
    ``label_encoder.inverse_transform``.

    Args:
        title: Movie title.
        description: Movie plot description.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted text vectorizer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The first (and only) decoded label for the given movie.
    """
    # Build a one-row frame so the text is combined exactly as a column-wise
    # 'TITLE' + ' ' + 'DESCRIPTION' concatenation.
    sample = pd.DataFrame({'TITLE': [title], 'DESCRIPTION': [description]}).assign(
        combined_text=lambda frame: frame['TITLE'] + ' ' + frame['DESCRIPTION']
    )

    features = vectorizer.transform(sample['combined_text'])
    encoded_prediction = model.predict(features)

    return label_encoder.inverse_transform(encoded_prediction)[0]