In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Replace the "..." placeholder in the path with the folder that holds the dataset.
df = pd.read_csv(r'...\imdb_movie_dataset.csv')
# Keep only the columns used by the rest of the notebook.
df = df.loc[:, ["Title", "Genre", "Description"]].copy()
# Missing values per column, followed by a preview of the first rows.
print(df.isnull().sum())
df.head()

In [None]:
# A "Genre" cell may list several comma-separated genres. Strip the spaces,
# split on commas and flatten to one entry per (movie, genre) pair.
# The vectorised .str accessors pass missing (NaN) cells through, and dropna()
# discards them; the previous row-by-row loop raised AttributeError on them.
all_genres = (
    df["Genre"]
    .str.replace(" ", "", regex=False)
    .str.split(",")
    .explode()
    .dropna()
    .tolist()
)
# Mapping of genre name -> number of movies tagged with it, most frequent first.
genre_counts = pd.Series(all_genres).value_counts().to_dict()
genre_counts


In [None]:
# Figure 1: number of movies per exact "Genre" label (a label may combine
# several comma-separated genres), most common label first.
label_order = df['Genre'].value_counts().index
axis_font = {'fontsize': 14, 'fontweight': 'bold'}
plt.figure(figsize=(14, 7))
sns.countplot(data=df, y='Genre', order=label_order, palette='viridis')
plt.xlabel('Count', **axis_font)
plt.ylabel('Genre', **axis_font)
plt.tight_layout()
plt.show()

# Figure 2: frequency of each individual genre, taken from genre_counts.
genre_names = list(genre_counts.keys())
genre_totals = list(genre_counts.values())
plt.figure(figsize=(10, 6))
plt.bar(genre_names, genre_totals)
plt.xlabel("Genre")
plt.ylabel("Frequency")
plt.title("Genre Frequencies")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the working DataFrame.
df.shape


In [None]:
# Per-column summary statistics of the working DataFrame.
df.describe()

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of missing values in each column (repeats the check made right after loading).
print(df.isnull().sum())


In [None]:
# Stemmer and English stop-word set.
# NOTE(review): neither object is applied to the text in the cells visible here;
# confirm whether a cleaning step (stop-word removal / stemming) is missing.
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

# Empty placeholder with a 'Description' column. The previous call passed the
# set literal {'Description'}, which does not create a column of that name.
test_data = pd.DataFrame({'Description': []})

# Text fed to the model: the description as a string. Missing descriptions
# become "" instead of the literal token "nan" that str(NaN) would produce.
df['Text_Cleaning'] = df['Description'].fillna('').astype(str)

# Preview only; avoids dumping the whole frame into the cell output.
df.head()

In [None]:
# Model input is the description text; the target is the raw "Genre" label.
x, y = df['Text_Cleaning'], df['Genre']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Character length of each description. The previous code stored len(xtrain),
# one constant for every row, so the histogram collapsed into a single bar.
df['length_Text_cleaning'] = df['Text_Cleaning'].str.len()

# Distribution of description lengths with a kernel-density overlay.
plt.figure(figsize=(8, 7))
sns.histplot(data=df, x='length_Text_cleaning', bins=20, kde=True, color='blue')
plt.xlabel('Length', fontsize=14, fontweight='bold')
plt.ylabel('Frequency', fontsize=14, fontweight='bold')
plt.title('Distribution of Lengths', fontsize=16, fontweight='bold')
plt.show()

In [None]:
# Preview a handful of training descriptions rather than printing the whole
# training set, which floods the cell output and bloats the saved notebook.
print(xtrain.head(10).tolist())

In [None]:
from collections import Counter
def create_features(text_data, all_words=None):
    """Build a bag-of-words count matrix from whitespace-tokenised texts.

    Parameters
    ----------
    text_data : list or object with ``.tolist()`` (e.g. a pandas Series)
        The documents. Entries that are not ``str`` are treated as empty
        documents, so the output always has one row per input entry.
    all_words : iterable of str, optional
        Vocabulary to use as columns. When omitted, the vocabulary is the set
        of tokens found in ``text_data``.

    Returns
    -------
    pandas.DataFrame
        One row per document (index ``0 .. len(text_data) - 1``, in input
        order) and one column per vocabulary word, in sorted order. Each cell
        is the number of times the word occurs in the document; tokens that
        are not in the vocabulary are ignored.
    """
    if not isinstance(text_data, list):
        text_data = text_data.tolist()

    # Tokenise once. Non-string entries yield no tokens but keep their row, so
    # the matrix stays aligned with the labels (they used to be dropped).
    token_lists = [text.split() if isinstance(text, str) else [] for text in text_data]

    if all_words is None:
        vocabulary = {token for tokens in token_lists for token in tokens}
    else:
        vocabulary = set(all_words)
    # Sorted columns make the layout independent of set iteration order, so it
    # is identical across calls and across interpreter runs.
    vocabulary = sorted(vocabulary)

    word_counts = {row: Counter(tokens) for row, tokens in enumerate(token_lists)}
    features_df = pd.DataFrame.from_dict(word_counts, orient='index')
    # Reindex both axes: restores any missing rows/columns and pins the row
    # order to the input order regardless of how from_dict assembled the frame.
    features_df = features_df.reindex(index=range(len(token_lists)), columns=vocabulary)
    return features_df.fillna(0)
# Features/target and an 80/20 split. Inputs and seed match the earlier split
# cell, so the partition is the same.
x = df['Text_Cleaning']
y = df['Genre']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Learn the vocabulary from the training descriptions only. Building it from
# all of `x` (as before) lets words that occur only in the held-out rows become
# model features, which leaks test-set information into training.
X_train = create_features(xtrain.tolist())
train_vocabulary = X_train.columns

# Project the test descriptions onto the training vocabulary. The explicit
# reindex guarantees the same column order as X_train, which scikit-learn
# expects when predicting from DataFrames.
X_test = create_features(xtest.tolist(), all_words=train_vocabulary).reindex(columns=train_vocabulary)

print(X_train)
print(X_test)

In [None]:
# Multinomial naive Bayes: fit on the training counts, score on the held-out rows.
classifier = MultinomialNB()
classifier.fit(X_train, ytrain)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(ytest, y_pred) * 100

# Logistic regression with default settings.
classifier2 = LogisticRegression()
classifier2.fit(X_train, ytrain)
y_pred2 = classifier2.predict(X_test)
accuracy2 = accuracy_score(ytest, y_pred2) * 100

# Support vector classifier with default settings.
classifier3 = SVC()
classifier3.fit(X_train, ytrain)
y_pred3 = classifier3.predict(X_test)
accuracy3 = accuracy_score(ytest, y_pred3) * 100

# Accuracy (in percent) and the per-class report for each model, in fit order.
for heading, score, predictions in (
    ("Validation Accuracy:", accuracy, y_pred),
    ("Validation Accuracy2:", accuracy2, y_pred2),
    ("Validation Accuracy3:", accuracy3, y_pred3),
):
    print(heading, score, "%")
    print(classification_report(ytest, predictions))

In [None]:
# Pairwise correlations between the numeric columns of df, drawn as an
# annotated heatmap.
numeric_df = df.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()

heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'linewidths': 0.5}
title_font = {'fontsize': 16, 'fontweight': 'bold'}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', **title_font)
plt.show()

In [None]:
# Predict a genre label for every held-out description with the naive Bayes model.
X_test_predictions = classifier.predict(X_test)

# Pair each prediction with the description it was made for. The frame used to
# start empty, which left the 'Description' column filled with NaN. Building it
# in one step also raises if the two lengths ever disagree instead of silently
# misaligning rows.
test_data = pd.DataFrame({
    'Description': xtest.to_numpy(),
    'Predicted_Genre': X_test_predictions,
})

In [None]:
# Write the prediction table to predicted_genres.csv (relative to the working
# directory, without the index column), then print it.
test_data.to_csv('predicted_genres.csv', index=False)
print(test_data)