## Data Preprocessing


In [4]:
import pandas as pd

# Load the training set from a plain-text file.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
file_path = "D:\\CodSoft\\MovieGenre\\train_data.txt"

# Fields are separated by ':::'. pandas treats a multi-character separator as a regular
# expression, which only the Python parsing engine supports; naming the engine explicitly
# avoids the ParserWarning raised when pandas silently falls back from the C engine.
# The recorded output shows an index running 1..N, so the file presumably carries a leading
# ID field that pandas uses as the index because only three names are supplied — TODO confirm.
df = pd.read_csv(
    file_path,
    sep=':::',
    header=None,
    names=['Title', 'Genre', 'Plot'],
    engine='python',
)

# Data Cleaning
# Boolean mask with True wherever a cell is missing
mm = df.isnull()
print(mm)
# Per-column count of missing cells (reuses the mask instead of recomputing it)
print("Missing values:\n", mm.sum())

# Handle missing values
# Example: Replace missing values in 'Plot' column with an empty string
#df['Plot'].fillna('', inplace=True)

# # Feature Engineering
# # Tokenize and vectorize the plot summaries
# # Example: Use CountVectorizer to tokenize and convert plot summaries into numerical representations
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
# X_plot = vectorizer.fit_transform(df['Plot'])

# # Convert genre labels into numerical representations using label encoding
# # Label encoder is used to transform categorical labels into numerical labels
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y_genre = label_encoder.fit_transform(df['Genre'])





  df = pd.read_csv(file_path, sep=':::', header=None, names=[ 'Title', 'Genre', 'Plot'])


       Title  Genre   Plot
1      False  False  False
2      False  False  False
3      False  False  False
4      False  False  False
5      False  False  False
...      ...    ...    ...
54210  False  False  False
54211  False  False  False
54212  False  False  False
54213  False  False  False
54214  False  False  False

[54214 rows x 3 columns]
Missing values:
 Title    0
Genre    0
Plot     0
dtype: int64


## Feature Extraction


In [2]:
"""""""
# Feature Engineering
# Tokenize and vectorize the plot summaries
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Tokenize and convert plot summaries into numerical representations
X_plot = vectorizer.fit_transform(df['Plot'])

# Now, X_plot contains the vectorized plot summaries.
# Each row represents a movie, and each column represents a unique word from the plot summaries.
# The cell value represents the frequency of the word in the corresponding movie's plot summary.

# Convert genre labels into numerical representations using label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_genre = label_encoder.fit_transform(df['Genre'])

""""""

## Splitting Data

In [3]:
# Split Data into Training and Testing Sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_plot,
    y_genre,
    test_size=0.2,
    random_state=42,
)

# X_train / X_test hold the vectorized plot summaries; y_train / y_test hold the
# matching genre labels. The data is now ready for model training.


## Importing Class


In [4]:
from sklearn.ensemble import VotingClassifier


## Model Training

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Single-step pipeline holding a soft-voting ensemble of Multinomial Naive Bayes and a
# Decision Tree. There is no preprocessing step: the inputs are already vectorized.
pipeline = Pipeline([
    ('classifier', VotingClassifier(estimators=[
        ('naive_bayes', MultinomialNB()),
        # Seeded so repeated runs build the same tree; an unseeded tree permutes the
        # candidate features randomly at each split, making scores vary between runs.
        ('decision_tree', DecisionTreeClassifier(random_state=42))
    ], voting='soft'))  # Soft voting averages the predicted class probabilities
])

# Hyperparameter grid (deliberately small to keep the search fast)
parameters = {
    'classifier__naive_bayes__alpha': [0.1, 0.5],  # Smoothing parameter for Multinomial Naive Bayes
    'classifier__decision_tree__max_depth': [None, 10],  # Max depth for Decision Tree
    'classifier__weights': [(1, 1)]  # Voting weights for the classifiers
}

# Grid search with 3-fold cross-validation, using all available cores
grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)  # After fitting, grid_search is the trained model used for prediction

# Display the best parameters and their corresponding accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)




Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters: {'classifier__decision_tree__max_depth': 10, 'classifier__naive_bayes__alpha': 0.1, 'classifier__weights': (1, 1)}
Best Cross-Validation Accuracy: 0.5548869059970948


## Model Evaluation

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Requires the fitted grid_search object and the held-out X_test / y_test split.

# Predict genres for the test set with the fitted search object
y_pred = grid_search.predict(X_test)

# Precision, recall and F1 are averaged over classes, weighted by class support
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report the scalar metrics first, then the confusion matrix
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.5603615235635894
Precision: 0.534200042093326
Recall: 0.5603615235635894
F1 Score: 0.5414306823107056
Confusion Matrix:
 [[ 100    1    6    1    1   13    3   20   55    0    0    0    0   10
     0    0    0    0    1    3    9    5    9    0   25    0    1]
 [   1   43   11    0    0   24    0    3   17    0    0    0    0    4
     1    0    0    0    1    0    0    5    0    0    1    0    1]
 [   8    9   26    1    0    9    1   21   16    1    0    0    0   12
     0    0    0    0    2    0    7    8    1    0    7    1    9]
 [   5    0    5   13    0   12    0   13   11   13    2    0    0    4
     2    0    0    0    0    1    9   10    0    1    3    0    0]
 [   0    0    0    0    0    2    0   35   11    0    1    0    1    0
     2    0    0    0    0    0    0    7    1    0    0    1    0]
 [  15    2    7    4    0  784    3   77  322   14    3    0    0   22
    22    8    0    3   22   19    5   62    3   12   25    2    7]
 [  13    0    0    0    0 