<a href="https://colab.research.google.com/github/Rohitprakasam/CODSOFT/blob/main/movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')

from sklearn.metrics import accuracy_score
!pip install tensorflow-text

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
import nltk
import re
from nltk.corpus import stopwords
import tensorflow_hub as hub
import tensorflow_text as text

nltk.download('punkt')
nltk.download('wordnet')

# Load data
train_file_path = '/content/train_data.txt'
test_file_path = '/content/test_data_solution.txt'


def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path* (newlines kept)."""
    with open(path, 'r', encoding='utf-8') as file:
        return file.readlines()


def _parse_records(lines):
    """Split raw dataset lines into parallel name / genre / review lists.

    Every line is split on the ' ::: ' separator.  Field 1 is stored as the
    movie name, field 2 as the genre and field 3 (whitespace-stripped) as
    the review; field 0 is ignored (presumably a record id -- confirm
    against the data files).

    Lines with fewer than four fields, such as a blank trailing line, are
    skipped instead of raising IndexError.

    Returns:
        A ``(names, genres, reviews)`` tuple of equally long lists.
    """
    names, genres, reviews = [], [], []
    for line in lines:
        # maxsplit=3 keeps a review that itself contains ' ::: ' in one piece.
        parts = line.split(' ::: ', 3)
        if len(parts) < 4:
            continue
        names.append(parts[1])
        genres.append(parts[2])
        reviews.append(parts[3].strip())
    return names, genres, reviews


train_lines = _read_lines(train_file_path)
test_lines = _read_lines(test_file_path)

# Process each line and extract relevant information
train_movie_names, train_genres, train_reviews = _parse_records(train_lines)
test_movie_names, test_genres, test_reviews = _parse_records(test_lines)

# Create DataFrames
df_train = pd.DataFrame({
    'MovieName': train_movie_names,
    'Genre': train_genres,
    'Review': train_reviews,
})
df_test = pd.DataFrame({
    'MovieName': test_movie_names,
    'Genre': test_genres,
    'Review': test_reviews,
})

# Preprocess text
from functools import lru_cache

# Chat-speak abbreviations and the words they are expanded to.
_CHAT_WORDS_MAP = {'gr8': 'great', 'u': 'you', 'r': 'are', 'lol': 'laughing_out_loud'}


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalise one raw review string for modelling.

    Steps, in order: strip HTML markup, drop ``http...`` URLs, lowercase,
    remove punctuation, expand chat words, remove digits, tokenize with
    NLTK, expand chat words again on the tokens, and drop English stop
    words.

    Args:
        text: The raw review text.

    Returns:
        The remaining tokens joined by single spaces (may be empty).
    """
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'http\S+', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    # Expand chat words while digits are still present: once digits are
    # stripped, 'gr8' has already become 'gr' and can never match its entry.
    text = ' '.join(_CHAT_WORDS_MAP.get(word, word) for word in text.split())
    text = re.sub(r'\d+', '', text)

    # Second pass over the tokens catches words that only turn into a chat
    # word after digit removal or tokenization (e.g. 'u2' -> 'u' -> 'you').
    words = [_CHAT_WORDS_MAP.get(word, word) for word in word_tokenize(text)]

    stop_words = _english_stop_words()
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

# Clean the review text of both splits with preprocess().
df_train['Review'] = df_train['Review'].apply(preprocess)
df_test['Review'] = df_test['Review'].apply(preprocess)

# Replace any missing review with an empty string.  The result is assigned
# back rather than using fillna(inplace=True) on the column selection: that
# form is chained assignment, which pandas deprecates and which does not
# update the DataFrame when Copy-on-Write is enabled.
df_train['Review'] = df_train['Review'].fillna('')
df_test['Review'] = df_test['Review'].fillna('')

# Map genre labels to numerical values
# Ids are handed out in the order value_counts() lists the training genres.
genre_mapping = {
    label: index
    for index, label in enumerate(df_train['Genre'].value_counts().index)
}


def map_genre(text):
    """Return the integer id stored in ``genre_mapping`` for label *text*.

    Raises:
        KeyError: if *text* is not a key of ``genre_mapping``.
    """
    return genre_mapping[text]


# Replace the string labels of both splits with their integer ids.
df_train['Genre'] = df_train['Genre'].apply(map_genre)
df_test['Genre'] = df_test['Genre'].apply(map_genre)

# BERT setup
# TF Hub handles for the text-preprocessing model and the BERT encoder.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap each hub module as a Keras layer so it can be used in the model graph.
bert_preprocess, bert_encoder = (
    hub.KerasLayer(handle) for handle in (preprocess_url, encoder_url)
)

# Encode text using BERT
# The cleaned review columns are used directly as the model's string inputs.
train_text_input, test_text_input = df_train['Review'], df_test['Review']

# One-hot encode the integer genre ids, one column per entry in genre_mapping.
train_labels, test_labels = (
    to_categorical(frame['Genre'], num_classes=len(genre_mapping))
    for frame in (df_train, df_test)
)

# Build the model
# A scalar string input is run through the BERT preprocessing layer and then
# the BERT encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural Network Layers
# Classification head: dropout on the encoder's pooled output, then a softmax
# layer with one unit per genre.
dropout_output = tf.keras.layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
output_layer = tf.keras.layers.Dense(len(genre_mapping), activation='softmax', name='output')(dropout_output)

# Construct the final model
model = tf.keras.Model(inputs=[text_input], outputs=[output_layer])
metrics = [
    # CategoricalAccuracy compares the arg-max class of the prediction with
    # the one-hot label.  BinaryAccuracy would score every class slot
    # separately and report an inflated value, because most slots of a
    # one-hot target are 0 and trivially match.
    tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=metrics)

# Train the model
# Fit on the cleaned training reviews and their one-hot genre labels for 10
# epochs; every other fit() option is left at its default.
model.fit(train_text_input, train_labels, epochs=10)

# Evaluate the model
# Reports the loss and the compiled metrics on the test split.
model.evaluate(test_text_input, test_labels)

# Predictions
# One row of per-genre softmax scores for every test review.
predictions = model.predict(test_text_input)

# Post-process predictions
# Turn every row of class scores into a one-hot row that marks its arg-max
# class.  argmax() returns the first index when several scores tie, so each
# row has exactly one predicted class (the element-wise comparison used
# before could mark several tied classes at once).
predictions_binary = np.zeros_like(predictions)
predictions_binary[np.arange(len(predictions)), predictions.argmax(axis=1)] = 1

# Calculate accuracy
# With one-hot rows on both sides, accuracy_score is the fraction of reviews
# whose predicted genre row equals the true genre row.
accuracy = accuracy_score(test_labels, predictions_binary)
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  text = BeautifulSoup(text, 'html.parser').get_text()


Epoch 1/10