In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the parsed movie records from the CSV file
data = pd.read_csv("output.csv")

# Keep only rows that have both a genre and a description, then add an
# integer-encoded copy of the genre column for the classifier to learn from
label_encoder = LabelEncoder()
data = data.dropna(subset=['Genre', 'Description']).assign(
    Genre_encoded=lambda frame: label_encoder.fit_transform(frame['Genre'])
)

# Features are the free-text descriptions; targets are the encoded genres
X, y = data['Description'], data['Genre_encoded']

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary on the training text only (English stop words
# removed, capped at 1000 features), then reuse it to transform the test text
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the logistic-regression genre classifier on the TF-IDF features
logistic_classifier = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_tfidf, y_train
)

# Function to predict genre and its probability for a new description
def predict_genre(description):
    """Predict the genre of one description and the model's probability for it.

    Args:
        description: The text to classify (a single string).

    Returns:
        A ``(predicted_genre, predicted_probability)`` tuple: the genre label
        decoded through ``label_encoder`` and the probability that
        ``logistic_classifier`` assigns to that predicted class.
    """
    # The vectorizer expects an iterable of documents, so wrap the string.
    description_tfidf = tfidf_vectorizer.transform([description])
    prediction = logistic_classifier.predict(description_tfidf)
    probabilities = logistic_classifier.predict_proba(description_tfidf)
    predicted_genre = label_encoder.inverse_transform(prediction)[0]

    # predict_proba columns follow logistic_classifier.classes_. That equals
    # 0..n-1 only when every encoded genre occurs in the training split, so
    # find the column by position instead of using the label value as an index.
    class_column = list(logistic_classifier.classes_).index(prediction[0])
    predicted_probability = probabilities[0][class_column]

    return predicted_genre, predicted_probability

# Interactive demo: classify a description typed in by the user and report
# the predicted genre together with the model's probability for it
new_description = input("Enter the description: ")
predicted_genre, predicted_probability = predict_genre(new_description)
print(
    f"\nPredicted Genre: {predicted_genre}",
    f"Probability: {predicted_probability:.2f}",
    sep="\n",
)

Enter the description: In "Voices of the Deep," dive into the uncharted territories of the world's oceans to uncover the hidden lives of the extraordinary creatures that inhabit the depths. This documentary follows a team of marine biologists, oceanographers, and filmmakers as they embark on a groundbreaking expedition to explore underwater ecosystems that have never been captured on film before. From bioluminescent creatures in the midnight zone to the ancient, colossal squids lurking in the abyss, witness the marvels of marine life that defy imagination. The documentary also highlights the crucial role these ecosystems play in the planet's health and the urgent need for their conservation amidst the growing threats of climate change and human activity. Through stunning underwater cinematography and expert insights, "Voices of the Deep" offers a mesmerizing and educational journey into the heart of the ocean, revealing its mysteries and the efforts to protect its future.

Predicted Ge

In [None]:
import csv
import re

# Input and output file paths (relative, so resolved against the working directory).
# input_file holds the raw ":::"-delimited records to parse; output_file is
# the CSV that the parsed records are written to.
input_file = 'train_data.txt'
output_file = 'output.csv'

# Function to parse each line of text and extract details
def parse_line(line):
    """Split one raw record into its five fields.

    The expected layout is ``ID ::: Title (YYYY) ::: Genre ::: Description``,
    where the year is exactly four digits in parentheses after the title.

    Args:
        line: A single record line (already stripped by the caller).

    Returns:
        A tuple of strings ``(id, title, year, genre, description)`` when the
        line matches from its start, otherwise ``None``.
    """
    record_regex = r'(\d+) ::: (.+?) \((\d{4})\) ::: (.+?) ::: (.+)'
    found = re.match(record_regex, line)
    return found.groups() if found is not None else None

# Read the text file and parse each line.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (assumes the input file is UTF-8 — TODO confirm against the data source).
data = []
skipped_lines = 0
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no record
        parsed = parse_line(stripped)
        if parsed:
            data.append(parsed)
        else:
            # Track records the pattern rejected instead of dropping them silently
            skipped_lines += 1

# Write parsed data to CSV.
# newline='' lets the csv module control line endings; UTF-8 matches the
# default encoding pandas.read_csv uses when this file is loaded later.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write header
    csvwriter.writerow(['ID', 'Title', 'Year', 'Genre', 'Description'])
    # Write data rows
    csvwriter.writerows(data)

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that did not match the expected format")
print(f"Data successfully written to {output_file}")