In [1]:
pip install pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import json
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Convert the " ::: "-delimited training text file into a JSON list of records.
base_dir = os.getcwd()

input_file_path = os.path.join(base_dir,"Input_Files", "train_data.txt")
output_file_path = os.path.join(base_dir,"Input_Json_Files", "train_dataset.json")

json_data = []
skipped_lines = 0  # non-blank lines that did not provide all four fields

try:
    with open(input_file_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines are not records and are not counted as malformed.
                continue
            # maxsplit=3 keeps a description that itself contains " ::: " in one piece
            # instead of producing extra fields and losing the whole record.
            parts = stripped.split(" ::: ", 3)
            if len(parts) == 4:
                entry = {
                    "ID": parts[0],
                    "TITLE": parts[1],
                    "GENRE": parts[2],
                    "DESCRIPTION": parts[3]
                }
                json_data.append(entry)
            else:
                skipped_lines += 1

    # Create the necessary directories if they don't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save to a JSON file
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(json_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
    if skipped_lines:
        # Surface dropped input rather than discarding it silently.
        print(f"Warning: skipped {skipped_lines} malformed line(s).")
except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\Encryptix\Movie Genre Classification\Input_Json_Files\train_dataset.json' is ready.


In [None]:
# Convert the " ::: "-delimited (unlabelled) test text file into a JSON list of records.
base_dir = os.getcwd()

input_file_path = os.path.join(base_dir,"Input_Files", "test_data.txt")
output_file_path = os.path.join(base_dir,"Input_Json_Files", "test_dataset.json")

json_data = []
skipped_lines = 0  # non-blank lines that did not provide all three fields

try:
    with open(input_file_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines are not records and are not counted as malformed.
                continue
            # maxsplit=2 keeps a description that itself contains " ::: " in one piece
            # instead of producing extra fields and losing the whole record.
            parts = stripped.split(" ::: ", 2)
            if len(parts) == 3:
                entry = {
                    "ID": parts[0],
                    "TITLE": parts[1],
                    "DESCRIPTION": parts[2]
                }
                json_data.append(entry)
            else:
                skipped_lines += 1

    # Create the output directory if needed so this cell also works when run on its own.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save to a JSON file
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(json_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
    if skipped_lines:
        # Surface dropped input rather than discarding it silently.
        print(f"Warning: skipped {skipped_lines} malformed line(s).")

except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\Encryptix\Movie Genre Classification\Input_Json_Files\test_dataset.json' is ready.


In [None]:
# Convert the " ::: "-delimited labelled test text file into a JSON list of records.
base_dir = os.getcwd()

input_file_path = os.path.join(base_dir,"Input_Files", "test_data_solution.txt")
output_file_path = os.path.join(base_dir,"Input_Json_Files", "test_dataset_solution.json")

json_data = []
skipped_lines = 0  # non-blank lines that did not provide all four fields

try:
    with open(input_file_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines are not records and are not counted as malformed.
                continue
            # maxsplit=3 keeps a description that itself contains " ::: " in one piece
            # instead of producing extra fields and losing the whole record.
            parts = stripped.split(" ::: ", 3)
            if len(parts) == 4:
                entry = {
                    "ID": parts[0],
                    "TITLE": parts[1],
                    "GENRE": parts[2],
                    "DESCRIPTION": parts[3]
                }
                json_data.append(entry)
            else:
                skipped_lines += 1

    # Create the output directory if needed so this cell also works when run on its own.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save to a JSON file
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(json_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
    if skipped_lines:
        # Surface dropped input rather than discarding it silently.
        print(f"Warning: skipped {skipped_lines} malformed line(s).")

except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\Encryptix\Movie Genre Classification\Input_Json_Files\test_dataset_solution.json' is ready.


In [6]:
# Resolve the three converted JSON datasets relative to the working directory.
base_dir = os.getcwd()
json_dir = os.path.join(base_dir, "Input_Json_Files")
train_file_path = os.path.join(json_dir, "train_dataset.json")
test_file_path = os.path.join(json_dir, "test_dataset.json")
test_soln_file_path = os.path.join(json_dir, "test_dataset_solution.json")

# Read each dataset into its own DataFrame (train, then test, then test solution).
df_train, df_test, df_test_sol = (
    pd.read_json(dataset_path)
    for dataset_path in (train_file_path, test_file_path, test_soln_file_path)
)

In [7]:
# Preview the first five training rows (five is the default for head()).
df_train.head()

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [8]:
# Preview the first five unlabelled test rows (five is the default for head()).
df_test.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [9]:
# Display the labelled test set; the rendered output abbreviates the middle rows.
df_test_sol

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a marti...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Dar..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their ...
54197,54198,Oliver Twink (2007),adult,"A movie 169 years in the making. Oliver Twist,..."
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard ..."


In [None]:
def load_data(filepath):
    """
    Read the JSON file at ``filepath`` and return its contents as a Pandas DataFrame.
    """
    return pd.read_json(filepath)

def preprocess_data(df):
    """
    Return only the rows of ``df`` that have both a DESCRIPTION and a GENRE value.

    The input frame is left untouched; surviving rows keep their original index labels.
    """
    has_description = df['DESCRIPTION'].notna()
    has_genre = df['GENRE'].notna()
    return df[has_description & has_genre]

def train_model(train_file, model_dir=None):
    """
    Train a text classification model and save the pipeline.

    Parameters
    ----------
    train_file : str
        Path to the JSON training file; it must provide DESCRIPTION and GENRE columns.
    model_dir : str, optional
        Directory in which the fitted pipeline is stored. Defaults to a
        "Model_Trained" folder under the current working directory.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted TF-IDF + logistic regression pipeline, so callers can use it
        directly without reloading it from disk.
    """
    # Load and preprocess data
    df = load_data(train_file)
    df = preprocess_data(df)

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(df['DESCRIPTION'], df['GENRE'], test_size=0.2, random_state=42)

    # Create a pipeline with TF-IDF Vectorizer and Logistic Regression
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
        ('classifier', LogisticRegression(max_iter=1000, C=1.0))
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model on the held-out rows
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Accuracy:", accuracy)
    # zero_division=0 reports 0 (instead of warning) for genres that were never predicted.
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

    # Ensure the model directory exists (default: Model_Trained under the working directory)
    if model_dir is None:
        model_dir = os.path.join(os.getcwd(), "Model_Trained")
    os.makedirs(model_dir, exist_ok=True)

    # Save the entire pipeline (both vectorizer and model) in the model directory
    model_output_path = os.path.join(model_dir, "movie_genre_classifier_pipeline.pkl")
    joblib.dump(pipeline, model_output_path)
    print(f"Pipeline saved to '{model_output_path}'.")

    return pipeline

if __name__ == "__main__":
    # Train on the converted training set that lives under the working directory.
    json_dir = os.path.join(os.getcwd(), "Input_Json_Files")
    train_file = os.path.join(json_dir, "train_dataset.json")
    train_model(train_file)


Model Accuracy: 0.5792677303329337
Classification Report:
               precision    recall  f1-score   support

      action       0.53      0.27      0.35       263
       adult       0.71      0.21      0.33       112
   adventure       0.42      0.14      0.21       139
   animation       0.61      0.11      0.18       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.58      0.55      1443
       crime       0.43      0.03      0.05       107
 documentary       0.66      0.85      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.41      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.63      0.56      0.59       431
       music       0.62      0.47      0.54       144
     musical       1.00      0.02      0.04        50
     mystery       0.0

In [None]:
def load_model(model_path):
    """
    Deserialize the trained pipeline stored at ``model_path``.

    Returns the loaded object on success. On any failure the error is printed
    and None is returned instead of raising.
    """
    try:
        loaded = joblib.load(model_path)
    except Exception as exc:
        print(f"Error loading model: {exc}")
        return None
    else:
        print("Model loaded successfully.")
        return loaded

def predict_genre(model, input_data):
    """
    Predict the genre for one movie record.

    ``input_data`` is a mapping with ID, TITLE and DESCRIPTION keys. The result
    repeats those three fields and adds PREDICTED_GENRE. If anything goes wrong
    (for example a missing key), the error is printed and None is returned.
    """
    try:
        # predict() takes a batch, so wrap the single description and unwrap the single answer.
        genre = model.predict([input_data["DESCRIPTION"]])[0]
        return {
            "ID": input_data["ID"],
            "TITLE": input_data["TITLE"],
            "DESCRIPTION": input_data["DESCRIPTION"],
            "PREDICTED_GENRE": genre,
        }
    except Exception as exc:
        print(f"Error predicting genre: {exc}")
        return None

def validate_model(model_path, input_file):
    """
    Predict genres for every record in a JSON input file and save the results.

    Parameters
    ----------
    model_path : str
        Path to the saved pipeline, loaded through ``load_model``.
    input_file : str
        Path to a JSON file holding a list of records with ID, TITLE and
        DESCRIPTION keys.

    The predictions are written to "Output/validation_results.json" under the
    current working directory. Records for which prediction fails are left out.
    If the model or the input file cannot be loaded, the function returns early
    without writing anything.
    """
    # Load the trained model
    model = load_model(model_path)
    # load_model signals failure with None; compare against that sentinel rather
    # than relying on the truthiness of the estimator object.
    if model is None:
        return

    # Load input data
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            input_data = json.load(file)
        print("Input data loaded successfully.")
    except Exception as e:
        print(f"Error loading input data: {e}")
        return

    # Predict genres for each movie in the input data
    results = []
    for item in input_data:
        result = predict_genre(model, item)
        if result:
            results.append(result)

    # Ensure the Output directory exists
    output_dir = os.path.join(os.getcwd(), "Output")
    os.makedirs(output_dir, exist_ok=True)

    # Save the results to the Output directory
    output_file = os.path.join(output_dir, "validation_results.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII titles readable, matching how the
        # input JSON files in this notebook are written.
        json.dump(results, f, indent=4, ensure_ascii=False)
    print(f"Validation results saved to '{output_file}'.")


if __name__ == "__main__":
    # Both the saved pipeline and the unlabelled test set are resolved from the working directory.
    project_root = os.getcwd()
    model_path = os.path.join(project_root, "Model_Trained", "movie_genre_classifier_pipeline.pkl")
    input_file = os.path.join(project_root, "Input_Json_Files", "test_dataset.json")

    validate_model(model_path, input_file)


Model loaded successfully.
Input data loaded successfully.
Validation results saved to 'd:\Project\Encryptix\Movie Genre Classification\Output\validation_results.json'.


In [None]:
def load_model(model_path):
    """
    Read the saved pipeline back from ``model_path``.

    Prints a confirmation and returns the loaded object; if loading raises,
    the error is printed and None is returned.
    """
    try:
        restored = joblib.load(model_path)
    except Exception as err:
        print(f"Error loading model: {err}")
        return None
    else:
        print("Model loaded successfully.")
        return restored

def predict_genre(model, description):
    """
    Predict the genre for a single movie description string.

    Returns the predicted label, or None (after printing the error) if the
    prediction raises.
    """
    try:
        # predict() takes a batch, so wrap the description and take the only result.
        return model.predict([description])[0]
    except Exception as err:
        print(f"Error predicting genre: {err}")
        return None

def get_movie_description():
    """
    Ask the user for a movie description and return what was typed.
    """
    print("Please enter the description of the movie:")
    # The prompt is printed on its own line above; input() itself shows no prompt text.
    return input()

def predict_genre_from_input(model_path):
    """
    Load the model and predict the genre for an individual movie description.

    Parameters
    ----------
    model_path : str
        Path to the saved pipeline, loaded through ``load_model``.

    The description is obtained from ``get_movie_description`` and the outcome
    is printed. Nothing is returned. The function stops early if the model
    cannot be loaded or if the description is blank.
    """
    # Load the trained model
    model = load_model(model_path)
    # load_model signals failure with None; compare against that sentinel rather
    # than relying on the truthiness of the estimator object.
    if model is None:
        return

    # Get movie description from user input
    description = get_movie_description()

    # A blank description gives the model nothing to classify, so do not report a
    # label for it.
    if not description or not description.strip():
        print("No description entered; unable to predict genre.")
        return

    # Predict genre
    predicted_genre = predict_genre(model, description)

    # Display the result (predict_genre returns None when prediction failed)
    if predicted_genre is not None:
        print(f"The predicted genre for the movie is: {predicted_genre}")
    else:
        print("Unable to predict genre.")

if __name__ == "__main__":
    # The saved pipeline is expected in the Model_Trained folder under the working directory.
    model_dir = os.path.join(os.getcwd(), "Model_Trained")
    model_path = os.path.join(model_dir, "movie_genre_classifier_pipeline.pkl")

    predict_genre_from_input(model_path)


Model loaded successfully.
Please enter the description of the movie:
The predicted genre for the movie is: short
