In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Define file paths
train_data_path = 'train_data.txt'
test_data_path = 'test_data.txt'
test_solution_path = 'test_data_solution.txt'

def load_data(file_path, has_plot=True):
    # Load data with ' ::: ' separator
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if has_plot and len(parts) == 4:
                data.append(parts)
            elif not has_plot and len(parts) == 3:
                data.append(parts)
            else:
                print(f"Warning: Skipping line due to incorrect format: {line}")
    columns = ['id', 'title', 'genre', 'plot'] if has_plot else ['id', 'title', 'genre']
    return pd.DataFrame(data, columns=columns)

# Load the data
try:
    train_data = load_data(train_data_path)
    test_data = load_data(test_data_path, has_plot=False)
    test_solution = load_data(test_solution_path)
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise

# Check if data is loaded correctly
print("Train Data Sample:")
print(train_data.head())
print("\nTest Data Sample:")
print(test_data.head())
print("\nTest Solution Sample:")
print(test_solution.head())

# Data cleaning
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing
train_data['processed_plot'] = train_data['plot'].apply(preprocess_text)

# Check the processed data
print("\nProcessed Train Data Sample:")
print(train_data[['processed_plot']].head())

# TF-IDF Vectorization and Logistic Regression Pipeline
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('lr', LogisticRegression(max_iter=1000))
])

# Fit the pipeline on the training data
pipeline_lr.fit(train_data['processed_plot'], train_data['genre'])

# Predict on the validation set (example usage)
# Assume X_val and y_val are defined if you have a validation set
# y_pred_lr = pipeline_lr.predict(X_val)
# print("\nClassification Report with Logistic Regression:")
# print(classification_report(y_val, y_pred_lr))

# Function to predict genre from user input
def predict_genre(plot):
    processed_plot = preprocess_text(plot)
    prediction = pipeline_lr.predict([processed_plot])
    return prediction[0]

# Example usage
if __name__ == "__main__":
    user_plot = input("Enter a plot to predict its genre: ")
    genre_prediction = predict_genre(user_plot)
    print(f"The predicted genre is: {genre_prediction}")


Train Data Sample:
  id                             title     genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                                plot  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Test Data Sample:
  id                        title  \
0  1         Edgar's Lunch (1998)   
1  2     La guerra de papá (1977)   
2  3  Off the Beaten Track (2010)   
3  4       Meu Amigo Hindu (2015)   
4  5            Er nu zhai (1955)   

                                               genre  
0  L.R. Brane loves his life - his car, h

Enter a plot to predict its genre:  A man return home, only to find that his house is haunted and evil forces try to kill him.


The predicted genre is: horror
