In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [None]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

# Paths to datasets
train_file = 'train_data.txt'
test_file = 'test_data.txt'
solution_file = 'test_data_solution.txt'


In [None]:
# Define column names
column_names = ['ID', 'Title', 'Genre', 'Plot']

# Load data with specified delimiters
train_data = pd.read_csv(train_file, delimiter=':::', engine='python', names=column_names)
test_data = pd.read_csv(test_file, delimiter=':::', engine='python', names=column_names)
solutions_data = pd.read_csv(solution_file, delimiter=':::', engine='python', names=column_names)

# Display the first few rows of each dataset
print("Train Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

print("\nSolutions Data:")
print(solutions_data.head())


Train Data:
   ID                               Title       Genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                                Plot  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  

Test Data:
   ID                          Title  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                               Genre  Plot  
0

In [None]:
# Text preprocessing function
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()


In [None]:
# Updated text preprocessing function
def clean_text(text):
    if pd.isnull(text):  # Handle NaN or null values
        return ''
    text = str(text)  # Ensure the input is a string
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Apply text cleaning to the 'Plot' column
train_data['Clean_Plot'] = train_data['Plot'].apply(clean_text)
test_data['Clean_Plot'] = test_data['Plot'].apply(clean_text)
solutions_data['Clean_Plot'] = solutions_data['Plot'].apply(clean_text)

# Display a sample of processed data
print(train_data[['ID', 'Clean_Plot']].head())
print(test_data[['ID', 'Clean_Plot']].head())
print(solutions_data[['ID', 'Clean_Plot']].head())


   ID                                         Clean_Plot
0   1  listening in to a conversation between his doc...
1   2  a brother and sister with a past incestuous re...
2   3  as the bus empties the students for their fiel...
3   4  to help their unemployed father make ends meet...
4   5  the film s title refers not only to the un rec...
   ID Clean_Plot
0   1           
1   2           
2   3           
3   4           
4   5           
   ID                                         Clean_Plot
0   1  l r brane loves his life his car his apartment...
1   2  spain march 1964 quico is a very naughty child...
2   3  one year in the life of albin and his family o...
3   4  his father has died he hasn t spoken with his ...
4   5  before he was known internationally as a marti...


In [None]:
encoder = LabelEncoder()
train_data['genre_encoded'] = encoder.fit_transform(train_data['Genre'])

In [None]:
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust the max_features

# Fit on training data and transform
X_train_tfidf = tfidf.fit_transform(train_data['Clean_Plot'])
X_test_tfidf = tfidf.transform(test_data['Clean_Plot'])

# Target variable
y_train = train_data['genre_encoded']

In [None]:
"""# Initialize models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='linear', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate models
accuracies = {}
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    test_predictions = model.predict(X_test_tfidf)
    true_genres = encoder.transform(solutions_data['Genre'])
    accuracy = accuracy_score(true_genres, test_predictions)
    accuracies[model_name] = accuracy
    print(f"Model: {model_name}\nAccuracy: {accuracy:.4f}\n")
    print(classification_report(true_genres, test_predictions, target_names=encoder.classes_))

# Report accuracies
print("\nSummary of Model Accuracies:")
for model_name, accuracy in accuracies.items():
    print(f"{model_name}: {accuracy:.4f}")"""

'# Initialize models\nmodels = {\n    \'Naive Bayes\': MultinomialNB(),\n    \'Logistic Regression\': LogisticRegression(max_iter=1000, random_state=42),\n    \'SVM\': SVC(kernel=\'linear\', random_state=42),\n    \'Random Forest\': RandomForestClassifier(n_estimators=100, random_state=42)\n}\n\n# Train and evaluate models\naccuracies = {}\nfor model_name, model in models.items():\n    model.fit(X_train_tfidf, y_train)\n    test_predictions = model.predict(X_test_tfidf)\n    true_genres = encoder.transform(solutions_data[\'Genre\'])\n    accuracy = accuracy_score(true_genres, test_predictions)\n    accuracies[model_name] = accuracy\n    print(f"Model: {model_name}\nAccuracy: {accuracy:.4f}\n")\n    print(classification_report(true_genres, test_predictions, target_names=encoder.classes_))\n\n# Report accuracies\nprint("\nSummary of Model Accuracies:")\nfor model_name, accuracy in accuracies.items():\n    print(f"{model_name}: {accuracy:.4f}")'

In [None]:
 from sklearn.ensemble import RandomForestClassifier

# Initialize and train Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test data
true_genres = encoder.transform(solutions['genre'])
test_predictions = random_forest_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(true_genres, test_predictions)
print(f"Random Forest Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(true_genres, test_predictions, target_names=encoder.classes_))

# Confusion Matrix
conf_matrix = confusion_matrix(true_genres, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
import joblib

# Save the model and vectorizer for future use
joblib.dump(random_forest_model, 'random_forest_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


In [None]:
# Load the saved model and vectorizer
loaded_model = joblib.load('random_forest_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example new movie plot
new_plot = "A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him."
cleaned_plot = clean_text(new_plot)  # Clean the text
transformed_plot = loaded_vectorizer.transform([cleaned_plot])  # Transform to TF-IDF

# Predict the genre
predicted_genre_encoded = loaded_model.predict(transformed_plot)
predicted_genre = encoder.inverse_transform(predicted_genre_encoded)
print(f"Predicted Genre: {predicted_genre[0]}")


Predicted Genre:  thriller 
