In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')


train_df = pd.read_csv('movieTrainData.txt', delimiter=' ::: ', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
test_df = pd.read_csv('movieTestData.txt', delimiter=' ::: ', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphabetic characters
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Join the tokens back into a single string
    return ' '.join(tokens)

# train_df['CLEANED_DESCRIPTION'] = train_df['DESCRIPTION'].apply(preprocess_text)
# test_df['CLEANED_DESCRIPTION'] = test_df['DESCRIPTION'].apply(preprocess_text)

# Display the first few rows of the processed training data
print(train_df[['ID', 'TITLE', 'GENRE', 'DESCRIPTION', 'CLEANED_DESCRIPTION']].head())

# Display the first few rows of the processed test data
print(test_df[['ID', 'TITLE', 'DESCRIPTION', 'CLEANED_DESCRIPTION']].head())

# train_df.to_csv('preprocessed_train_data.csv', index=False)
# test_df.to_csv('preprocessed_test_data.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srinj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srinj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                 CLEANED_DESCRIPTION  
0  listening conversation doctor parents oscar le...  
1  brother sister past incestuous relationship cu...  
2  bus empties students field trip museum natural...  
3  help unemployed father make ends meet edith tw...  
4  film title refers bodies ground zero also stat...  
   ID         

In [5]:
import pandas as pd

# Load the preprocessed data
train_df = pd.read_csv('preprocessed_train_data.csv')
test_df = pd.read_csv('preprocessed_test_data.csv')

# If you used pickle
# train_df = pd.read_pickle('preprocessed_train_data.pkl')
# test_df = pd.read_pickle('preprocessed_test_data.pkl')

# Display the first few rows to verify
print(train_df.head())
print(test_df.head())


   ID                             TITLE     GENRE  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                         DESCRIPTION  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                 CLEANED_DESCRIPTION  
0  listening conversation doctor parents oscar le...  
1  brother sister past incestuous relationship cu...  
2  bus empties students field trip museum natural...  
3  help unemployed father make ends meet edith tw...  
4  film title refers bodies ground zero also stat...  
   ID         

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data and transform the training data
X_train = vectorizer.fit_transform(train_df['CLEANED_DESCRIPTION']).toarray()

# Transform the test data (note: we use transform, not fit_transform)
X_test = vectorizer.transform(test_df['CLEANED_DESCRIPTION']).toarray()

# Get the target variable (genre) from the training data
y_train = train_df['GENRE']

# Display the shape of the resulting TF-IDF matrices
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')


X_train shape: (54214, 5000)
X_test shape: (54200, 5000)


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Create a pipeline that combines the TF-IDF vectorizer and a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(max_features=5000), MultinomialNB())

# Train the model on the training data
model.fit(train_df['CLEANED_DESCRIPTION'], train_df['GENRE'])

# Make predictions on the test set
predictions = model.predict(test_df['CLEANED_DESCRIPTION'])

# Add predictions to the test DataFrame
test_df['PREDICTED_GENRE'] = predictions

# Display the test DataFrame with predictions
print(test_df[['ID', 'TITLE', 'DESCRIPTION', 'PREDICTED_GENRE']].head())

# If you have true labels for the test data, evaluate the model
# (Uncomment and adjust the following lines if you have ground truth for the test data)
# true_labels = test_df['GENRE']  # Replace 'GENRE' with the actual column name if it exists
# print(classification_report(true_labels, predictions))


   ID                        TITLE  \
0   1         Edgar's Lunch (1998)   
1   2     La guerra de papá (1977)   
2   3  Off the Beaten Track (2010)   
3   4       Meu Amigo Hindu (2015)   
4   5            Er nu zhai (1955)   

                                         DESCRIPTION PREDICTED_GENRE  
0  L.R. Brane loves his life - his car, his apart...           drama  
1  Spain, March 1964: Quico is a very naughty chi...           drama  
2  One year in the life of Albin and his family o...     documentary  
3  His father has died, he hasn't spoken with his...           drama  
4  Before he was known internationally as a marti...           drama  


In [8]:
import pandas as pd

# Load the test data solution file
test_solution_file = 'test_data_solution.txt'  # Replace with your actual file path
test_solution_df = pd.read_csv(test_solution_file, delimiter=' ::: ', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


# Ensure the IDs in both DataFrames match
print(test_solution_df['ID'].head())
print(test_df['ID'].head())


0    1
1    2
2    3
3    4
4    5
Name: ID, dtype: int64
0    1
1    2
2    3
3    4
4    5
Name: ID, dtype: int64


In [11]:
merged_df = pd.merge(test_solution_df, test_df[['ID', 'PREDICTED_GENRE']], on='ID')

# Display the merged DataFrame
print(merged_df[['ID', 'TITLE', 'GENRE', 'PREDICTED_GENRE']].head())

# Evaluate the model
true_labels = merged_df['GENRE']
predicted_labels = merged_df['PREDICTED_GENRE']
print(classification_report(true_labels, predicted_labels))

   ID                        TITLE        GENRE PREDICTED_GENRE
0   1         Edgar's Lunch (1998)     thriller           drama
1   2     La guerra de papá (1977)       comedy           drama
2   3  Off the Beaten Track (2010)  documentary     documentary
3   4       Meu Amigo Hindu (2015)        drama           drama
4   5            Er nu zhai (1955)        drama           drama


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.57      0.10      0.18      1314
       adult       0.45      0.05      0.09       590
   adventure       0.75      0.06      0.12       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.51      0.42      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.87      0.69     13096
       drama       0.46      0.83      0.59     13612
      family       1.00      0.00      0.01       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.97      0.30      0.46       193
     history       0.00      0.00      0.00       243
      horror       0.70      0.35      0.46      2204
       music       0.76      0.15      0.26       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00       318
        news       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'movieGenreModel.pkl')

['movieGenreModel.pkl']

In [4]:
import joblib
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stop words and non-alphabetic characters
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Join the tokens back into a single string
    return ' '.join(tokens)

model = joblib.load('movieGenreModel.pkl')

# Example new description
new_description = "A group of friends embarks on a weekend getaway to a remote cabin in the woods. As night falls, they begin to experience eerie occurrences and unsettling sounds. What starts as a fun retreat quickly turns into a nightmare as they realize they are not alone. An ancient evil, awakened from its slumber, hunts them one by one, and their only hope for survival is to uncover the dark secrets of the cabin before it's too late."

# Preprocess the new description
cleaned_description = preprocess_text(new_description)

# Use the model to predict the genre
predicted_genre = model.predict([cleaned_description])

print(f"The predicted genre for the given description is: {predicted_genre[0]}")

NameError: name 'stopwords' is not defined