## Training the Model

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Load the training data
train_data = pd.read_csv('train_data.txt', sep=' ::: ', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Load the test data
test_data = pd.read_csv('test_data.txt', sep=' ::: ', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

# Preprocess the data
X_train = train_data['DESCRIPTION']
y_train = train_data['GENRE']
X_test = test_data['DESCRIPTION']

# Split the training data for validation purposes
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Logistic Regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the model
pipeline.fit(X_train_split, y_train_split)

# Make predictions on the validation data
y_val_pred = pipeline.predict(X_val_split)

# Evaluate the model
print(classification_report(y_val_split, y_val_pred))

# Make predictions on the test data
test_predictions = pipeline.predict(X_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

      action       0.55      0.21      0.30       263
       adult       0.84      0.19      0.31       112
   adventure       0.47      0.10      0.17       139
   animation       0.75      0.03      0.06       104
   biography       0.00      0.00      0.00        61
      comedy       0.53      0.58      0.55      1443
       crime       0.33      0.01      0.02       107
 documentary       0.65      0.86      0.74      2659
       drama       0.53      0.81      0.64      2697
      family       0.47      0.05      0.10       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.92      0.30      0.45        40
     history       0.00      0.00      0.00        45
      horror       0.69      0.57      0.62       431
       music       0.69      0.38      0.49       144
     musical       0.00      0.00      0.00        50
     mystery       0.00      0.00      0.00        56
        news       0.00    

## Exporting the model

In [2]:
import joblib

# Assuming 'pipeline' is your trained model pipeline from the previous code

# Define a filename for your model
model_filename = 'imdb_genre_classifier.pkl'

# Save the model to a file using joblib
joblib.dump(pipeline, model_filename)

print(f"Model saved as {model_filename}")


Model saved as imdb_genre_classifier.pkl


## Predicting the test data

In [3]:
import pandas as pd
import joblib

# Load the test data
test_data = pd.read_csv('test_data.txt', sep=' ::: ', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

# Load the trained model
model_filename = 'imdb_genre_classifier.pkl'
loaded_model = joblib.load(model_filename)

# Extract descriptions from test data
X_test = test_data['DESCRIPTION']

# Make predictions using the loaded model
predictions = loaded_model.predict(X_test)

# Prepare the output DataFrame
output = pd.DataFrame({'ID': test_data['ID'], 'TITLE': test_data['TITLE'], 'PREDICTED_GENRE': predictions})

# Save the predictions to a CSV file
output.to_csv('test_predictions.csv', index=False)

print("Predictions saved to test_predictions.csv")


Predictions saved to test_predictions.csv


In [4]:
test_predictions

array(['comedy', 'drama', 'documentary', ..., 'drama', 'drama',
       'documentary'], dtype=object)