<a href="https://colab.research.google.com/github/Partheban-M/Movie-genre-prediction/blob/main/project_movie_genre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from google.colab import files
import io

# Name of the training file expected among the uploaded files.
uploaded_file_name = "train_data.txt"

try:
    # Opens Colab's upload widget; the result maps file name -> raw bytes.
    uploaded = files.upload()

    # Several files may be uploaded at once, so select the training file by
    # name instead of relying on whichever key happens to come first.
    if uploaded_file_name in uploaded:
        actual_uploaded_file_name = uploaded_file_name
    elif len(uploaded) == 1:
        # A single upload is accepted under any name.
        # NOTE(review): presumably this covers Colab renaming a re-uploaded
        # file (e.g. "train_data (1).txt") — confirm.
        actual_uploaded_file_name = next(iter(uploaded))
    else:
        # Nothing uploaded, or several files and none is the training file.
        raise KeyError(uploaded_file_name)

    train_data_path = io.StringIO(uploaded[actual_uploaded_file_name].decode('utf-8'))

    train_records = []
    skipped_lines = 0
    # Each record is expected to look like "ID ::: TITLE ::: GENRE ::: PLOT".
    for line in train_data_path:
        line = line.strip()
        if not line:
            continue  # blank lines carry no record
        # maxsplit=3 keeps a plot that itself contains " ::: " in one piece
        # instead of producing extra parts and losing the record.
        parts = line.split(" ::: ", 3)
        if len(parts) == 4:
            movie_id, title, genre, plot = parts
            train_records.append((title, genre, plot))
        else:
            skipped_lines += 1

    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} malformed line(s).")

    train_df = pd.DataFrame(train_records, columns=["Title", "Genre", "Plot"])
    print(train_df.head())

except KeyError:
    print(f"Error: File '{uploaded_file_name}' not found in uploaded files. Please upload the correct file.")
except Exception as e:
    print(f"An error occurred: {e}")

Saving train_data.txt to train_data.txt
Saving test_data_solution.txt to test_data_solution.txt
Saving test_data.txt to test_data.txt
Saving description.txt to description.txt
                              Title     Genre  \
0      Oscar et la dame rose (2009)     drama   
1                      Cupid (1997)  thriller   
2  Young, Wild and Wonderful (1980)     adult   
3             The Secret Sin (1915)     drama   
4            The Unrecovered (2007)     drama   

                                                Plot  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  


In [None]:
import re
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK stop-word corpus (needed before the corpus import is used).
nltk.download('stopwords')
from nltk.corpus import stopwords


# English stop words used by clean_text to drop uninformative tokens.
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Lower-case `text`, drop stop words and strip punctuation.

    Stop words are matched *before* inner punctuation is removed, so entries
    of the stop-word list that contain an apostrophe (for example "don't")
    are filtered out too. Stripping punctuation first would turn them into
    "dont", which is not in the list and would be kept.

    Args:
        text: The raw plot text (a string).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in text.lower().split():
        # Trim punctuation at the token edges only ("(the)" -> "the",
        # "don't," -> "don't") so contractions can still match the list.
        token = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', token)
        if token in stop_words:
            continue
        # Remove the remaining (inner) punctuation, as the original did.
        token = re.sub(r'[^\w\s]', '', token)
        # Tokens made only of punctuation become empty; "it's" -> "its" may
        # become a stop word only after stripping, so check again.
        if token and token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


if 'train_df' not in globals():
    # The loading cell did not leave a DataFrame behind; nothing to prepare.
    print("Error: train_df is not defined. Please run the previous cell to load the data.")
else:
    # Add a normalised copy of each plot next to the raw text.
    train_df["Clean_Plot"] = train_df["Plot"].apply(clean_text)

    # Map genre names to integer ids; `le` is kept so the ids can later be
    # translated back into genre names.
    le = LabelEncoder()
    train_df["Encoded_Genre"] = le.fit_transform(train_df["Genre"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer


# Integer-encoded genre labels.
y = train_df["Encoded_Genre"]

# Split the *text* first. Fitting TF-IDF before the split would let the
# held-out plots influence the vocabulary and IDF weights (data leakage),
# making the reported scores optimistic.
plots_train, plots_test, y_train, y_test = train_test_split(
    train_df["Clean_Plot"], y, test_size=0.2, random_state=42
)

# Learn the vocabulary/IDF weights from the training plots only, then apply
# the same fitted transformation to the held-out plots.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(plots_train)
X_test = tfidf_vectorizer.transform(plots_test)

# Full feature matrix, kept under its original name for any code that still
# refers to `X`; it is not used for fitting.
X = tfidf_vectorizer.transform(train_df["Clean_Plot"])


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
if 'le' in globals():
    # LabelEncoder assigns ids 0..n_classes-1. Passing them explicitly keeps
    # `target_names` aligned even if a rare genre is absent from the test
    # split. zero_division=0 reports 0.0 (without warnings) for genres the
    # model never predicts.
    print(classification_report(
        y_test,
        y_pred,
        labels=list(range(len(le.classes_))),
        target_names=le.classes_,
        zero_division=0,
    ))
else:
    print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.577238771557687
              precision    recall  f1-score   support

      action       0.49      0.25      0.33       263
       adult       0.81      0.22      0.35       112
   adventure       0.43      0.14      0.21       139
   animation       0.67      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.59      0.55      1443
       crime       0.20      0.01      0.02       107
 documentary       0.67      0.84      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.46      0.08      0.14       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.63      0.55      0.58       431
       music       0.65      0.48      0.55       144
     musical       0.25      0.02      0.04        50
     mystery       0.00      0.00      0.00        56

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
def predict_genre(plot):
    """Return the predicted genre name for a single plot description.

    The plot is normalised with `clean_text`, turned into features by the
    fitted `tfidf_vectorizer`, classified by `model`, and the predicted label
    id is converted back to its genre name through `le`.
    """
    features = tfidf_vectorizer.transform([clean_text(plot)])
    encoded_label = model.predict(features)
    genre_names = le.inverse_transform(encoded_label)
    return genre_names[0]

# Quick manual check with a hand-written plot.
plot = "A young boy discovers he has magical powers and attends a school for wizards."
print("Predicted Genre:", predict_genre(plot))

Predicted Genre: animation


In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [None]:

import joblib

# Persist the three fitted objects to the current working directory:
# the classifier, the TF-IDF vectorizer and the label encoder.
# NOTE(review): presumably these files are loaded by app.py (not shown in this
# notebook) — confirm the file names match what the app expects.
# joblib files are pickle-based; only load them from a trusted source.
joblib.dump(model, "genre_classifier.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [None]:
!pip install streamlit pyngrok --quiet


In [None]:
# SECURITY: never commit an ngrok authtoken in a notebook. The token that was
# previously hard-coded here is exposed and should be revoked in the ngrok
# dashboard. The token is now read from the NGROK_AUTHTOKEN environment
# variable, falling back to a hidden interactive prompt.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok_authtoken = (
    os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
).strip()
if not ngrok_authtoken:
    raise ValueError("An ngrok authtoken is required to open a tunnel.")

# Saves the token to ngrok's configuration file without passing it through
# a shell command line.
ngrok.set_auth_token(ngrok_authtoken)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:

!pkill streamlit
from pyngrok import ngrok

public_url = ngrok.connect(addr="8501")
print("🌐 Streamlit app is live at:", public_url)

!streamlit run app.py &> /dev/null &

🌐 Streamlit app is live at: NgrokTunnel: "https://3c0c-35-201-234-101.ngrok-free.app" -> "http://localhost:8501"
