In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the dataset
# Read the movie-plots CSV into a DataFrame. The location is an absolute path
# under /content (presumably a Colab session directory -- adjust DATASET_CSV
# when running elsewhere).
DATASET_CSV = '/content/wiki_movie_plots_deduped.csv'
data = pd.read_csv(DATASET_CSV)

In [None]:
# Step 1: Data Cleaning and Filtering
# Keep only the plot text and its genre label, then discard every row in
# which either of those two values is missing.
text_and_label_columns = ['Plot', 'Genre']
data_cleaned = data.loc[:, text_and_label_columns].dropna()

In [None]:
# Remove rows with the genre "unknown"
# Build the boolean mask first so the filtering condition has a readable name.
has_known_genre = data_cleaned['Genre'].ne('unknown')
data_filtered = data_cleaned.loc[has_known_genre]

In [None]:
# Preview the first rows of the filtered frame (only the Plot and Genre columns remain).
data_filtered.head()

Unnamed: 0,Plot,Genre
6,The film opens with two bandits breaking into ...,western
7,The film is about a family who move to the sub...,comedy
10,The Rarebit Fiend gorges on Welsh rarebit at a...,short
11,The film features a train traveling through th...,short action/crime western
12,Irish villager Kathleen is a tenant of Captain...,short film


In [None]:
# Preview the first rows of the raw, unfiltered dataset with all of its columns.
data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [None]:
# Focus on the N_TOP_GENRES most frequent genres to keep the number of classes
# manageable. Note that this does NOT balance the classes: the most common
# genre still has several times more rows than the least common one, so
# per-class metrics should be read alongside overall accuracy.
N_TOP_GENRES = 5
top_genres = data_filtered['Genre'].value_counts().head(N_TOP_GENRES).index
data_top_genres = data_filtered[data_filtered['Genre'].isin(top_genres)]

In [None]:
# Step 2: Splitting the Data
# Features are the raw plot texts; targets are the corresponding genre labels.
X, y = data_top_genres['Plot'], data_top_genres['Genre']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the genre
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Inspect the training plot texts (the index holds the original row labels from the dataset).
X_train

Unnamed: 0,Plot
26884,"The plot is set in Chandigarh, where a wedding..."
28698,In a tribal village of Maharashtra is a school...
4481,Set in the English village Penny Green in 1939...
34863,Şaban Agha (Şevket Emrulla) lives in a small M...
20656,In a working class South London district lives...
...,...
13229,"Amidst a storm, Finnegan and his crew, Joey an..."
4810,County attorney Dave Connors is stuck in an un...
26442,When Roshni and Sudeep take their nine-year-ol...
19769,"Diana Scott (Julie Christie) is a beautiful, b..."


In [None]:
# Inspect the held-out plot texts used for evaluation.
X_test

Unnamed: 0,Plot
11093,The childless wife of a small town doctor in N...
4241,"A reporter marries a dying girl for her money,..."
5603,"Mae Doyle returns to her home town, the fishin..."
23390,"In Cambodia 1975, the eight-year-old Wai Lok-y..."
19586,"John Saunders (Bygraves), a supply teacher wit..."
...,...
19625,Victor 'Vic' Brown (Bates) is a draughtsman in...
32151,Chandu (Ravi Teja)'s entire life is about his ...
4322,Amy Fisher's parents can't understand what the...
2296,"Stella Martin, the daughter of a mill worker, ..."


In [None]:
# Inspect the training genre labels; they share their index with X_train.
y_train

Unnamed: 0,Genre
26884,comedy
28698,comedy
4481,drama
34863,drama
20656,drama
...,...
13229,horror
4810,drama
26442,horror
19769,drama


In [None]:
# Step 3: Text Vectorization using TF-IDF
# The vectorizer (at most 5000 features, English stop words removed) is
# fitted on the training plots only; the test plots are then transformed with
# that already-fitted vectorizer so no test information is used during fitting.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)

In [None]:
# Step 4: Model Training
# Create the multinomial Naive Bayes classifier and fit it on the training
# TF-IDF matrix in a single expression (fit() returns the estimator itself).
model = MultinomialNB().fit(X_train_tfidf, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [None]:
# Step 5: Model Evaluation
# Predict a genre label for every held-out plot.
y_pred = model.predict(X=X_test_tfidf)

In [None]:
# Score the predictions against the true labels: overall accuracy plus a
# per-genre precision / recall / F1 report.
scoring_inputs = dict(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(**scoring_inputs)
report = classification_report(**scoring_inputs)

In [None]:
# Display Results
# f-strings produce exactly the same text as the comma-separated print calls.
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n {report}")

Accuracy: 0.6173112338858195
Classification Report:
               precision    recall  f1-score   support

      action       0.66      0.20      0.31       220
      comedy       0.67      0.57      0.62       876
       drama       0.58      0.86      0.69      1193
      horror       0.85      0.43      0.57       233
    thriller       0.33      0.01      0.01       193

    accuracy                           0.62      2715
   macro avg       0.62      0.41      0.44      2715
weighted avg       0.62      0.62      0.58      2715

