In [2]:
!pip install nltk


Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 2.4 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 2.2 MB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.2 MB/s  0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [3]:
import nltk
# Fetch the stop-word corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')
# Fetch the punkt tokenizer data.
# NOTE(review): no NLTK tokenizer call is visible in this notebook (clean_text
# splits on whitespace) - confirm whether this download is still needed.
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

In [5]:
# Training set: fields are delimited by ':::' and the file carries no header
# row, so the column names are supplied here. A multi-character separator is
# handled by the Python parsing engine.
train_df = pd.read_csv(
    'train_data.txt',
    sep=':::',
    engine='python',
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)

# Test set: same layout, but without the GENRE column.
test_df = pd.read_csv(
    'test_data.txt',
    sep=':::',
    engine='python',
    names=['ID', 'TITLE', 'DESCRIPTION'],
)

# Helpers for cleaning the text: a Lancaster stemmer and the English stop-word set.
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw description into a space-separated string of stems.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace (this removes digits as well as punctuation), split on
    whitespace, discard stop words, and stem the remaining words.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing field) is treated as
            an empty description.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # A missing field arrives as a float NaN, which has no .lower();
        # return an empty document instead of crashing the whole .apply().
        return ''
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and whitespace
    words = text.split()
    # NOTE(review): characters such as apostrophes are stripped before the
    # stop-word lookup, so any stop-word entry containing one would not match
    # here - confirm against the stop-word list.
    words = [stemmer.stem(word) for word in words if word not in stop_words]  # Remove stop words and apply stemming
    return ' '.join(words)

# Clean the DESCRIPTION column of both frames. The column is overwritten in
# place, so re-running this cell cleans text that has already been cleaned.
for frame in (train_df, test_df):
    frame['DESCRIPTION'] = frame['DESCRIPTION'].apply(clean_text)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cleaned descriptions that feed the vectorizer.
train_descriptions = train_df['DESCRIPTION']
test_descriptions = test_df['DESCRIPTION']

# TF-IDF vectorizer configured with max_features=5000.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training descriptions only; the test
# descriptions are transformed with that already-fitted vectorizer.
X_train = tfidf_vectorizer.fit_transform(train_descriptions)
X_test = tfidf_vectorizer.transform(test_descriptions)

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Target variable: the genre label of every training row.
y_train = train_df['GENRE']

# Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Logistic regression model, allowed up to 1000 solver iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Support Vector Machine (SVM) model. random_state seeds the solver's
# pseudo-random number generator so that Restart & Run All reproduces the
# same fitted model and the same reported accuracy.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)



In [8]:
from sklearn.metrics import accuracy_score

# Ground-truth genres for the test set, read with the same ':::' layout as
# the training file.
test_solution_df = pd.read_csv(
    'test_data_solution.txt',
    sep=':::',
    engine='python',
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)
y_test = test_solution_df['GENRE']

# Predict with every fitted classifier first ...
y_pred_nb = nb_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)

# ... then score each set of predictions against the true labels ...
accuracy_nb = accuracy_score(y_test, y_pred_nb)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

# ... and report them in the same order as before.
print(f"Naive Bayes Accuracy: {accuracy_nb}")
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(f"SVM Accuracy: {accuracy_svm}")

Naive Bayes Accuracy: 0.5102029520295203
Logistic Regression Accuracy: 0.5803505535055351
SVM Accuracy: 0.567859778597786


In [9]:
# New movie plot
new_plot = "A young boy discovers he has magical powers and goes to a special school for wizards."

# Clean the new plot with the same function used for the training text
cleaned_plot = clean_text(new_plot)

# Transform the cleaned plot using the fitted TfidfVectorizer
new_plot_vector = tfidf_vectorizer.transform([cleaned_plot])

# Choose the classifier by its measured test accuracy instead of assuming
# which one was best (the earlier evaluation cell computes these scores).
candidate_models = [
    (accuracy_nb, nb_model),
    (accuracy_lr, lr_model),
    (accuracy_svm, svm_model),
]
best_accuracy, best_model = max(candidate_models, key=lambda pair: pair[0])

predicted_genre = best_model.predict(new_plot_vector)
# The stored labels carry the padding that surrounds the ':::' separators in
# the data files, so strip it for display only.
print(f"Predicted Genre: {predicted_genre[0].strip()}")

Predicted Genre:  fantasy 


In [10]:
# The labels are re-read from the same columns as before.
y_train = train_df['GENRE']
y_test = test_solution_df['GENRE']

# Rebuild the features with ngram_range=(1, 2) and max_features=10000.
# This rebinds tfidf_vectorizer, X_train and X_test, replacing the earlier
# max_features=5000 versions.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

train_descriptions = train_df['DESCRIPTION']
test_descriptions = test_df['DESCRIPTION']
X_train = tfidf_vectorizer.fit_transform(train_descriptions)
X_test = tfidf_vectorizer.transform(test_descriptions)

In [11]:
# Refit logistic regression on the n-gram features.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Step 5: test the model
y_pred_lr = lr_model.predict(X_test)
accuracy_lr_ngram = accuracy_score(y_test, y_pred_lr)

# The baseline comes from accuracy_lr (computed in the earlier evaluation
# cell) rather than a hand-typed number, so it cannot drift from the real score.
print(f"Pehle wali accuracy (bina N-grams): {accuracy_lr * 100:.1f}%")
print(f"Nayi accuracy (N-grams ke saath): {accuracy_lr_ngram * 100:.1f}%")

Pehle wali accuracy (bina N-grams): 58.0%
Nayi accuracy (N-grams ke saath): 58.6%


In [13]:
# SVM model on the n-gram features. random_state seeds the solver's
# pseudo-random number generator so the reported accuracy is reproducible.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm_ngram = accuracy_score(y_test, y_pred_svm)

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb_ngram = accuracy_score(y_test, y_pred_nb)

# Step 5: compare the results. The baselines come from accuracy_svm and
# accuracy_nb (computed in the earlier evaluation cell) instead of
# hand-typed numbers.
print("--- SVM Model ---")
print(f"Pehle wali accuracy (bina N-grams): {accuracy_svm * 100:.1f}%")
print(f"Nayi accuracy (N-grams ke saath): {accuracy_svm_ngram * 100:.1f}%\n")

print("--- Naive Bayes Model ---")
print(f"Pehle wali accuracy (bina N-grams): {accuracy_nb * 100:.1f}%")
print(f"Nayi accuracy (N-grams ke saath): {accuracy_nb_ngram * 100:.1f}%")



--- SVM Model ---
Pehle wali accuracy (bina N-grams): 56.8%
Nayi accuracy (N-grams ke saath): 57.0%

--- Naive Bayes Model ---
Pehle wali accuracy (bina N-grams): 51.0%
Nayi accuracy (N-grams ke saath): 50.8%


In [15]:
from sklearn.model_selection import GridSearchCV
import re


In [17]:
# Values of C to try during the search.
param_grid = {'C': [0.1, 1, 10]}

# Base estimator handed to the search, evaluated with 3-fold cross-validation.
lr_model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(lr_model, param_grid, cv=3)

print("Hyperparameter tuning shuru ho gaya hai...")
grid_search.fit(X_train, y_train)

Hyperparameter tuning shuru ho gaya hai...


In [18]:
# Report the selected hyperparameter and score the refitted best estimator
# on the test set.
print(f"\nBest 'C' value mili: {grid_search.best_params_['C']}")
best_lr_model = grid_search.best_estimator_
y_pred = best_lr_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)

# The untuned baseline is read from accuracy_lr_ngram (computed when logistic
# regression was evaluated on the n-gram features) instead of being typed in.
print(f"Pehle wali accuracy (bina tuning): {accuracy_lr_ngram * 100:.1f}%")
print(f"Final accuracy (Hyperparameter Tuning ke baad): {final_accuracy * 100:.1f}%")



Best 'C' value mili: 1
Pehle wali accuracy (bina tuning): 58.6%
Final accuracy (Hyperparameter Tuning ke baad): 58.6%


In [22]:
import joblib
print("Saving vectorizer and model to files...")

# Save the fitted vectorizer object to a file
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save the fitted, tuned classifier. `lr_model` was re-created in the
# grid-search cell and only passed to GridSearchCV, which fits clones of it,
# so `lr_model` itself is unfitted at this point and would fail on predict()
# after loading. `best_lr_model` is the refitted best estimator.
joblib.dump(best_lr_model, 'genre_classifier.pkl')

print("\nModel and vectorizer have been saved successfully! ✅")

Saving vectorizer and model to files...

Model and vectorizer have been saved successfully! ✅


In [21]:
import sklearn

# Show which scikit-learn release is installed in this kernel.
sklearn_version = sklearn.__version__
print(f"Scikit-learn version: {sklearn_version}")

Scikit-learn version: 1.3.2
