In [3]:
import pandas as pd
# Load the pre-cleaned train and test tables from CSV files in the working directory.
train_data = pd.read_csv('cleaned_train.csv')
test_data = pd.read_csv('cleaned_test.csv')

In [4]:
# Inputs are the 'description' column, targets are the 'genre' column,
# taken separately from the train table and from the test table.
x, y = train_data['description'], train_data['genre']
x_test, y_test = test_data['description'], test_data['genre']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline

In [6]:
# Three independent text classifiers. Each pipeline owns its own TfidfVectorizer,
# so every one of them accepts raw description strings directly.
logistic_pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=42),
)
svm_pipeline = make_pipeline(
    TfidfVectorizer(),
    # probability=True makes SVC expose predict_proba, which soft voting relies on.
    SVC(kernel='linear', probability=True, random_state=42),
)
decision_tree_pipeline = make_pipeline(
    TfidfVectorizer(),
    DecisionTreeClassifier(random_state=42),
)

In [7]:
# Soft-voting ensemble over the three pipelines defined in the previous cell.
ensemble_members = [
    ('logistic', logistic_pipeline),
    ('svm', svm_pipeline),
    ('decision_tree', decision_tree_pipeline),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows as a validation split; the seed is fixed
# so the same rows are selected on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from time import time
# Fit the voting ensemble on the training split and report wall-clock duration.
start = time()
voting_clf.fit(X_train, y_train)
end = time()
elapsed = round(end - start, 2)
print(f' \nTraining Time: {elapsed} sec')

 
Training Time: 4624.65 sec


In [11]:
# Score the voting ensemble on the held-out validation split.
y_pred = voting_clf.predict(X_valid)

# Per-class precision / recall / F1 table.
print("Classification Report:")
voting_report = classification_report(y_valid, y_pred)
print(voting_report)

# Overall fraction of validation rows predicted correctly.
voting_accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy Score:", voting_accuracy)

Classification Report:
               precision    recall  f1-score   support

      action        0.25      0.14      0.18       263
       adult        0.43      0.36      0.39       112
   adventure        0.17      0.09      0.11       139
   animation        0.09      0.04      0.05       104
   biography        0.00      0.00      0.00        61
      comedy        0.44      0.45      0.44      1443
       crime        0.06      0.03      0.04       107
 documentary        0.63      0.76      0.69      2659
       drama        0.51      0.65      0.57      2697
      family        0.13      0.07      0.09       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.67      0.50      0.57        40
     history        0.00      0.00      0.00        45
      horror        0.45      0.50      0.47       431
       music        0.48      0.44      0.46       144
     musical        0.14      0.06      0.08        50
     mystery        0.00      0.00      0

In [13]:
import joblib
# Persist the fitted ensemble to disk; the call returns the list of files written.
joblib.dump(voting_clf, filename='voting_classifier.pkl')

['voting_classifier.pkl']

In [37]:
# Classify one description typed by the user with the voting ensemble.
# NOTE(review): `clean` is defined in a later cell of this notebook (executed
# earlier, as In [16]); on a top-to-bottom run this cell raises NameError unless
# that definition is moved above it.
raw_description = input()
pred = voting_clf.predict([clean(raw_description)])
print(pred[0])

 British POWs are forced to build a railway bridge across the river Kwai for their Japanese captors in occupied Burma, not knowing that the allied forces are planning a daring commando raid through the jungle to destroy it.


 documentary 


In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string

# Shared resources used by clean().
# The stemmer is stateless setup; the stopword list is materialised once as a set
# for fast membership tests.
stemmer = PorterStemmer()
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object
# to a plain set of English stopwords; later code sees only the set.
stopwords = set(stopwords.words('english'))
def clean(text):
    """Normalise a raw description into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Replace URLs (``http`` followed by any run of non-space characters) with a space.
      2. Replace every remaining non-word character with a space.
      3. Lowercase and tokenize with ``word_tokenize``.
      4. Keep only alphabetic tokens that are not in ``stopwords`` and stem them.

    Args:
        text: The raw description string.

    Returns:
        The stemmed tokens joined by single spaces (empty string if none survive).

    NOTE(review): the earlier patterns were ``r'/W'`` and ``r'http\\+S'``, which only
    matched the literal strings "/W" and "http+S", so neither substitution had any
    practical effect. If the cleaned CSVs were produced with those patterns,
    regenerate them with this version so training and inference preprocessing agree.
    """
    # URLs must go first: once punctuation is blanked out a URL is split into
    # word-like fragments ("http", "example", "com") that would survive filtering.
    text = re.sub(r'http\S+', ' ', text)
    # \W is the regex class for non-word characters (punctuation, symbols, ...).
    text = re.sub(r'\W', ' ', text)
    tokens = word_tokenize(text.lower())
    stemmed_words = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stopwords
    ]
    return " ".join(stemmed_words)
    

In [25]:
# Standalone TF-IDF features for models that take a matrix rather than raw text.
vectorizer = TfidfVectorizer()

# The vocabulary is learned from the training split only; the validation split
# is projected with that same fitted vectorizer.
x_train_vec = vectorizer.fit_transform(X_train)
x_valid_vec = vectorizer.transform(X_valid)

In [26]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Balanced random forest trained on the TF-IDF matrix.
# sampling_strategy / replacement / bootstrap are pinned explicitly:
# imbalanced-learn is changing the defaults of these three parameters, which is
# presumably what the three warnings emitted by this cell were about (confirm
# against the installed version). The values below are the ones the library
# used when they were left unset, so results stay the same across upgrades.
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    sampling_strategy='auto',
    replacement=False,
    bootstrap=True,
    random_state=42,
)
brf.fit(x_train_vec, y_train)


  warn(
  warn(
  warn(


In [27]:
# Score the balanced random forest on the vectorized validation split.
y_pred = brf.predict(x_valid_vec)

# Per-class precision / recall / F1 table.
print("Classification Report:")
brf_report = classification_report(y_valid, y_pred)
print(brf_report)

# Overall fraction of validation rows predicted correctly.
brf_accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy Score:", brf_accuracy)

Classification Report:
               precision    recall  f1-score   support

      action        0.16      0.13      0.14       263
       adult        0.12      0.61      0.21       112
   adventure        0.12      0.14      0.13       139
   animation        0.08      0.34      0.14       104
   biography        0.01      0.03      0.01        61
      comedy        0.36      0.15      0.22      1443
       crime        0.09      0.36      0.15       107
 documentary        0.79      0.30      0.44      2659
       drama        0.54      0.09      0.16      2697
      family        0.05      0.08      0.06       150
     fantasy        0.04      0.19      0.07        74
   game-show        0.13      0.88      0.23        40
     history        0.03      0.24      0.06        45
      horror        0.38      0.36      0.37       431
       music        0.21      0.69      0.32       144
     musical        0.05      0.30      0.08        50
     mystery        0.07      0.36      0

In [29]:
# Bundle the already-fitted vectorizer and forest so raw text can be passed
# straight to predict(); the pipeline itself is not fitted again.
brf_pipeline = make_pipeline(
    vectorizer,
    brf,
)

In [36]:
# Classify one description typed by the user with the balanced-forest pipeline.
# The text goes through clean() before it reaches the pipeline.
raw_description = input()
pred = brf_pipeline.predict([clean(raw_description)])
print(pred[0])

 British POWs are forced to build a railway bridge across the river Kwai for their Japanese captors in occupied Burma, not knowing that the allied forces are planning a daring commando raid through the jungle to destroy it.


 war 


In [38]:
# Persist the vectorizer + forest pipeline. The target file name contains spaces
# and has no extension; it is kept as-is so existing loaders still find it.
joblib.dump(brf_pipeline, filename='balanced random forest')

['balanced random forest']

In [39]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on TF-IDF features with class_weight='balanced',
# trained on the raw-text training split.
model_lr = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
)
model_lr.fit(X_train, y_train)


In [40]:
# Score the class-weighted logistic regression on the validation split.
y_pred = model_lr.predict(X_valid)

# Per-class precision / recall / F1 table.
print("Classification Report:")
lr_report = classification_report(y_valid, y_pred)
print(lr_report)

# Overall fraction of validation rows predicted correctly.
lr_accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy Score:", lr_accuracy)

Classification Report:
               precision    recall  f1-score   support

      action        0.31      0.44      0.36       263
       adult        0.34      0.65      0.45       112
   adventure        0.18      0.32      0.23       139
   animation        0.21      0.28      0.24       104
   biography        0.04      0.07      0.05        61
      comedy        0.59      0.49      0.53      1443
       crime        0.15      0.33      0.21       107
 documentary        0.79      0.58      0.67      2659
       drama        0.71      0.42      0.52      2697
      family        0.14      0.33      0.20       150
     fantasy        0.13      0.16      0.14        74
   game-show        0.65      0.78      0.70        40
     history        0.08      0.24      0.12        45
      horror        0.57      0.71      0.63       431
       music        0.36      0.80      0.50       144
     musical        0.11      0.26      0.16        50
     mystery        0.12      0.23      0