In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [35]:
train_data = pd.read_csv('/content/train_data.txt', sep=' ::: ', engine='python', names=['Title', 'Genre', 'Description'], nrows=6000)
train_data.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [36]:
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 1 to 9133
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        6000 non-null   object
 1   Genre        6000 non-null   object
 2   Description  6000 non-null   object
dtypes: object(3)
memory usage: 187.5+ KB
None


In [37]:
print(train_data.isnull().sum())

Title          0
Genre          0
Description    0
dtype: int64


In [39]:
test_data = pd.read_csv('/content/test_data.txt', sep=' ::: ', engine='python', names=['Title', 'Genre', 'Description'], nrows=6000)
test_data.head()

Unnamed: 0,Title,Genre,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


In [41]:
def clean_text(text):
    text = text.lower()  # Lowercase all characters
    text = re.sub(r'@\S+', '', text)  # Remove Twitter handles
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'pic.\S+', '', text)
    text = re.sub(r"[^a-zA-Z+']", ' ', text)  # Keep only characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')  # Keep words with length > 1 only
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.word_tokenize(text)
    stopwords = nltk.corpus.stopwords.words('english')  # Remove stopwords
    text = " ".join([i for i in words if i not in stopwords and len(i) > 2])
    text = re.sub("\s[\s]+", " ", text).strip()  # Remove repeated/leading/trailing spaces
    return text

In [43]:
!pip install nltk



In [47]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [48]:
# Apply the clean_text function to the 'Description' column in the training and test data
train_data['Text_cleaning'] = train_data['Description'].apply(clean_text)
test_data['Text_cleaning'] = test_data['Description'].apply(clean_text)

In [50]:
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['Text_cleaning'])
X_test = tfidf_vectorizer.transform(test_data['Text_cleaning'])

In [51]:
X = X_train
y = train_data['Genre']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
print(classification_report(y_val, y_pred, zero_division=0.0))

Validation Accuracy: 0.4116666666666667
              precision    recall  f1-score   support

      action       0.00      0.00      0.00        36
       adult       0.00      0.00      0.00        22
   adventure       0.00      0.00      0.00        15
   animation       0.00      0.00      0.00         7
   biography       0.00      0.00      0.00         2
      comedy       0.00      0.00      0.00       170
       crime       0.00      0.00      0.00        11
 documentary       0.55      0.82      0.66       297
       drama       0.33      0.90      0.48       276
      family       0.00      0.00      0.00        22
     fantasy       0.00      0.00      0.00         7
   game-show       0.00      0.00      0.00         5
     history       0.00      0.00      0.00        10
      horror       0.00      0.00      0.00        45
       music       0.00      0.00      0.00        12
     musical       0.00      0.00      0.00         6
     mystery       0.00      0.00      0.

In [53]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
print(classification_report(y_val, y_pred, zero_division=0.0))

Validation Accuracy: 0.465
              precision    recall  f1-score   support

      action       0.00      0.00      0.00        36
       adult       0.00      0.00      0.00        22
   adventure       0.00      0.00      0.00        15
   animation       0.00      0.00      0.00         7
   biography       0.00      0.00      0.00         2
      comedy       0.50      0.31      0.38       170
       crime       0.00      0.00      0.00        11
 documentary       0.54      0.88      0.67       297
       drama       0.40      0.87      0.54       276
      family       0.00      0.00      0.00        22
     fantasy       0.00      0.00      0.00         7
   game-show       0.00      0.00      0.00         5
     history       0.00      0.00      0.00        10
      horror       1.00      0.02      0.04        45
       music       0.00      0.00      0.00        12
     musical       0.00      0.00      0.00         6
     mystery       0.00      0.00      0.00         9


In [54]:
X_test_predictions = classifier.predict(X_test)
test_data['Predicted_Genre'] = X_test_predictions
test_data.to_csv('predicted_genres.csv', index=False)
print(test_data)

      Title                             Genre  \
0         1              Edgar's Lunch (1998)   
1         2          La guerra de papá (1977)   
2         3       Off the Beaten Track (2010)   
3         4            Meu Amigo Hindu (2015)   
4         5                 Er nu zhai (1955)   
...     ...                               ...   
5995   7645               Do Your Duty (1928)   
5996   7646               Vamik's Room (2016)   
5997   7647            United We Stand (2003)   
5998   7648  "The Steam Video Company" (1984)   
5999   7649             "Dave's Place" (1965)   

                                            Description  \
0     L.R. Brane loves his life - his car, his apart...   
1     Spain, March 1964: Quico is a very naughty chi...   
2     One year in the life of Albin and his family o...   
3     His father has died, he hasn't spoken with his...   
4     Before he was known internationally as a marti...   
...                                                 ...  