In [31]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [3]:
# Load the training data
train_data = pd.read_csv("train_data.txt", delimiter=':::', header=None, engine='python')
train_data.columns = ["id", "Title", "Genre", "Description"]

In [4]:
# Load the test data
test_data = pd.read_csv("test_data.txt", delimiter=':::', header=None, engine='python')

In [7]:
# Load the solutions for the test data (if available)
test_data_solution = pd.read_csv("test_data_solution.txt", delimiter=':::', header=None, engine='python')
test_data_solution.columns = ["id", "Title", "Genre", "Description"]


In [8]:
# Concatenate the training data and test data solutions (if available)
df = pd.concat([train_data, test_data_solution])
df.columns = ["id", "Title", "Genre", "Description"]


In [9]:
print(df.head())

   id                               Title       Genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         Description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\om\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\om\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\om\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Define functions for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation using regular expressions
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text into words
    tokens = word_tokenize(text)
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Rejoin the processed words into a clean text
    return ' '.join(tokens)


In [14]:
nltk.download('omw-1.4')


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\om\AppData\Roaming\nltk_data...


True

In [15]:

# Apply the preprocessing function to the "Description" column
df["Description"] = df["Description"].apply(preprocess_text)

In [16]:
df.head()

Unnamed: 0,id,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,listening conversation doctor parent 10yearold...
1,2,Cupid (1997),thriller,brother sister past incestuous relationship cu...
2,3,"Young, Wild and Wonderful (1980)",adult,bus empty student field trip museum natural hi...
3,4,The Secret Sin (1915),drama,help unemployed father make end meet edith twi...
4,5,The Unrecovered (2007),drama,film title refers unrecovered body ground zero...


In [17]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the number of features
X = tfidf_vectorizer.fit_transform(df['Description'])

In [19]:

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Genre'])

In [22]:
print(y)

[ 8 24  1 ...  1  8  8]


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
model = LogisticRegression(max_iter=1000)  # You can adjust hyperparameters


In [37]:
model.fit(X_train, y_train)


LogisticRegression(max_iter=1000)

In [38]:
y_pred = model.predict(X_test)
class_names = df['Genre'].unique()

In [39]:
print(classification_report(y_test, y_pred, target_names=class_names))


               precision    recall  f1-score   support

       drama        0.48      0.33      0.39       526
    thriller        0.68      0.31      0.42       228
       adult        0.55      0.22      0.31       304
 documentary        0.56      0.11      0.19       211
      comedy        0.00      0.00      0.00       103
       crime        0.56      0.60      0.58      2988
  reality-tv        0.32      0.05      0.09       223
      horror        0.68      0.83      0.75      5185
       sport        0.57      0.77      0.66      5550
   animation        0.42      0.14      0.20       288
      action        0.43      0.06      0.11       147
     fantasy        0.81      0.48      0.60        73
       short        0.00      0.00      0.00       116
      sci-fi        0.67      0.63      0.65       883
       music        0.65      0.48      0.55       300
   adventure        0.64      0.06      0.11       114
   talk-show        0.28      0.06      0.10       114
     west

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5988101277498501
