In [1]:
# Import the libraries needed for our task
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [2]:
# Load our datasets: training and testing data
trainset = pd.read_csv("train_data.csv", delimiter=":::", engine='python', header=None, names=["id", "moviename", "genre", "plot"])
testset = pd.read_csv("test_data.csv", delimiter=":::", engine='python', header=None, names=["id", "moviename", "plot"])


In [5]:
trainset_head = trainset.head()
print(trainset_head)

   id                           moviename       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                                plot  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


In [6]:
# Get the shape of our training dataset
trainset_shape = trainset.shape
print("\nShape of training dataset (rows, columns):", trainset_shape)



Shape of training dataset (rows, columns): (54214, 4)


In [7]:
# Get concise summary of our training dataset
trainset_info = trainset.info()
print("\nInformation about training dataset:")
print(trainset_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         54214 non-null  int64 
 1   moviename  54214 non-null  object
 2   genre      54214 non-null  object
 3   plot       54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB

Information about training dataset:
None


In [8]:
# Get descriptive statistics of our training dataset
trainset_describe = trainset.describe()
print("\nDescriptive statistics of training dataset:")
print(trainset_describe)



Descriptive statistics of training dataset:
                 id
count  54214.000000
mean   27107.500000
std    15650.378084
min        1.000000
25%    13554.250000
50%    27107.500000
75%    40660.750000
max    54214.000000


In [9]:
#Display the first few rows of our testing dataset
testset_head = testset.head()
print("\nFirst few rows of testing dataset:")
print(testset_head)


First few rows of testing dataset:
   id                      moviename  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                                plot  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  


In [10]:
# Get the shape of our testing dataset
testset_shape = testset.shape
print("\nShape of testing dataset (rows, columns):", testset_shape)




Shape of testing dataset (rows, columns): (54200, 3)


In [11]:
#Get concise summary of our testing dataset
testset_info = testset.info()
print("\nInformation about testing dataset:")
print(testset_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54200 entries, 0 to 54199
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         54200 non-null  int64 
 1   moviename  54200 non-null  object
 2   plot       54200 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB

Information about testing dataset:
None


In [12]:
# Get descriptive statistics of our testing dataset
testset_describe = testset.describe()
print("\nDescriptive statistics of testing dataset:")
print(testset_describe)



Descriptive statistics of testing dataset:
                 id
count  54200.000000
mean   27100.500000
std    15646.336632
min        1.000000
25%    13550.750000
50%    27100.500000
75%    40650.250000
max    54200.000000


In [13]:
# Prepare our text data for analysis using TF-IDF method
# This helps us understand which words are important in our plots
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(trainset['plot'])  # Extract features from training plot descriptions
y_train = trainset['genre']  # Identify the genres for our training data


In [14]:
# Normalize our training data to ensure all features have the same scale
scaler = StandardScaler(with_mean=False)  # Scale without changing the mean to avoid sparse matrix issues
X_train_scaled = scaler.fit_transform(X_train)  # Scale our training features


In [16]:
# Train our classifier, a logistic regression model
# This model learns patterns in our training data to predict genres
classifier = LogisticRegression(max_iter=5000)
classifier.fit(X_train_scaled, y_train)  # Fit the model to our scaled training data



In [17]:
# Prepare our test data for prediction using the same TF-IDF vectorizer and scaler
X_test = vectorizer.transform(testset['plot'])  # Transform test plot descriptions into TF-IDF vectors
X_test_scaled = scaler.transform(X_test)  # Scale the test data using the same scaler fitted on the training data


In [18]:
# Predict genres for the test set plots
predicted_genres = classifier.predict(X_test_scaled)



In [19]:
# Display the predicted genres
print("\nPredicted genres:")
print(predicted_genres)



Predicted genres:
[' romance ' ' drama ' ' documentary ' ... ' comedy ' ' short ' ' short ']
