In [18]:
import pandas as pd
import re

# Parse the raw training file into a DataFrame and cache it as CSV.
#
# Each record is expected to look like:
#   <id> ::: <title> (<4-digit year>) ::: <genre> ::: <plot summary>
# Lines whose title lacks a plain 4-digit year in parentheses do not match and
# are left out of the DataFrame; they are counted so the loss is visible.
pattern = r'^(\d+) ::: (.+) \((\d{4})\) ::: (.+) ::: (.+)$'
record_re = re.compile(pattern)  # compiled once instead of once per line

# Parsed rows, in file order
data = []

# Non-blank lines that did not match the expected record layout
skipped = 0

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (titles contain non-ASCII characters).
with open('/content/drive/MyDrive/train_data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue

        match = record_re.match(line)
        if match is None:
            skipped += 1
            continue

        movie_id, title, year, genre, plot = match.groups()
        data.append([int(movie_id), title, int(year), genre, plot])

# Create a DataFrame from the parsed data
df = pd.DataFrame(data, columns=['Index', 'Title', 'Year', 'Genre', 'Plot Summary'])

# Report how much of the file survived parsing before showing the frame
print(f"Parsed {len(df)} records; skipped {skipped} lines that did not match the expected format")
print(df)

# Save the DataFrame to a CSV file so later cells can reload it
df.to_csv('cleaned_movies_data.csv', index=False)


       Index                                Title  Year        Genre  \
0          1                Oscar et la dame rose  2009        drama   
1          2                                Cupid  1997     thriller   
2          3            Young, Wild and Wonderful  1980        adult   
3          4                       The Secret Sin  1915        drama   
4          5                      The Unrecovered  2007        drama   
...      ...                                  ...   ...          ...   
49862  54209                Izpiti po nikoe vreme  1974       family   
49863  54210                             "Bonino"  1953       comedy   
49864  54212   Ronald Goedemondt: Ze bestaan echt  2008  documentary   
49865  54213                    Make Your Own Bed  1944       comedy   
49866  54214  Nature's Fury: Storm of the Century  2006      history   

                                            Plot Summary  
0      Listening in to a conversation between his doc...  
1      A brother 

In [19]:
# Summary statistics (count, mean, spread, quartiles) for the parsed frame's numeric columns.
df.describe()

Unnamed: 0,Index,Year
count,49867.0,49867.0
mean,27126.276455,1998.7668
std,15649.153347,22.914058
min,1.0,1894.0
25%,13576.5,1995.0
50%,27128.0,2008.0
75%,40674.5,2013.0
max,54214.0,2022.0


In [21]:
# Column dtypes and non-null counts, to check that parsing left no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49867 entries, 0 to 49866
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Index         49867 non-null  int64 
 1   Title         49867 non-null  object
 2   Year          49867 non-null  int64 
 3   Genre         49867 non-null  object
 4   Plot Summary  49867 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.9+ MB


In [22]:
import re  # Regular expressions for text cleaning
import nltk  # Natural Language Toolkit for text processing
from nltk.corpus import stopwords  # Stopwords list
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF Vectorizer to convert text to numerical features

# Download stopwords if not already present
# NOTE(review): nothing in the visible cells reads this NLTK corpus -- the
# TfidfVectorizer in this cell is configured with stop_words='english', i.e.
# scikit-learn's built-in list -- so this download looks unnecessary; confirm
# before removing.
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    """Normalise a plot summary before it is vectorised.

    Every non-word character (anything that is not a letter, a digit or an
    underscore) is replaced by a space, the text is lower-cased, and the
    remaining words are separated by exactly one space with none at either
    end. Digits and underscores are therefore kept.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised, lower-case string.
    """
    # Punctuation and every kind of whitespace become plain spaces here, so
    # spaces are the only separators left after this step.
    spaced = re.sub(r'\W', ' ', text)

    # split() discards empty pieces, which collapses runs of spaces and trims
    # both ends in one go.
    return ' '.join(spaced.lower().split())

# joblib is needed here to persist the fitted vectorizer (see below)
import joblib

# Apply the cleaning function to the 'Plot Summary' column in your dataset
df['Cleaned Plot'] = df['Plot Summary'].apply(clean_text)

# Display the first few rows of the cleaned dataset to check if the cleaning worked
print(df[['Plot Summary', 'Cleaned Plot']].head())

# Initialize the TF-IDF Vectorizer
# - max_features: Limit to top 5000 words to reduce dimensionality
# - stop_words: 'english' selects scikit-learn's built-in English stop word list
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the TF-IDF vectorizer on the cleaned plot text.
# The result is deliberately kept as a sparse matrix: densifying
# ~50k rows x 5000 columns would take about 2 GB as float64, and the
# estimators used later accept sparse input directly.
X = tfidf.fit_transform(df['Cleaned Plot'])

# Persist the fitted vectorizer. The training cell loads
# 'tfidf_vectorizer.joblib', so without this step a fresh
# "Restart & Run All" fails (or silently picks up a stale file).
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

# Check the shape of the resulting feature matrix (X)
# - The number of rows corresponds to the number of movies (data points)
# - The number of columns corresponds to the number of features (at most 5000 terms)
print(f"Shape of feature matrix: {X.shape}")

# Optional: inspect the first few feature vectors; only these 5 rows are densified
print(X[:5].toarray())  # Print the first 5 rows of the matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        Plot Summary  \
0  Listening in to a conversation between his doc...   
1  A brother and sister with a past incestuous re...   
2  As the bus empties the students for their fiel...   
3  To help their unemployed father make ends meet...   
4  The film's title refers not only to the un-rec...   

                                        Cleaned Plot  
0  listening in to a conversation between his doc...  
1  a brother and sister with a past incestuous re...  
2  as the bus empties the students for their fiel...  
3  to help their unemployed father make ends meet...  
4  the film s title refers not only to the un rec...  
Shape of feature matrix: (49867, 5000)
[[0.         0.13315529 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [

In [25]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Learn the genre -> integer mapping. fit() returns the encoder itself, so
# construction and fitting can be chained into one expression.
label_encoder = LabelEncoder().fit(df['Genre'])

# Write the fitted encoder to disk; the training cell reloads it from this path.
joblib.dump(label_encoder, 'label_encoder.joblib')


['label_encoder.joblib']

In [26]:
from sklearn.preprocessing import LabelEncoder
# Build a fresh encoder from the genre labels present in the frame
label_encoder = LabelEncoder().fit(df['Genre'])

# Store each row's integer genre code alongside its text label
df['Encoded Genre'] = label_encoder.transform(df['Genre'])


In [27]:
# Show each distinct (genre, code) pair once to read off the label encoding.
print(df[['Genre', 'Encoded Genre']].drop_duplicates())


            Genre  Encoded Genre
0           drama              8
1        thriller             24
2           adult              1
5     documentary              7
6          comedy              5
7           crime              6
8      reality-tv             18
17          sport             22
18      animation              3
23         action              0
24          short             21
25         sci-fi             20
28         horror             13
32          music             14
46      talk-show             23
69        western             26
75         family              9
76        mystery             16
104     adventure              2
119       history             12
164          news             17
252       romance             19
298     biography              4
618       fantasy             10
672     game-show             11
847       musical             15
1074          war             25


In [35]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import joblib

# Load the preprocessed data
# This rebinds `df` to the frame read back from disk. Assuming the file is the
# one written by the parsing cell, it predates the 'Cleaned Plot' and
# 'Encoded Genre' columns, so those are absent here and 'Cleaned Plot' is
# rebuilt further down in this cell.
df = pd.read_csv('cleaned_movies_data.csv')

# Function to clean text
def clean_text(text):
    """Lower-case `text` and reduce it to space-separated word tokens.

    Non-word characters (everything except letters, digits and underscores)
    are turned into spaces, whitespace runs are collapsed to a single space,
    and leading/trailing spaces are removed.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised string.
    """
    lowered = re.sub(r'\W', ' ', text).lower()
    return re.sub(r'\s+', ' ', lowered).strip()

# Number of leading rows used for the train/validation split. Kept in one
# place so the slices and the status message below cannot drift apart.
TRAIN_SUBSET_SIZE = 30000

# Rebuild the cleaned text column on the reloaded frame
df['Cleaned Plot'] = df['Plot Summary'].apply(clean_text)

# Load the TF-IDF Vectorizer and Label Encoder
# NOTE: joblib files are pickle-based; only load artefacts produced by this
# notebook (or another trusted source).
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')
label_encoder = joblib.load('label_encoder.joblib')

# Transform the cleaned plot summaries into TF-IDF features (kept sparse)
X = tfidf_vectorizer.transform(df['Cleaned Plot'])

# Encode the genre labels
y = label_encoder.transform(df['Genre'])

# Restrict training to the first TRAIN_SUBSET_SIZE rows to keep fitting time manageable
X_subset = X[:TRAIN_SUBSET_SIZE]
y_subset = y[:TRAIN_SUBSET_SIZE]

# Split the subset into training and testing parts (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=42)

# Initialize the SVM classifier (you can modify the parameters as needed)
svm = SVC(kernel='linear', C=1.0)  # Using a linear kernel here as an example

# Train the SVM model on the training part of the subset
svm.fit(X_train, y_train)

# Check if the model is trained successfully
print(f"Model trained on the first {TRAIN_SUBSET_SIZE} datasets to consume time(edit the code to increase accuracy)")

# Save the trained SVM model
joblib.dump(svm, 'svm_model.joblib')


Model trained on the first 30000 datasets to consume time(edit the code to increase accuracy)


['svm_model.joblib']

In [36]:
from sklearn.metrics import accuracy_score

# Evaluate on rows 30000-31999, which lie outside the first 30000 rows the
# model was fitted from. This rebinds X_test / y_test, replacing the split
# produced by train_test_split in the training cell.
X_test, y_test = X[30000:32000], y[30000:32000]

# Predict genre codes for the evaluation rows with the trained SVM
y_pred = svm.predict(X_test)

# Fraction of evaluation rows whose predicted code equals the true code
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage
print(f"Accuracy of the model on test set: {accuracy * 100:.2f}%")


Accuracy of the model on test set: 56.55%


In [39]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder on the genre column and keep the encoded labels.
# fit() returns the encoder, so the transform can be chained onto it.
le = LabelEncoder()
y_encoded = le.fit(df['Genre']).transform(df['Genre'])

# Define a function to predict the genre based on a new plot summary
def predict_genre(summary):
    """Predict the genre name for a single plot summary.

    The summary is normalised with ``clean_text``, vectorised with
    ``tfidf_vectorizer`` and decoded with ``label_encoder``. Those are the
    objects that produced the features and labels ``svm`` was trained on, so
    the prediction is guaranteed to use the same feature space and label
    mapping as training rather than a separately fitted copy.

    Args:
        summary: Raw plot summary text.

    Returns:
        The predicted genre label for the summary.
    """
    # Clean the input summary using the same cleaning function as training
    cleaned_summary = clean_text(summary)

    # Vectorise with the vectorizer the SVM was trained against. The result
    # stays sparse, matching the sparse matrix the model was fitted on.
    summary_tfidf = tfidf_vectorizer.transform([cleaned_summary])

    # Predict the encoded genre for the single-row feature matrix
    predicted_genre_encoded = svm.predict(summary_tfidf)

    # Decode with the encoder that produced the training labels
    predicted_genre = label_encoder.inverse_transform(predicted_genre_encoded)

    return predicted_genre[0]

# Spot check with a longer synopsis. Adjacent string literals are joined by
# the parser, so this is one string with a single space between sentences.
sample_summary = (
    "When big business meets big charity, the partnership can be very profitable for both. "
    "At the same time this modern-day alliance presents interesting ethical questions about the nature of corporate altruism. "
    "Philanthropy Inc. tells the story of this new phenomenon using three case studies involving three of the world's largest corporations and three of the world's largest charities. "
    "Each story is an example of how the partnership of business and charity can work, and sometimes not work. "
    "Philanthropy, Inc., explores the ethical questions by taking them to the people who know this issue best. "
    "We speak to the leaders at the forefront of the movement and examine high profile case studies involving some of North America's corporate giants like Coca-Cola, Wal-Mart and General Mills."
)

# Run the synopsis through the prediction helper and show the result
predicted_genre = predict_genre(sample_summary)
print(f"Predicted Genre: {predicted_genre}")


Predicted Genre: documentary


In [40]:
# Test with a sample summary
# Second spot check: a short one-sentence synopsis, rebinding the same
# `sample_summary` / `predicted_genre` names used by the previous cell.
sample_summary = "A group of teenagers discover a mysterious object in the woods, leading to strange occurrences and a fight for survival."

# Get the predicted genre
predicted_genre = predict_genre(sample_summary)
print(f"Predicted Genre: {predicted_genre}")


Predicted Genre: horror


In [41]:
import joblib

# Save the trained model to a file
# NOTE(review): the training cell already dumps `svm` to this same path right
# after fitting, so this write looks redundant unless `svm` was changed in
# between -- confirm before removing.
filename = 'svm_model.joblib'
joblib.dump(svm, filename)

# Confirm where the model was written
print(f"Model saved as {filename}")

Model saved as svm_model.joblib
