In [None]:
pip install selenium pandas numpy nltk

In [33]:
import os
import time
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException

In [34]:
def webscraper(url):
    driver = webdriver.Chrome()
    
    try:
        driver.get(url)
        driver.maximize_window()
        time.sleep(2)
        print(driver.title)

        # Clicking "Read More" to load all data
        while True:
            try:
                element = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button')
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                time.sleep(1)
                element.click()
                print("Clicked 'Read More' button.")
                time.sleep(1)
            except NoSuchElementException:
                print("No 'Read More' button found. All data loaded.")
                break
            except (ElementClickInterceptedException, TimeoutException):
                print("Retrying...")
                time.sleep(2)
            except Exception as e:
                print(f"Unexpected error: {e}")
                break

        print("Successfully retrieved all the data.")

        # Initialize a list to store movie data
        movies_data = []
        movies = driver.find_elements(By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li')

        for movie in movies:
            try:
                # Extract the movie name
                name = movie.find_element(By.CSS_SELECTOR, 'h3[class="ipc-title__text"]').text.split(". ", 1)[1]

                # Extract the storyline
                try:
                    story = movie.find_element(By.CSS_SELECTOR, 'div[class="ipc-html-content-inner-div"]').text  # Update this class selector
                except NoSuchElementException:
                    story = "No Storyline Available"

                movies_data.append({
                    "Movie Name": name,
                    "Storyline": story
                })

            except Exception as e:
                print(f"Error processing movie: {e}")

        return movies_data  # Return a list instead of a dictionary

    except Exception as e:
        print(f"Error retrieving movie list: {e}")
        return []  # Return an empty list if an error occurs

    finally:
        driver.quit()

In [35]:
 # Save data to CSV files

def movies_dataset(movies_data):
    # Create a new folder to save the CSV files, if it doesn't already exist
    output_dir = r"D:\Guvi_Project\IMDB Movie Recommendation System Using Storylines\data"

    # Use os.makedirs to create the directory, with 'exist_ok=True' to avoid error if folder already exists
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(movies_data)
    file_name = os.path.join(output_dir, f"IMDb_Movies_story.csv") 
    df.to_csv(file_name, index=False)
        

In [None]:
IMDb_url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&user_rating=1,&adult=include&sort=release_date,desc"

try:
    print(f"URL Processing: {IMDb_url}")
        
    # Call the webscraper function to scrape movie data
    movies_story = webscraper(IMDb_url)

    # Check if data is valid (a non-empty list)
    if movies_story:  
        try:
            # Save the data to CSV
            movies_dataset(movies_story)  # Ensure this function is defined
            print(f"Successfully stored")
        except Exception as dataset_error:
            print(f"Error saving dataset for {IMDb_url}: {dataset_error}")
    else:
        print(f"Skipping {IMDb_url} as no valid data was retrieved.")

except Exception as e:
    print(f"Error processing {IMDb_url}: {e}")

print('Successfully completed processing!')

In [36]:
df = pd.read_csv(r"D:\Guvi_Project\IMDB Movie Recommendation System Using Storylines\data\IMDb_Movies_story.csv")
df.head()

Unnamed: 0,Movie Name,Storyline
0,X Files,In the world there are thousands of mysteries ...
1,The Forest,A young man believes he is the last person ali...
2,The Buildout,A friendship is tested as two women experience...
3,The Goat,Forced to marry one of the elders in her Egypt...
4,An Intruder Among Us,No Storyline Available


Text Preprocessing

In [37]:
# df.drop(columns = ['Movie Name'], inplace = True)
df.drop_duplicates(subset = ['Storyline'], inplace = True)
df.reset_index(drop = True, inplace = True)
df

Unnamed: 0,Movie Name,Storyline
0,X Files,In the world there are thousands of mysteries ...
1,The Forest,A young man believes he is the last person ali...
2,The Buildout,A friendship is tested as two women experience...
3,The Goat,Forced to marry one of the elders in her Egypt...
4,An Intruder Among Us,No Storyline Available
...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...


1. Convert to lowercase

In [38]:
df['cleaned_storyline'] = df['Storyline'].str.lower()
df

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,in the world there are thousands of mysteries ...
1,The Forest,A young man believes he is the last person ali...,a young man believes he is the last person ali...
2,The Buildout,A friendship is tested as two women experience...,a friendship is tested as two women experience...
3,The Goat,Forced to marry one of the elders in her Egypt...,forced to marry one of the elders in her egypt...
4,An Intruder Among Us,No Storyline Available,no storyline available
...,...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...,this documentary exposes the devastating impac...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...,a crew of space-travelers rescues a ship of re...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...,a man and a motorbike who together traveled mo...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...,parents of a 14-year-old prodigy stop at nothi...


2. Punctuation removal

In [39]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
def remove_punctation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [41]:
df['cleaned_storyline'] = df['cleaned_storyline'].apply(remove_punctation)
df

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,in the world there are thousands of mysteries ...
1,The Forest,A young man believes he is the last person ali...,a young man believes he is the last person ali...
2,The Buildout,A friendship is tested as two women experience...,a friendship is tested as two women experience...
3,The Goat,Forced to marry one of the elders in her Egypt...,forced to marry one of the elders in her egypt...
4,An Intruder Among Us,No Storyline Available,no storyline available
...,...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...,this documentary exposes the devastating impac...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...,a crew of spacetravelers rescues a ship of ref...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...,a man and a motorbike who together traveled mo...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...,parents of a 14yearold prodigy stop at nothing...


3. Removal of Stopwords

In [42]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORD = stopwords.words('english')
STOPWORD[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shanc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [43]:
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in STOPWORD])

In [44]:
df['cleaned_storyline'] = df['cleaned_storyline'].apply(remove_stopwords)
df

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,world thousands mysteries still unsolved probl...
1,The Forest,A young man believes he is the last person ali...,young man believes last person alive postapoca...
2,The Buildout,A friendship is tested as two women experience...,friendship tested two women experience somethi...
3,The Goat,Forced to marry one of the elders in her Egypt...,forced marry one elders egyptian village 12yea...
4,An Intruder Among Us,No Storyline Available,storyline available
...,...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...,documentary exposes devastating impact sugar o...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...,crew spacetravelers rescues ship refugees cert...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...,man motorbike together traveled half million k...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...,parents 14yearold prodigy stop nothing rescue ...


4. Removal of Special characters

In [45]:
import re
def remove_spl_chars(text):
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

In [46]:
df['cleaned_storyline'] = df['cleaned_storyline'].apply(remove_spl_chars)
df

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,world thousands mysteries still unsolved probl...
1,The Forest,A young man believes he is the last person ali...,young man believes last person alive postapoca...
2,The Buildout,A friendship is tested as two women experience...,friendship tested two women experience somethi...
3,The Goat,Forced to marry one of the elders in her Egypt...,forced marry one elders egyptian village yearo...
4,An Intruder Among Us,No Storyline Available,storyline available
...,...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...,documentary exposes devastating impact sugar o...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...,crew spacetravelers rescues ship refugees cert...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...,man motorbike together traveled half million k...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...,parents yearold prodigy stop nothing rescue da...


5. Stemming

In [47]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stemming(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

In [48]:
df['cleaned_storyline'] = df['cleaned_storyline'].apply(stemming)
df

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,world thousand mysteri still unsolv problem my...
1,The Forest,A young man believes he is the last person ali...,young man believ last person aliv postapocalyp...
2,The Buildout,A friendship is tested as two women experience...,friendship test two women experi someth strang...
3,The Goat,Forced to marry one of the elders in her Egypt...,forc marri one elder egyptian villag yearold h...
4,An Intruder Among Us,No Storyline Available,storylin avail
...,...,...,...
8731,Spoonful of Poison,This documentary exposes the devastating impac...,documentari expos devast impact sugar oil cons...
8732,StarGazer: The Need of the Many,A crew of space-travelers rescues a ship of re...,crew spacetravel rescu ship refuge certain dea...
8733,Omo nero e Bucefalo in Kurdistan iracheno,A man and a motorbike who together traveled mo...,man motorbik togeth travel half million kilome...
8734,Detroit Ransom,Parents of a 14-year-old prodigy stop at nothi...,parent yearold prodigi stop noth rescu daughte...


6. Tokenization

In [49]:
import nltk
from nltk.tokenize import word_tokenize

# Ensure the tokenizer model is available
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_words(text):
    """Tokenizes the given text into words."""
    return word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shanc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shanc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [50]:
df['cleaned_storyline'] = df['cleaned_storyline'].dropna().astype(str).apply(tokenize_words)
df.head(10)

Unnamed: 0,Movie Name,Storyline,cleaned_storyline
0,X Files,In the world there are thousands of mysteries ...,"[world, thousand, mysteri, still, unsolv, prob..."
1,The Forest,A young man believes he is the last person ali...,"[young, man, believ, last, person, aliv, posta..."
2,The Buildout,A friendship is tested as two women experience...,"[friendship, test, two, women, experi, someth,..."
3,The Goat,Forced to marry one of the elders in her Egypt...,"[forc, marri, one, elder, egyptian, villag, ye..."
4,An Intruder Among Us,No Storyline Available,"[storylin, avail]"
5,V13,"Vienna, 1913, Europe is on the brink of WWI. T...","[vienna, europ, brink, wwi, two, young, men, b..."
6,The Convent,Emilia is sent to a convent where paranormal f...,"[emilia, sent, convent, paranorm, forc, begin,..."
7,Norman's Rare Guitars Documentary,This world-renowned institution is where legen...,"[worldrenown, institut, legend, jam, find, ins..."
8,How to Solve Your Own Murder,"Kate, a troubled young women wakes up in a par...","[kate, troubl, young, women, wake, parallel, d..."
9,Mama's Boy,A controversial affair unfolds when a young ma...,"[controversi, affair, unfold, young, man, glen..."


7. Vectorization

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
data = df['cleaned_storyline'].astype(str).tolist()  # Convert to list of strings

# Fit TF-IDF on the dataset
tfidf = TfidfVectorizer()
df_vectors = tfidf.fit_transform(data)

print(df_vectors.shape) # Print the shape of the resulting sparse matrix

(8736, 20512)


In [52]:
import pickle
# Save the Term Frequency-Inverse Document Frequency model
output_dir = r"D:\Guvi_Project\IMDB Movie Recommendation System Using Storylines\data"
filename = os.path.join(output_dir, 'tfidf.pkl')
with open(filename, 'wb') as f:
    pickle.dump(tfidf, f)

In [54]:
print(tfidf.get_feature_names_out()) # Get the first 10 feature names
df_vectors = df_vectors.toarray()  # Convert sparse matrix to dense array
print(df_vectors)

['aacharya' 'aadhi' 'aadi' ... 'zy' 'zylah' 'zyprec']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [55]:
# Save the encoded data to the output directory

df_vector_csv = pd.DataFrame(df_vectors, columns=tfidf.get_feature_names_out())
vector_filepath = os.path.join(output_dir, "vector_data.csv")
df_vector_csv.to_csv(vector_filepath, index=False)

Sample 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

user_input = ["This world-renowned institution is where legends jam, and find inspiration. This documentary chronicles the shop's unique spirit, as Norman Harris, its beloved founder, faces retirement, marking a turning point for this cultural haven"]

# Transform user input
user_vector = tfidf.transform(user_input)

# Compute similaritypo
similarity_scores = cosine_similarity(df_vectors, user_vector)

# Get the top 10 similar indices
nearest_idx = similarity_scores.flatten().argsort()[-10:][::-1]
recommendations = df.iloc[nearest_idx]

print(recommendations[['Movie Name', 'Storyline']])


                             Movie Name  \
7     Norman's Rare Guitars Documentary   
1395          The Doll Under the Stairs   
5026          Khelaghar Bandhte Legechi   
4454                  Music Shop Murthy   
1182                     Doctor Cerebro   
3406                               Park   
1649                      Saint-Exupéry   
2067    Weirdo: The Story of Five Eight   
3908               A Song from the Dark   
6768                            Kalikot   

                                              Storyline  
7     This world-renowned institution is where legen...  
1395  Norman has always had a fascination with dolls...  
5026  Founder of sweet shop Khirodasundari imparts w...  
4454  Murthy, who has been fond of music since child...  
1182  A group of rebels in Neo-Mendoza in 2037 embar...  
3406  A couple runs a DTH shop. After getting engage...  
1649  When Airmail's best pilot Henri Guillaumet dis...  
2067  After being institutionalized for schizophreni...  
3908