In [None]:
import csv
import spacy
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split

# Download stopwords if not already downloaded
#nltk.download('stopwords')

def preprocess_text(text):
    # Perform text preprocessing steps
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'\W+', ' ', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    return processed_text

nlp = spacy.load("en_core_web_sm")

def extract_movie_director(review_text):
    doc = nlp(review_text)
    director_name = None
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            director_name = ent.text
            break
    return director_name

def is_review_or_mention(review_text, movie_title):
    # Check if the movie title is mentioned in the review text
    if movie_title.lower() in review_text.lower():
        # Use SVM-based classifier to classify as review or mention
        processed_text = preprocess_text(review_text)
        class_label = classifier.predict(vectorizer.transform([processed_text]))[0]
        if class_label == 1:
            return "Review"
        else:
            return "Mention"
    else:
        return None

def train_classifier(dataset):
    # Extract review text and labels from the dataset
    texts = [review['Text'] for review in dataset]
    labels = [int(bool(re.search(r"\brating|review|score|opinion\b", review['Text'], re.I))) for review in dataset]

    # Vectorize the review texts using TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train an SVM classifier
    classifier = svm.SVC()
    classifier.fit(X_train, y_train)

    # Evaluate classifier performance on the testing set
    accuracy = classifier.score(X_test, y_test)
    print("Classifier Accuracy:", accuracy)

    return classifier, vectorizer

def retrieve_and_extract_movies(dataset, movie_title):
    movie_reviews = []
    director_name = None
    review_ids = []
    mention_ids = []
    total_score = 0
    num_reviews = 0

    for review in dataset:
        review_text = review['Text']
        if movie_title.lower() in review_text.lower():
            movie_reviews.append(review)
            review_ids.append(review['Id'])
            total_score += float(review['Rating'])
            num_reviews += 1
            
            if director_name is None:
                director_name = extract_movie_director(review_text)
            
            mention = is_review_or_mention(review_text, movie_title)
            if mention == "Mention":
                mention_ids.append(review['Id'])

    if num_reviews > 0:
        average_score = total_score / num_reviews
    else:
        average_score = 0

    return movie_reviews, director_name, review_ids, mention_ids, average_score


dataset_path = "C:/Users/samue/Downloads/data_assessment_2/data_for_info_retriev_extract/Dataset_IMDB.csv"

movies_of_interest = ['The Lion King', 'Star Wars', 'Starship Troopers']

# Load dataset from CSV file
dataset = []
with open(dataset_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        dataset.append(row)

# Train the classifier
classifier, vectorizer = train_classifier(dataset)

for movie_title in movies_of_interest:
    movie_reviews, director_name, review_ids, mention_ids, average_score = retrieve_and_extract_movies(dataset, movie_title)

    print("Movie:", movie_title)
    print("Director:", director_name)
    print("Average Score:", average_score)
    print("Review IDs:", review_ids)
    print("Mention IDs:", mention_ids)
    print()



Classifier Accuracy: 0.8622754491017964
Movie: The Lion King
Director: Julie Taymor
Average Score: 0.6577272727272726
Review IDs: ['25950', '2727', '2236', '12429', '2262', '3098', '4464', '8264', '5513', '9778', '2758', '3163', '12950', '9837', '3768', '2661', '9751', '2261', '7218', '5691', '24533', '12936', '18866', '23805', '2670', '7836', '9703', '3532', '4981', '15087', '23535', '10023', '13316', '22231', '3558', '3568', '3792', '5790', '8103', '8287', '9609', '18890', '3535', '5369']
Mention IDs: ['2236', '12429', '2262', '3098', '4464', '9778', '3163', '12950', '9837', '3768', '9751', '2261', '15087', '23535', '10023', '13316', '22231', '3792', '8287', '9609', '18890']

Movie: Star Wars
Director: John Waters
Average Score: 0.6225641025641026
Review IDs: ['26895', '22584', '24561', '18485', '20185', '4550', '7309', '12547', '2985', '2582', '12057', '3785', '5574', '3023', '2035', '4125', '11641', '4634', '7065', '3742', '7015', '7096', '7624', '11025', '5616', '6782', '6949', '7

In [24]:
#Text to speech
import pyttsx3

# Initialize the text-to-speech engine
engine = pyttsx3.init()

# Define a function to generate the voice message
def generate_voice_message(movie_title, director, num_reviews, num_mentions, avg_score):
    # Define the message template
    message = f"You have required information about {movie_title} movie, the director of this movie is {director}, there is a total of {num_reviews} reviews of this movie in the database, there are {num_mentions} mentions of this movie in other movie reviews. The average review score for this movie is {avg_score}, which makes it a {'BAD' if avg_score >= 6 else 'GOOD'} recommendation to watch."
    # Set the properties of the voice message
    engine.setProperty('rate', 150) # Set the speaking rate
    engine.setProperty('volume', 1) # Set the volume
    # Generate the voice message
    engine.say(message)
    engine.runAndWait()

# Call the function to generate the voice message
generate_voice_message("Starship Troopers", "Paul Verhoeven", 17, 12, 0.59)


In [10]:
#F1 score of the system to extract IDs of reviews Starship Troopers
ground_truth = ['9654', '9671', '9673', '17219']
id_retrieve = ['17219', '26255', '10859', '12224', '11808', '9654', '11987', '9671', '22372', '11595', '12012', '14580', '11468', '13629', '9915', '13316', '25567', '9673']

# Calculate true positives, false positives, and false negatives
true_positives = len(set(ground_truth) & set(id_retrieve))
false_positives = len(set(id_retrieve) - set(ground_truth))
false_negatives = len(set(ground_truth) - set(id_retrieve))

# Calculate precision
precision = true_positives / (true_positives + false_positives)

# Calculate recall
recall = true_positives / (true_positives + false_negatives)

# Calculate F1 score
f1 = 2 * ((precision * recall) / (precision + recall))

print("Precision IDs of reviews Starship Troopers:", precision)
print("Recall IDs of reviews Starship Troopers:", recall)
print("F1 score: IDs of reviews Starship Troopers", f1)

#F1 score of the system to extract IDs of mentions Starship Troopers
ground_truth = [9915, 10859, 11468, 11595, 11808, 11987, 12012, 12224, 13316, 13629, 14580, 22372, 25567, 26255]
id_retrieval = ['10859', '11808', '9654', '11987', '11595', '12012', '14580', '11468', '13629', '9915', '13316', '9673']

# Convert the ID retrieval list to integers
id_retrieval = list(map(int, id_retrieval))

# Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
tp = len(set(ground_truth).intersection(id_retrieval))
fp = len(id_retrieval) - tp
fn = len(ground_truth) - tp

# Calculate precision, recall, and F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision IDs of mentions Starship Troopers:", precision)
print("Recall IDs of mentions Starship Troopers:", recall)
print("F1-score IDs of mentions Starship Troopers:", f1_score)


Precision IDs of reviews Starship Troopers: 0.2222222222222222
Recall IDs of reviews Starship Troopers: 1.0
F1 score: IDs of reviews Starship Troopers 0.3636363636363636
Precision IDs of mentions Starship Troopers: 0.8333333333333334
Recall IDs of mentions Starship Troopers: 0.7142857142857143
F1-score IDs of mentions Starship Troopers: 0.7692307692307692


In [12]:
#F1 score of the system to extract IDs of reviews Star Wars
ground_truth = [5049, 5055, 5100, 6782, 6829, 6949, 6979, 7065, 7114, 18265, 18433, 18485, 18767]
id_retrieval = ['26895', '22584', '24561', '18485', '20185', '4550', '7309', '12547', '2985', '2582', '12057', '3785', '5574', '3023', '2035', '4125', '11641', '4634', '7065', '3742',
                '7015', '7096', '7624', '11025', '5616', '6782', '6949', '7676', '16619', '24553', '2483', '25020', '5542', '18433', '18767', '3040', '7114', '7533', '6829', '6979',
                '7197', '12890', '15031', '21544', '24551', '24563', '12602', '27812', '28755', '28772', '6417', '9638', '10230', '11468', '13536', '13636', '21610', '6383', '18265', 
                '18779', '22991', '25302', '25766', '3852', '6348', '7163', '8245', '10897', '20096', '28787', '5100', '6971', '7536', '29716', '5049', '5055', '5592', '7212']

# Convert the ID retrieval list to integers
id_retrieval = list(map(int, id_retrieval))

# Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
tp = len(set(ground_truth).intersection(id_retrieval))
fp = len(id_retrieval) - tp
fn = len(ground_truth) - tp

# Calculate precision, recall, and F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision IDs of reviews Star Wars:", precision)
print("Recall IDs of reviews Star Wars:", recall)
print("F1-score: IDs of reviews Star Wars", f1_score)


#F1 score of the system to extract IDs of mentions Star Wars
ground_truth = [2035, 2483, 2582, 2985, 3023, 3040, 3742, 3785, 3852, 4125, 4550, 4634, 5542, 5574, 5592, 5616, 6348, 6383, 6417, 6971, 7015, 7096, 7163, 7197, 7212, 7309, 7533, 7536,
                7624, 7676, 8245, 9638, 10230, 10897, 11025, 11468, 11641, 12057, 12547, 12602, 12890, 13536, 13636, 15031, 16619, 18779, 20096, 20185, 21544, 21610, 22584, 22991,
                24551, 24553, 24561, 24563, 25020, 25302, 25766, 26895, 27812, 28755, 28772, 28787, 29716]
id_retrieval = ['4550', '7309', '12547', '2582', '12057', '3785', '3023', '2035', '4125', '11641', '7065', '7015', '7096', '7624', '11025', '5616', '7676', '3040', '12890', '15031', 
                '21544', '24551', '24563', '12602', '6417', '9638', '10230', '11468', '13536', '13636', '21610', '18265', '18779', '22991', '25302', '25766', '3852', '10897', '20096', '7212']

# Convert the ID retrieval list to integers
id_retrieval = list(map(int, id_retrieval))

# Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
tp = len(set(ground_truth).intersection(id_retrieval))
fp = len(id_retrieval) - tp
fn = len(ground_truth) - tp

# Calculate precision, recall, and F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision IDs of mentions Star Wars:", precision)
print("Recall IDs of mentions Star Wars:", recall)
print("F1-score IDs of mentions Star Wars:", f1_score)


Precision IDs of reviews Star Wars: 0.16666666666666666
Recall IDs of reviews Star Wars: 1.0
F1-score: IDs of reviews Star Wars 0.2857142857142857
Precision IDs of mentions Star Wars: 0.95
Recall IDs of mentions Star Wars: 0.5846153846153846
F1-score IDs of mentions Star Wars: 0.7238095238095238


In [13]:
#F1 score of the system to extract IDs of reviews The Lion King
ground_truth = [2661, 2670, 3558, 15087]
id_retrieval = ['25950', '2727', '2236', '12429', '2262', '3098', '4464', '8264', '5513', '9778', '2758', '3163', '12950', '9837', '3768', '2661', '9751', '2261', '7218', '5691', '24533',
                '12936', '18866', '23805', '2670', '7836', '9703', '3532', '4981', '15087', '23535', '10023', '13316', '22231', '3558', '3568', '3792', '5790', '8103', '8287', '9609', 
                '18890', '3535', '5369']

# Convert the ID retrieval list to integers
id_retrieval = list(map(int, id_retrieval))

# Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
tp = len(set(ground_truth).intersection(id_retrieval))
fp = len(id_retrieval) - tp
fn = len(ground_truth) - tp

# Calculate precision, recall, and F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision IDs of reviews The Lion King:", precision)
print("Recall IDs of reviews The Lion King:", recall)
print("F1-score IDs of reviews The Lion King:", f1_score)



#F1 score of the system to extract IDs of mentions The Lion King
ground_truth = [2236, 2261, 2262, 2727, 2758, 3098, 3163, 3532, 3535, 3568, 3768, 3792, 4464, 4981, 5369, 5513, 5691, 5790, 7218, 7836, 8103, 8264, 8287, 9609, 9703, 9751, 9778, 9837, 10023,
                12429, 12936, 12950, 13316, 18866, 18890, 22231, 23535, 23805, 24533, 25950]
id_retrieval = ['2236', '12429', '2262', '3098', '4464', '9778', '3163', '12950', '9837', '3768', '9751', '2261', '15087', '23535', '10023', '13316', '22231', '3792', '8287', '9609', '18890']

# Convert the ID retrieval list to integers
id_retrieval = list(map(int, id_retrieval))

# Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
tp = len(set(ground_truth).intersection(id_retrieval))
fp = len(id_retrieval) - tp
fn = len(ground_truth) - tp

# Calculate precision, recall, and F1-score
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print("Precision IDs of mentions The Lion King:", precision)
print("Recall IDs of mentions The Lion King:", recall)
print("F1-score IDs of mentions The Lion King:", f1_score)



Precision IDs of reviews The Lion King: 0.09090909090909091
Recall IDs of reviews The Lion King: 1.0
F1-score IDs of reviews The Lion King: 0.16666666666666669
Precision IDs of mentions The Lion King: 0.9523809523809523
Recall IDs of mentions The Lion King: 0.5
F1-score IDs of mentions The Lion King: 0.6557377049180327


In [23]:
import pandas as pd

# Define the result data as a dictionary
result_data = {
    'Movie': ['Starship Troopers', 'Star Wars', 'The Lion King'],
    'Precision IDs of Reviews (SVM)': [0.222, 0.167, 0.091],
    'Recall IDs of Reviews (SVM)': [1.0, 1.0, 1.0],
    'F1-score IDs of Reviews (SVM)': [0.364, 0.286, 0.167],
    'Precision IDs of Mentions (SVM)': [0.833, 0.95, 0.952],
    'Recall IDs of Mentions (SVM)': [0.714, 0.585, 0.5],
    'F1-score IDs of Mentions (SVM)': [0.769, 0.724, 0.656],
    'Precision IDs of Reviews (Rule-based)': [0.222, 0.275, 0.25],
    'Recall IDs of Reviews (Rule-based)': [1.0, 0.84, 0.041],
    'F1-score IDs of Reviews (Rule-based)': [0.364, 0.415, 0.070],
    'Precision IDs of Mentions (Rule-based)': [0.833, 0.947, 0.425],
    'Recall IDs of Mentions (Rule-based)': [0.714, 0.553, 0.85],
    'F1-score IDs of Mentions (Rule-based)': [0.769, 0.698, 0.567]
}

# Create a DataFrame from the result data
df = pd.DataFrame(result_data)

# Display the DataFrame as a table
df


Unnamed: 0,Movie,Precision IDs of Reviews (SVM),Recall IDs of Reviews (SVM),F1-score IDs of Reviews (SVM),Precision IDs of Mentions (SVM),Recall IDs of Mentions (SVM),F1-score IDs of Mentions (SVM),Precision IDs of Reviews (Rule-based),Recall IDs of Reviews (Rule-based),F1-score IDs of Reviews (Rule-based),Precision IDs of Mentions (Rule-based),Recall IDs of Mentions (Rule-based),F1-score IDs of Mentions (Rule-based)
0,Starship Troopers,0.222,1.0,0.364,0.833,0.714,0.769,0.222,1.0,0.364,0.833,0.714,0.769
1,Star Wars,0.167,1.0,0.286,0.95,0.585,0.724,0.275,0.84,0.415,0.947,0.553,0.698
2,The Lion King,0.091,1.0,0.167,0.952,0.5,0.656,0.25,0.041,0.07,0.425,0.85,0.567
