In [16]:
import pandas as pd
import re
# Load the Netflix and IMDb datasets
netflix_data_path = 'C:/Users/Prachi/Downloads/Netflix_data.csv'
imdb_data_path = 'C:/Users/Prachi/Downloads/IMDB Dataset.csv'
import pandas as pd

# Load the datasets
netflix_data = pd.read_csv(netflix_data_path)
imdb_dataset = pd.read_csv(imdb_data_path)

# Display the first few rows of each dataset to inspect the columns
print("Netflix Data Columns and Sample:")
print(netflix_data.head())
print(netflix_data.columns)

print("\nIMDb Data Columns and Sample:")
print(imdb_dataset.head())
print(imdb_dataset.columns)

# Insert here the modified script from the previous example if columns are confirmed
# Define a function to match titles and analyze sentiments
def analyze_sentiment(title, imdb_dataset):
    # Use regular expressions for case-insensitive search in reviews
    pattern = re.compile(re.escape(title), re.IGNORECASE)
    matched_reviews = imdb_dataset[imdb_dataset['Review'].apply(lambda x: bool(pattern.search(x)))]
    
    # Count sentiments if any reviews matched
    if not matched_reviews.empty:
        positive_count = (matched_reviews['sentiment'] == 'positive').sum()
        negative_count = (matched_reviews['sentiment'] == 'negative').sum()
        
        return positive_count - negative_count
    else:
        # Return None if no reviews found
        return None

# Apply the function and update Netflix data after confirming column names
netflix_data['Dominant_Sentiment'] = netflix_data['title'].apply(analyze_sentiment, imdb_dataset=imdb_dataset)
netflix_data.to_csv('C:/Users/Prachi/Downloads/Updated_Netflix_data.csv', index=False)



Netflix Data Columns and Sample:
         id                          title   type  \
0  tm1128292  Michael Che: Shame the Devil  MOVIE   
1   tm869475                      Pagglait  MOVIE   
2  tm1179579                 In Good Hands  MOVIE   
3  tm1040733       In Our Mothers' Gardens  MOVIE   
4  tm1080114                 Hostage House  MOVIE   

                                         description  release year  \
0  Michael Che returns to the stage in Oakland an...          2021   
1  Widowed soon after marriage, a young woman gra...          2021   
2  Diagnosed with a terminal illness, a single mo...          2022   
3  In Our Mothers' Gardens celebrates the strengt...          2021   
4  A realtor and her daughter are taken hostage b...          2021   

  a certificate  runtime                                     jonra country  \
0           NaN       58                                ['comedy']      []   
1           NaN      114                       ['comedy', 'drama']  ['I

In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Drop any rows where 'Dominant_Sentiment' might be NaN after mapping if any
netflix_data.dropna(subset=['Dominant_Sentiment'], inplace=True)

# Calculate the correlation matrix
correlation_matrix = netflix_data[['imdb_scores', 'imdb_votes', 'tmdb_pop', 'tmdb_scores', 'Sentiments']].corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

KeyError: "['imdb_score', 'imdb_votes', 'tmdb_pop', 'tmdb_score'] not in index"

In [5]:
# Attempt to reload the Netflix data with better understanding of delimiter and header row
# Also reloading the IMDb data with clearer delimiter identification

# Inspecting few lines to determine the correct delimiter
with open(netflix_data_path, 'r') as file:
    sample_lines = [next(file) for x in range(5)]

sample_lines


['id ,title,type,description,release year,a certificate,runtime,jonra,country,season,imdb id,imdb score,imdb votes,tmdb popularity,tmdb score\n',
 'tm1128292,Michael Che: Shame the Devil,MOVIE,"Michael Che returns to the stage in Oakland and tackles American patriotism, Black leadership, jealous exes, loose bears, mental health and more.",2021,,58,[\'comedy\'],[],,tt15829666,6.4,1091,3.478,7.2\n',
 'tm869475,Pagglait,MOVIE,"Widowed soon after marriage, a young woman grapples with an inability to grieve, quirky relatives, and a startling discovery about her late husband.",2021,,114,"[\'comedy\', \'drama\']",[\'IN\'],,tt11142762,6.9,9287,3.997,7.1\n',
 'tm1179579,In Good Hands,MOVIE,"Diagnosed with a terminal illness, a single mother encounters a suave bachelor as she grapples with the future of her headstrong six-year-old.",2022,,104,"[\'drama\', \'comedy\', \'romance\', \'family\']",[\'TR\'],,tt14898794,5.6,2618,8.29,6.7\n',
 'tm1040733,In Our Mothers\' Gardens,MOVIE,"In Our Mothers\' 

In [6]:
# Reload both datasets with improved parsing settings

# Reloading Netflix data with improved handling of quotes and commas
netflix_data = pd.read_csv(netflix_data_path, quotechar='"', delimiter=',', escapechar='\\')
netflix_titles = netflix_data.iloc[:, 1]  # Assuming the second column holds the movie titles based on previous output

# Check the headers of the IMDb data to understand the structure better
with open(imdb_data_path, 'r') as file:
    imdb_headers = next(file)

imdb_headers, imdb_data.head()


('"I am sick of series with young and clueless people, talking about their ""problems"" all the time, self centered, boring and absolutely annoying (Popular; Dawson\'s Creek; Beverly Hills; etc). ""Hack"" is a breath of fresh air, with a great actor (David Morse), a completely different plot, credible people with REAL problems (thank God !!) and very, very good histories. I just love it!! I hope ""Hack"" will go on for a long time, because it is a great television series for grown up people, for a change.",positive\n',
   I am sick of series with young and clueless people, talking about their "problems" all the time, self centered, boring and absolutely annoying (Popular; Dawson's Creek; Beverly Hills; etc). "Hack" is a breath of fresh air, with a great actor (David Morse), a completely different plot, credible people with REAL problems (thank God !!) and very, very good histories. I just love it!! I hope "Hack" will go on for a long time, because it is a great television series for gr

In [7]:
def analyze_reviews(netflix_titles, imdb_reviews, imdb_sentiments):
    results = []
    for title in netflix_titles:
        # Find reviews that contain the movie title
        matches = imdb_reviews.str.contains(title, case=False, na=False)
        matched_reviews = imdb_sentiments[matches]
        
        if matched_reviews.empty:
            results.append("")  # No reviews found, append an empty string
        else:
            # Count positive and negative reviews
            positive_count = (matched_reviews == "positive").sum()
            negative_count = (matched_reviews == "negative").sum()
            
            # Determine the predominant sentiment
            if positive_count > negative_count:
                results.append("positive")
            elif negative_count > positive_count:
                results.append("negative")
            else:
                results.append("equal")  # Both counts are equal
            
    return results

# Apply the function
netflix_data['Predominant_Sentiment'] = analyze_reviews(netflix_titles, imdb_data['review'], imdb_data['sentiment'])

# Display the updated Netflix data to verify the new column
netflix_data.head()


KeyError: 'review'