<a href="https://colab.research.google.com/github/RinoaHime/Bedrock-course3/blob/main/Marlissa_chingu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rotten Tomatoes Movie Reviews: Sampling and Data Overview


In [None]:
import pandas as pd
from google.colab import drive
import random
# Mount Google Drive at /content/drive; the dataset paths used in the cells
# below all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# File path for user_reviews
file_path = '/content/drive/MyDrive/chingu_datasets/user_reviews.csv'

# Rows parsed per read; bounds how much of the raw file is held in memory at once
chunk_size = 100000
target_sample_size = 1000000  # 1 million rows

# Set random seed for reproducibility
random_seed = 42


def sample_csv_rows(path, n_rows, chunksize, seed):
    """Draw a uniform random sample of up to ``n_rows`` rows from a CSV file.

    The file is streamed in chunks. Every row receives an independent random
    key and the ``n_rows`` rows with the smallest keys are kept, so each row
    of the file is equally likely to be selected regardless of its position.
    (Filling the sample chunk by chunk until the target is reached would
    simply return the first ``n_rows`` rows of the file.)

    Parameters
    ----------
    path : str
        Path of the CSV file to sample from.
    n_rows : int
        Number of rows wanted in the sample.
    chunksize : int
        Number of rows parsed per read.
    seed : int
        Seed for the random keys; the same seed and file give the same sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index. When the file holds more
        than ``n_rows`` rows the result is ordered by random key (i.e. not in
        file order); otherwise every row is returned in file order.
    """
    # Temporary column holding the random keys; assumed not to clash with a
    # real column name in the file.
    key_col = '__sample_key__'
    rng = random.Random(seed)

    # Candidates are buffered and merged only once this many extra rows have
    # piled up, so the large sample frame is not re-copied for every chunk.
    slack = max(chunksize, n_rows // 10, 1)

    parts = []       # keyed frames that may still contain sampled rows
    pending = 0      # total rows currently held in `parts`
    cutoff = None    # largest key in the sample once it has been filled
    template = None  # empty frame with the file's columns, for an empty result

    for chunk in pd.read_csv(path, chunksize=chunksize):
        if template is None:
            template = chunk.iloc[0:0]

        keyed = chunk.assign(**{key_col: [rng.random() for _ in range(len(chunk))]})

        # Once n_rows rows with keys <= cutoff exist, a row with a larger key
        # can never be among the n_rows smallest, so drop it immediately.
        if cutoff is not None:
            keyed = keyed[keyed[key_col] < cutoff]
        if keyed.empty:
            continue

        parts.append(keyed)
        pending += len(keyed)

        if pending >= n_rows + slack:
            merged = pd.concat(parts, ignore_index=True).nsmallest(n_rows, key_col)
            cutoff = merged[key_col].max()
            parts, pending = [merged], len(merged)

    if not parts:
        return template if template is not None else pd.DataFrame()

    merged = pd.concat(parts, ignore_index=True)
    if len(merged) > n_rows:
        merged = merged.nsmallest(n_rows, key_col)
    return merged.drop(columns=key_col).reset_index(drop=True)


# Uniform sample of the full user_reviews file (at most target_sample_size rows)
sampled_reviews = sample_csv_rows(file_path, target_sample_size, chunk_size, random_seed)

# Save the sampled dataset
sampled_reviews.to_csv('/content/drive/MyDrive/chingu_datasets/sampled_user_reviews_1m_optimized.csv', index=False)




In [None]:
# Load the three datasets from Drive.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than from internal chunks. The recorded run echoed this read_csv line
# as a warning, presumably a DtypeWarning about mixed-type columns (TODO
# confirm which columns and give them an explicit dtype).
critic_reviews = pd.read_csv('/content/drive/MyDrive/chingu_datasets/critic_reviews.csv', low_memory=False)
movies = pd.read_csv('/content/drive/MyDrive/chingu_datasets/movies.csv')
sampled_user_reviews = pd.read_csv('/content/drive/MyDrive/chingu_datasets/sampled_user_reviews_1m_optimized.csv')

# Quick sanity check of the (rows, columns) of each frame
print('Critic Reviews shape:', critic_reviews.shape)
print('Movies shape:', movies.shape)
print('User Reviews shape:', sampled_user_reviews.shape)

  critic_reviews = pd.read_csv('/content/drive/MyDrive/chingu_datasets/critic_reviews.csv')


Critic Reviews shape: (967564, 16)
Movies shape: (10233, 13)
User Reviews shape: (1000000, 13)


# Datasets

## `critic_reviews`

#### Data Dictionary for `critic_reviews`

| Column Name       | Data Type | Description                                                                                         |
| ----------------- | --------- | --------------------------------------------------------------------------------------------------- |
| `reviewId`        | int64     | Unique identifier for each critic's review.                                                         |
| `movieId`         | object    | Unique identifier for each movie.                                                                   |
| `creationDate`    | object    | Date when the review was created.                                                                   |
| `criticName`      | object    | Name of the critic who wrote the review.                                                            |
| `criticPageUrl`   | object    | URL to the critic's profile or review page.                                                         |
| `reviewState`     | object    | State of the review as recorded on Rotten Tomatoes (e.g., "fresh", "rotten").                       |
| `isFresh`         | bool      | Indicates whether the review is considered "fresh" (positive).                                      |
| `isRotten`        | bool      | Indicates whether the review is considered "rotten" (negative).                                     |
| `isRtUrl`         | object    | Indicates whether there is a URL link to the full review on Rotten Tomatoes                                  |
| `isTopCritic`     | bool      | Indicates whether the review is from a top critic.                                                  |
| `publicationUrl`  | object    | URL of the publication where the review was published.                                              |
| `publicationName` | object    | Name of the publication that published the review.                                                  |
| `reviewUrl`       | object    | URL link to the full review (if available).                                                         |
| `quote`           | object    | A short excerpt or quote from the review.                                                           |
| `scoreSentiment`  | object    | The sentiment of the review score (e.g., "POSITIVE", "NEGATIVE").                                   |
| `originalScore`   | object    | The original score given by the critic, on the critic's own scale (e.g., "3.5/5", "B+"); may be missing. |

In [None]:
# Column names, non-null counts, dtypes and memory footprint of the critic reviews.
critic_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967564 entries, 0 to 967563
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   reviewId         967564 non-null  int64 
 1   movieId          967564 non-null  object
 2   creationDate     967564 non-null  object
 3   criticName       958112 non-null  object
 4   criticPageUrl    958112 non-null  object
 5   reviewState      967564 non-null  object
 6   isFresh          967564 non-null  bool  
 7   isRotten         967564 non-null  bool  
 8   isRtUrl          911079 non-null  object
 9   isTopCritic      967564 non-null  bool  
 10  publicationUrl   967564 non-null  object
 11  publicationName  967564 non-null  object
 12  reviewUrl        849906 non-null  object
 13  quote            934601 non-null  object
 14  scoreSentiment   967564 non-null  object
 15  originalScore    693527 non-null  object
dtypes: bool(3), int64(1), object(12)
memory usage: 98.7+ MB


In [None]:
# Preview the first rows to see what the raw values look like.
critic_reviews.head()

Unnamed: 0,reviewId,movieId,creationDate,criticName,criticPageUrl,reviewState,isFresh,isRotten,isRtUrl,isTopCritic,publicationUrl,publicationName,reviewUrl,quote,scoreSentiment,originalScore
0,1913967,16db6b7e-176a-3a45-a31f-158a09f94630,1800-01-01,Jonathan Rosenbaum,/critics/jonathan-rosenbaum,fresh,True,False,False,True,/critics/source/66,Chicago Reader,http://www.chicagoreader.com/chicago/the-adven...,Terry Gilliam's third fantasy feature may not ...,POSITIVE,
1,1906428,29998351-00a7-31ca-b616-6436ffe438e5,1800-01-01,Jennie Kermode,/critics/jennie-kermode,fresh,True,False,False,False,/critics/source/1869,Eye for Film,http://www.eyeforfilm.co.uk/reviews.php?id=7968,,POSITIVE,3.5/5
2,1902262,34bf7e79-d110-3b33-93f5-d1e3b79aea10,1800-01-01,Owen Gleiberman,/critics/owen-gleiberman,fresh,True,False,False,True,/critics/source/150,Entertainment Weekly,https://ew.com/article/1993/10/15/movie-review...,"Weir, working from a script by Rafael Yglesias...",POSITIVE,B
3,1897051,3aeb7064-f73d-32c1-8432-94babc14e6f1,1800-01-01,Owen Gleiberman,/critics/owen-gleiberman,fresh,True,False,False,True,/critics/source/150,Entertainment Weekly,"http://www.ew.com/ew/article/0,,309271,00.html","Coming out from behind Spike Lee's camera, Ern...",POSITIVE,B+
4,1909547,3bd845e8-2e38-3504-9682-45bede2bb83c,1800-01-01,,,rotten,False,True,False,True,/critics/source/150,Entertainment Weekly,"http://www.ew.com/ew/article/0,,318442,00.html",,NEGATIVE,D+


In [None]:
# Distribution of the isRtUrl flag. value_counts() leaves out missing values
# by default, so the counts cover only the non-null entries of the column.
critic_reviews['isRtUrl'].value_counts()

Unnamed: 0_level_0,count
isRtUrl,Unnamed: 1_level_1
False,910867
True,212


## `movies`

#### Data Dictionary for `movies`

| Column Name               | Data Type | Description                                                                                           |
| ------------------------- | --------- | ----------------------------------------------------------------------------------------------------- |
| `movieId`                 | object    | Unique identifier for each movie.                                                                     |
| `movieYear`               | int64     | The year the movie was released.                                                                      |
| `movieURL`                | object    | URL to the movie's page on Rotten Tomatoes.                                                           |
| `movieTitle`              | object    | The title of the movie.                                                                               |
| `critic_score`            | float64   | Average score given by critics for the movie.                                                         |
| `critic_sentiment`        | object    | Sentiment of the critic's reviews (e.g., positive, negative, neutral).                                 |
| `audience_score`          | float64   | Average score given by audience members for the movie.                                                |
| `audience_sentiment`      | object    | Sentiment of audience reviews (e.g., positive, negative, neutral).                                     |
| `release_date_theaters`   | object    | The theatrical release date, stored as text together with the release type (e.g., "Mar 3, 1915, Wide"). |
| `release_date_streaming`  | object    | The release date of the movie on streaming platforms.                                                 |
| `rating`                  | object    | The movie's rating (e.g., PG, PG-13, R).                                                              |
| `original_language`       | object    | The original language of the movie (e.g., English, French).                                           |
| `runtime`                 | object    | The runtime of the movie, stored as text in hours and minutes (e.g., "1h 9m", "14m").                 |

In [None]:
# Column names, non-null counts, dtypes and memory footprint of the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10233 entries, 0 to 10232
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   movieId                 10233 non-null  object 
 1   movieYear               10233 non-null  int64  
 2   movieURL                10233 non-null  object 
 3   movieTitle              10233 non-null  object 
 4   critic_score            9738 non-null   float64
 5   critic_sentiment        9738 non-null   object 
 6   audience_score          10144 non-null  float64
 7   audience_sentiment      10144 non-null  object 
 8   release_date_theaters   7928 non-null   object 
 9   release_date_streaming  10233 non-null  object 
 10  rating                  7758 non-null   object 
 11  original_language       10214 non-null  object 
 12  runtime                 10216 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 1.0+ MB


In [None]:
# Preview the first rows to see what the raw values look like.
movies.head()

Unnamed: 0,movieId,movieYear,movieURL,movieTitle,critic_score,critic_sentiment,audience_score,audience_sentiment,release_date_theaters,release_date_streaming,rating,original_language,runtime
0,281004c8-bbc3-3522-8246-26ee44469bb4,1902,https://www.rottentomatoes.com/m/le_voyage_dan...,A Trip to the Moon,100.0,positive,90.0,positive,"Oct 4, 1902, Original","Aug 27, 2016",,French (France),14m
1,ac173b27-b71a-34b3-9888-5304a0e165e0,1915,https://www.rottentomatoes.com/m/birth_of_a_na...,The Birth of a Nation,91.0,positive,47.0,negative,"Mar 3, 1915, Wide","Jul 8, 2016",,,3h 10m
2,96f91c04-5e32-39b2-805f-4c1d1bcb3b1b,1921,https://www.rottentomatoes.com/m/the_cabinet_o...,The Cabinet of Dr. Caligari,96.0,positive,89.0,positive,"Mar 19, 1921, Wide","Mar 22, 2016",,German,1h 9m
3,b70c2dc6-41e7-3240-a38f-5f5e5018eeb1,1921,https://www.rottentomatoes.com/m/1052609-kid,The Kid,100.0,positive,95.0,positive,"Jan 21, 1921, Original","Sep 2, 2016",,English,1h 0m
4,13101368-55d8-30a1-9d41-4271211defbb,1922,https://www.rottentomatoes.com/m/nosferatu,Nosferatu,97.0,positive,87.0,positive,"Mar 5, 1922, Original","Jul 15, 2008",,German,1h 5m


In [None]:
# Count how often each movie URL occurs; any count above 1 would mean the
# same Rotten Tomatoes page appears on more than one row.
movies['movieURL'].value_counts()

Unnamed: 0_level_0,count
movieURL,Unnamed: 1_level_1
https://www.rottentomatoes.com/m/le_voyage_dans_la_lune,1
https://www.rottentomatoes.com/m/hichki,1
https://www.rottentomatoes.com/m/alex_strangelove,1
https://www.rottentomatoes.com/m/i_kill_giants,1
https://www.rottentomatoes.com/m/destination_wedding,1
...,...
https://www.rottentomatoes.com/m/birdemic_shock_and_terror-2008,1
https://www.rottentomatoes.com/m/curious_case_of_benjamin_button,1
https://www.rottentomatoes.com/m/clone_wars,1
https://www.rottentomatoes.com/m/seven_pounds,1


## `user_reviews`

#### Data Dictionary for `user_reviews`

| Column Name       | Data Type | Description                                                                                                 |
| ----------------- | --------- | ----------------------------------------------------------------------------------------------------------- |
| `movieId`         | object    | Unique identifier for each movie.                                                                           |
| `rating`          | float64   | The rating provided by the user for the movie.                                                              |
| `quote`           | object    | The text content of the user review (a short quote or description of their thoughts).                       |
| `reviewId`        | object    | Unique identifier for each user review (missing for the vast majority of rows in the sample).               |
| `isVerified`      | bool      | Indicates whether the review was written by a verified user.                                                |
| `isSuperReviewer` | bool      | Indicates whether the user is a "super reviewer" (a frequent or influential reviewer).                      |
| `hasSpoilers`     | bool      | Indicates whether the review contains spoilers.                                                             |
| `hasProfanity`    | bool      | Indicates whether the review contains profanity.                                                            |
| `score`           | float64   | The numeric score (possibly the same as the rating) given by the user.                                      |
| `creationDate`    | object    | The date when the user review was created.                                                                  |
| `userDisplayName` | object    | The display name of the user (missing for the vast majority of rows in the sample).                         |
| `userRealm`       | object    | A field indicating the platform or realm of the user (e.g., "RT" for Rotten Tomatoes).                      |
| `userId`          | object    | Unique identifier for the user who wrote the review.                                                        |

In [None]:
# Column names, non-null counts, dtypes and memory footprint of the sampled user reviews.
sampled_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   movieId          1000000 non-null  object 
 1   rating           1000000 non-null  float64
 2   quote            1000000 non-null  object 
 3   reviewId         2884 non-null     object 
 4   isVerified       1000000 non-null  bool   
 5   isSuperReviewer  1000000 non-null  bool   
 6   hasSpoilers      1000000 non-null  bool   
 7   hasProfanity     1000000 non-null  bool   
 8   score            1000000 non-null  float64
 9   creationDate     1000000 non-null  object 
 10  userDisplayName  1550 non-null     object 
 11  userRealm        1000000 non-null  object 
 12  userId           1000000 non-null  object 
dtypes: bool(4), float64(2), object(7)
memory usage: 72.5+ MB


In [None]:
# Preview the first rows to see what the raw values look like.
sampled_user_reviews.head()

Unnamed: 0,movieId,rating,quote,reviewId,isVerified,isSuperReviewer,hasSpoilers,hasProfanity,score,creationDate,userDisplayName,userRealm,userId
0,044acdfc-f685-338b-b465-d5405e5298c0,5.0,Tarantino`s great crime epic. Everyone in the ...,,False,False,False,False,5.0,2005-10-15,,RT,900809275
1,a67d1d59-e02f-35e0-9c53-b82e80799111,1.0,Unenjoyable for anyone over the age of RETARDED.,,False,False,False,False,1.0,2005-11-22,,RT,900699279
2,85a99b8b-4724-38b6-a5d0-23c0161a3ce5,3.5,Other people go outside and smoke when they ta...,,False,False,False,False,3.5,2004-06-30,,RT,900659533
3,e2bff218-467b-3595-ae11-c62d690641ad,5.0,"If you have seen this movie, then no words nee...",,False,False,False,False,5.0,2005-10-22,,RT,900767626
4,e17a5735-c633-3e81-91a2-dd5b500931f7,5.0,YAY YAH I COULD WATCH THIS WITH MARK LOL\r\n,,False,False,False,False,5.0,2005-12-22,,RT,1249927
