In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Input: CSV that is loaded into `data` further down.
FINAL_DATA_PATH = "../../data/final/final.csv"
# Output: CSV that the sampled dataframe is written to at the end.
SAMPLE_DATA_PATH = "../../data/final/sample.csv"

In [3]:
# Sample-size budget: the per-group sample sizes below are derived from it
# (e.g. N // 3 and N // 5).
N = 15_000

In [4]:
# Read the full movie table from FINAL_DATA_PATH, then show its first rows
# as a quick check that the load worked.
data = pd.read_csv(FINAL_DATA_PATH)
data_preview = data.head()
print(data_preview)

                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                              genres  \
0  ['adventure', 'animation', 'children', 'comedy...   
1               ['adventure', 'children', 'fantasy']   
2                              ['comedy', 'romance']   
3                     ['comedy', 'drama', 'romance']   
4                                         ['comedy']   

                                      positive_users  positive_count  \
0  [1, 2, 7, 12, 24, 35, 42, 51, 54, 64, 72, 79, ...           50572   
1  [9, 41, 51, 73, 82, 101, 117, 177, 207, 210, 2...           10622   
2  [9, 41, 200, 314, 367, 473, 475, 540, 775, 940...            5152   
3  [260, 934, 1760, 2051, 2077, 2721, 2722, 2957,...             781   
4  [88, 196, 207, 214, 323, 385, 464, 473, 503, 5...            4407   

                         

In [5]:
# A movie counts as profitable when its revenue is strictly greater than its budget.
is_profitable = data["revenue"] > data["budget"]
profitable_movies = data.loc[is_profitable]

profitable_count = len(profitable_movies)
profitable_share = (profitable_count / len(data)) * 100
print("Number of profitable movies:", profitable_count)
print("Percentage of profitable movies: {:.2f}%".format(profitable_share))

# Draw N // 3 profitable movies; the fixed random_state makes the draw repeatable.
profitable_sample_size = N // 3
print('\nSampling {} movies from profitable movies.'.format(profitable_sample_size))
profitable_movies_sample = profitable_movies.sample(
    n=profitable_sample_size,
    random_state=42,
)
print('Shape of profitable movies sample:', profitable_movies_sample.shape)

Number of profitable movies: 8647
Percentage of profitable movies: 18.79%

Sampling 5000 movies from profitable movies.
Shape of profitable movies sample: (5000, 20)


In [6]:
# A movie counts as unprofitable when its budget is strictly greater than its
# revenue (rows where the two are equal fall into neither group).
is_unprofitable = data["budget"] > data["revenue"]
unprofitable_movies = data.loc[is_unprofitable]

unprofitable_count = len(unprofitable_movies)
unprofitable_share = (unprofitable_count / len(data)) * 100
print("Number of unprofitable movies:", unprofitable_count)
print("Percentage of unprofitable movies: {:.2f}%".format(unprofitable_share))

# Draw N // 5 unprofitable movies; the fixed random_state makes the draw repeatable.
unprofitable_sample_size = N // 5
unprofitable_movies_sample = unprofitable_movies.sample(
    n=unprofitable_sample_size,
    random_state=42,
)
print("Shape of unprofitable movies sample:", unprofitable_movies_sample.shape)

Number of unprofitable movies: 6234
Percentage of unprofitable movies: 13.55%
Shape of unprofitable movies sample: (3000, 20)


In [7]:
# tmdb_ids already drawn in the profitable / unprofitable samples; these must
# not be picked again.
excluded_movie_ids = set(profitable_movies_sample["tmdb_id"]).union(
    unprofitable_movies_sample["tmdb_id"]
)

# Candidate pool: movies not sampled yet whose vote_average is above 6.5.
not_yet_sampled = ~data["tmdb_id"].isin(excluded_movie_ids)
highly_rated = data["vote_average"] > 6.5
remaining_movies = data[not_yet_sampled & highly_rated]

# Draw N // 3 movies from that pool with a fixed seed.
remaining_sample_size = N // 3
remaining_sample = remaining_movies.sample(n=remaining_sample_size, random_state=42)
print("Shape of remaining sample:", remaining_sample.shape)

Shape of remaining sample: (5000, 20)


In [8]:
# Collect the tmdb_ids of every movie drawn so far so that none is sampled twice.
all_sampled_ids = (
    set(profitable_movies_sample["tmdb_id"])
    | set(unprofitable_movies_sample["tmdb_id"])
    | set(remaining_sample["tmdb_id"])
)

# Candidate pool: movies not sampled yet with 3 <= vote_average < 6
# (lower bound inclusive, upper bound exclusive).
low_rating_movies = data[
    (~data["tmdb_id"].isin(all_sampled_ids))
    & (data["vote_average"] >= 3)
    & (data["vote_average"] < 6)
]

# Fill whatever is left of the N-row budget instead of hard-coding the size,
# so the combined sample still has N rows if N is changed.
# With N = 15000 and the three earlier samples this is 2000.
low_rating_sample_size = N - (
    len(profitable_movies_sample)
    + len(unprofitable_movies_sample)
    + len(remaining_sample)
)
low_rating_sample = low_rating_movies.sample(
    n=low_rating_sample_size, random_state=42
)

print("Shape of low rating sample:", low_rating_sample.shape)

Shape of low rating sample: (2000, 20)


In [9]:
# The four samples, in the order they are stacked.
sample_parts = [
    profitable_movies_sample,
    unprofitable_movies_sample,
    remaining_sample,
    low_rating_sample,
]

# Stack them into one frame with a fresh 0..n-1 index, shuffle every row with
# a fixed seed, and renumber the index again after the shuffle.
final_sampled_data = (
    pd.concat(sample_parts, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Report the dimensions of the combined sample, then preview its first rows.
final_shape = final_sampled_data.shape
print("Shape of final sampled dataframe:", final_shape)
print(final_sampled_data.head())

Shape of final sampled dataframe: (15000, 20)
                                               title  \
0  Much Ado About Nothing: Shakespeare's Globe Th...   
1                                       Piranhaconda   
2                                       Edge of Fury   
3                                   Bird of Paradise   
4                        Critters 2: The Main Course   

                              genres  \
0                ['comedy', 'drama']   
1               ['horror', 'sci-fi']   
2                       ['thriller']   
3  ['adventure', 'drama', 'romance']   
4     ['comedy', 'horror', 'sci-fi']   

                                      positive_users  positive_count  \
0                                    [61947, 172263]               2   
1                            [18184, 177617, 198781]               3   
2                                                 []               0   
3                                                 []               0   
4  [1109, 1429, 

In [11]:
# Unpack the (row count, column count) pair and print a one-line summary.
rows, cols = final_sampled_data.shape
summary_template = "Final sampled data has {} rows and {} columns."
print(summary_template.format(rows, cols))

Final sampled data has 15000 rows and 20 columns.


In [12]:
# Write the shuffled sample to SAMPLE_DATA_PATH; index=False keeps the
# dataframe index out of the CSV.
final_sampled_data.to_csv(
    SAMPLE_DATA_PATH,
    index=False,
)