In [2]:
import pandas as pd
import re 
import numpy as np

In [3]:
data = pd.read_csv('imdb_2024.csv')
res = data.columns

#### Dropping columns that are not usefull for this case study.

In [4]:
data = data.drop(columns=['Home_Page', 'Overview', 'Cast', 'Storyline', 'Original_Language', 'Production_Company', 'Run_Time_Minutes', 'Release_Country', 'Tagline'], axis=1)

In [5]:
def to_str(fstring):
    new_string = ""
    for char in fstring:
        if char.isnumeric():
            new_string = new_string + char
        elif char =="M":
            return float(new_string[:-1]) * 1e6
        elif char == "B":
            return float(new_string[:-1]) * 1e9
    return float(new_string)

In [6]:
data["Budget"] = data["Budget_USD"].apply(to_str)
data["Revenue"] = data["Revenue_$"].apply(to_str)
# print(data.columns)

In [7]:
data = data.drop(columns=['Budget_USD', 'Revenue_$'])

In [9]:
data.loc[(data['Budget'] > 0) & (data['Revenue']>0), 'Profit'] = data['Revenue'] - data['Budget']
# print(data)

In [10]:
data = data[data['Profit'] != 0]
data = data.dropna()
print(data)

            Movie_Name                                             Genres  \
0            Nosferatu                   ['Fantasy', 'Horror', 'Mystery']   
1         Emilia Pérez  ['Comedy', 'Crime', 'Drama', 'Musical', 'Thril...   
2        The Brutalist                                          ['Drama']   
3       Saturday Night        ['Biography', 'Comedy', 'Drama', 'History']   
4        The Substance                      ['Drama', 'Horror', 'Sci-Fi']   
..                 ...                                                ...   
384             Afraid        ['Horror', 'Mystery', 'Sci-Fi', 'Thriller']   
385              Tarot                                         ['Horror']   
403  Lisa Frankenstein                                      ['Not_Found']   
432    Ordinary Angels                                      ['Not_Found']   
461       The Exorcism                             ['Horror', 'Thriller']   

    Release_Date  Vote_Average Vote_Count      Budget      Revenue  \
0    

## After cleaning up the we end up with a sample of 104 films released in 2024 that provide the necessary information needed for this project.

In [14]:
data = data.sort_values(by="Profit", ascending=False)
data.reset_index(drop=True, inplace=True)

## In the following steps we will be filtering the data based to the most common genre.

In [15]:
def top_genres(column):
    genres_dictionary = {}
    for entry in column:
        entry = re.sub(r"[\[\]']", "", entry)
        entry = entry.split(", ")
        for genre in entry:
            if genre in  genres_dictionary.keys():
                genres_dictionary[genre] += 1
            else:
                genres_dictionary[genre] = 1
    return genres_dictionary

In [16]:
successfull = data.head(20)
not_successfull = data.tail(20)
moderate_success = data.iloc[55:75]

In [17]:
genres = top_genres(not_successfull["Genres"])

popular_genres = pd.DataFrame.from_dict(genres, orient='index', columns=['Appearances'])
popular_genres = popular_genres.rename(columns={0: 'Genre'})
print(popular_genres.sort_values(by='Appearances', ascending=False))

           Appearances
Action              12
Thriller            10
Drama                7
Comedy               7
Fantasy              5
Crime                3
Mystery              3
Biography            2
Horror               2
Sci-Fi               2
Romance              2
Adventure            2
History              1
Western              1
War                  1
Music                1
Musical              1


In [18]:
total_genres_app = popular_genres['Appearances'].sum()
popular_genres['Frequency%'] = (popular_genres['Appearances'] / total_genres_app) * 100
print(popular_genres.sort_values(by='Frequency%', ascending=False))

           Appearances  Frequency%
Action              12   19.354839
Thriller            10   16.129032
Drama                7   11.290323
Comedy               7   11.290323
Fantasy              5    8.064516
Crime                3    4.838710
Mystery              3    4.838710
Biography            2    3.225806
Horror               2    3.225806
Sci-Fi               2    3.225806
Romance              2    3.225806
Adventure            2    3.225806
History              1    1.612903
Western              1    1.612903
War                  1    1.612903
Music                1    1.612903
Musical              1    1.612903


In [None]:
# successfull.to_excel('successful_genres.xlsx', index=False)
# moderate_success.to_excel('mod_s.xlsx', index=False)
# not_successfull.to_excel('not_s.xlsx', index=False)

In [23]:
high_rated = data.loc[data['Vote_Average'] >= 7.5]
mid_rated = data.loc[(data['Vote_Average'] >= 5.0) & (data['Vote_Average'] <= 7.4)]
low_rated = data.loc[data['Vote_Average'] <= 4.9]

high_rated.to_excel('high_rated.xlsx', index= False)
mid_rated.to_excel('mid_rated.xlsx', index=False)
low_rated.to_excel('low_rated.xlsx', index=False)

print(low_rated['Profit'].describe())

count    1.100000e+01
mean    -1.445448e+07
std      4.557272e+07
min     -1.060000e+08
25%     -2.200000e+07
50%     -1.099930e+07
75%      1.300000e+07
max      4.100000e+07
Name: Profit, dtype: float64
