In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt

In [27]:
df = pd.read_csv('2022 Movies.csv')
df.head()

Unnamed: 0,Year,Name,Genre,IMDB,Rotten Tomatoes,Rotten Audience,Metacritic,Google_user,PR,Country,Language,Comic,Animation,Tv_series,Budget,Box_office,Run_time,Month
0,2021,The Last Duel,Drama/Historical Drama,7.4,85,81,67,73,7.0,UK,English,no,no,no,100.0,30.6,153,January
1,2021,Ghostbusters: Afterlife,Fantasy/Comedy,7.1,63,94,45,76,6.5,USA,English,no,no,no,75.0,204.4,124,January
2,2021,Hawkeye,Action/Superhero,7.5,88,89,66,89,7.5,USA,English,yes,no,yes,150.0,0.0,40-62,January
3,2021,Operation Java,Crime/Drama,8.2,0,0,0,96,7.5,India,Malayalam,no,no,no,0.0,0.0,146,January
4,2021,Eternals,Action/Superhero,6.3,47,77,52,71,6.5,USA,English,yes,no,no,200.0,402.0,156,January


### Checking null value

In [28]:
df.isnull().values.any()

False

### After loading the dataset the important thing is to get an idea of the data. That is, what kind of columns are there in the data, how many values are there, etc.


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             181 non-null    int64  
 1   Name             181 non-null    object 
 2   Genre            181 non-null    object 
 3   IMDB             181 non-null    float64
 4   Rotten Tomatoes  181 non-null    int64  
 5   Rotten Audience  181 non-null    int64  
 6   Metacritic       181 non-null    int64  
 7   Google_user      181 non-null    int64  
 8   PR               181 non-null    float64
 9   Country          181 non-null    object 
 10  Language         181 non-null    object 
 11  Comic            181 non-null    object 
 12  Animation        181 non-null    object 
 13  Tv_series        181 non-null    object 
 14  Budget           181 non-null    float64
 15  Box_office       181 non-null    float64
 16  Run_time         181 non-null    object 
 17  Month           

### Describe the dataset or Descriptive statistics

In [30]:
df.describe()

Unnamed: 0,Year,IMDB,Rotten Tomatoes,Rotten Audience,Metacritic,Google_user,PR,Budget,Box_office
count,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0
mean,2015.61326,6.919337,56.690608,63.116022,34.679558,80.044199,6.825967,41.144696,95.236115
std,8.49213,0.753445,33.486703,28.739304,31.509735,10.821698,0.857218,64.200114,176.781253
min,1963.0,4.9,0.0,0.0,0.0,50.0,5.0,0.0,0.0
25%,2012.0,6.4,40.0,54.0,0.0,72.0,6.0,0.0,0.0
50%,2019.0,6.9,66.0,72.0,46.0,81.0,7.0,9.0,11.2
75%,2022.0,7.4,85.0,85.0,63.0,88.0,7.5,60.0,104.4
max,2022.0,9.2,100.0,100.0,86.0,97.0,9.0,340.0,1004.0


In [31]:
# we can break down descriptive statistics functions

print('Average Budget:', df['Budget'].mean())
print('Standard Deviation: ', df['Budget'].std())

Average Budget: 41.144696132596685
Standard Deviation:  64.20011412711762


### Data Filtering

In [32]:
# percentage of each country

round(df['Country'].value_counts()/df['Country'].count()*100,2)

USA            51.93
India          18.78
South Korea    10.50
UK              5.52
Indonesia       2.76
Japan           2.76
France          2.21
Germany         1.10
Canada          0.55
Turkish         0.55
Hong Kong       0.55
USA             0.55
China           0.55
Romania         0.55
UK              0.55
Russia          0.55
Name: Country, dtype: float64

In [33]:
# percentage of language

round(df['Language'].value_counts()/df['Language'].count()*100,2)

English       62.43
Korean        10.50
Hindi          9.39
Malayalam      4.97
Tamil          3.87
Indonesian     2.76
Japanese       2.76
French         0.55
Turkey         0.55
Cantonese      0.55
Mandarin       0.55
Russian        0.55
Telugu         0.55
Name: Language, dtype: float64

In [34]:
# number of movies per month

df['Month'].value_counts()

September    23
December     23
May          19
April        17
March        16
October      16
november     16
February     15
January      12
June         11
August        8
July          5
Name: Month, dtype: int64

In [35]:
# movies are rated above 80 on Rotten Tomatoes and Metacritic 

df[(df['Rotten Tomatoes'] >= 80) & (df['Metacritic'] >= 80)]

Unnamed: 0,Year,Name,Genre,IMDB,Rotten Tomatoes,Rotten Audience,Metacritic,Google_user,PR,Country,Language,Comic,Animation,Tv_series,Budget,Box_office,Run_time,Month
10,2021,Lupin,Mystery/Thriller,7.5,97,79,81,94,8.0,France,French,no,no,yes,0.0,0.0,40-52,January
13,2019,Reign of the Superman,Action/Superhero,6.7,93,74,83,73,7.0,USA,English,yes,yes,no,0.0,0.4223,87,February
31,2015,And Then There Were none,Mystery/Crime,7.8,80,91,82,87,7.0,UK,English,no,no,no,0.0,0.0,180,March
84,2022,The northman,Action/Drama,7.1,89,64,82,68,7.0,USA,English,no,no,no,80.0,69.6,136,June
108,2009,District 9,Sci-fi,7.9,90,82,81,88,7.0,USA,English,no,no,no,30.0,210.8,112,September
126,2011,Game of Thrones,Drama,9.2,89,85,86,92,9.0,USA,English,no,no,yes,0.0,0.0,50-82,October
149,2009,Drag me to hell,Horror/Supernatural,6.6,92,62,83,82,6.5,USA,English,no,no,no,30.0,90.8,99,november
163,2022,The Guardians of the galaxy holiday special,Superhero/Comedy,6.9,93,79,82,75,6.5,USA,English,yes,no,no,0.0,0.0,42,December
180,1963,The Great Escape,War/Action,8.2,94,95,86,86,8.0,USA,English,no,no,no,3.8,11.7,172,December


In [36]:
df[df['Google_user'] > 95]

Unnamed: 0,Year,Name,Genre,IMDB,Rotten Tomatoes,Rotten Audience,Metacritic,Google_user,PR,Country,Language,Comic,Animation,Tv_series,Budget,Box_office,Run_time,Month
3,2021,Operation Java,Crime/Drama,8.2,0,0,0,96,7.5,India,Malayalam,no,no,no,0.0,0.0,146,January
14,2019,The Mandalorian,Sci-fi/Drama,8.8,95,92,71,96,8.5,USA,English,yes,no,yes,0.0,0.0,30-50,February
28,2022,All of us are dead,Drama,7.6,87,79,67,96,8.0,South Korea,Korean,no,no,no,0.0,0.0,53-72,March
53,2022,Midnight At the Pera Palace,Historical Film/Drama,7.1,0,90,0,96,7.5,Turkish,Turkey,no,no,yes,0.0,0.0,45,April
56,2018,Karwan,Drama/Road,7.6,70,74,0,96,7.5,India,Hindi,no,no,no,230.0,264.2,114,April
103,2016,Stranger Things,Drama,8.7,92,90,74,96,8.5,USA,English,no,no,yes,270.0,0.0,42-150,September
134,2022,Diary,Thriller/Mystery,7.3,0,0,0,96,7.5,India,Tamil,no,no,no,0.0,0.0,132,October
161,2022,Sardar,Action/Spy,7.6,0,0,0,97,7.0,India,Tamil,no,no,no,0.0,100.0,165,December


In [37]:
# is there any documentary

df[df.Genre.str.contains('Documentary')]

Unnamed: 0,Year,Name,Genre,IMDB,Rotten Tomatoes,Rotten Audience,Metacritic,Google_user,PR,Country,Language,Comic,Animation,Tv_series,Budget,Box_office,Run_time,Month
65,2022,Return to Space,Documentary,7.3,79,78,56,86,7.0,USA,English,no,no,no,0.0,0.0,128,May


In [38]:
# how many tv-series, animation movies and comic based I watched

print('Total TV-Series:', len(df[df['Tv_series']=='yes']))
print('Total Animation Movies:', len(df[df['Animation']=='yes']))
print('Total Comic based Movies:', len(df[df['Comic']=='yes']))

Total TV-Series: 20
Total Animation Movies: 26
Total Comic based Movies: 30


In [39]:
# the minimum year and movie name

min_year = df.loc[df['Year'].idxmin()]
min_year['Name']

'The Great Escape'

### Insight

In [40]:
# I want to find the total number of movies released before 2000 and after 2000

before_2000 = len(df[df['Year'] < 2000])
after_2000 = len(df[df['Year'] >= 2000])
print('Before 2000:', before_2000)
print('After 2000:', after_2000)

Before 2000: 9
After 2000: 172


In [41]:
# I want to know how the movies have earned at the box office

df.insert(16, 'income', df['Box_office']-df['Budget'])

In [42]:
df[['Name','Budget','Box_office','income']].head()

Unnamed: 0,Name,Budget,Box_office,income
0,The Last Duel,100.0,30.6,-69.4
1,Ghostbusters: Afterlife,75.0,204.4,129.4
2,Hawkeye,150.0,0.0,-150.0
3,Operation Java,0.0,0.0,0.0
4,Eternals,200.0,402.0,202.0


In [44]:
df['profit'] = df.apply(lambda row: row['Box_office'] - row['Budget'] if row['Budget'] != 0 else 'budget is zero', axis=1)

In [45]:
df[['Name','Budget','Box_office','profit']].head(20)

Unnamed: 0,Name,Budget,Box_office,profit
0,The Last Duel,100.0,30.6,-69.4
1,Ghostbusters: Afterlife,75.0,204.4,129.4
2,Hawkeye,150.0,0.0,-150.0
3,Operation Java,0.0,0.0,budget is zero
4,Eternals,200.0,402.0,202.0
5,Johnny English Strikes Again,25.0,159.0,134.0
6,The Raid Redemption,1.1,9.3,8.2
7,The Witcher,0.0,0.0,budget is zero
8,The Mist,18.0,57.3,39.3
9,Only Murders in the Building,0.0,0.0,budget is zero


### I will create a new column where I will check whether the movie  is fresh or rotten based on Rotten Tomatoes

* well based on the internet, we know that in rotten tomatoes if a movie  gets 60% or more then it's fresh otherwise it's rotten. 


In [46]:
# method 1

def fresh_or_rotten(col):
    if col['Rotten Tomatoes'] >= 60:
        value='Fresh'
    else:
        value='Rotten'
    return value

df['freshness'] = df.apply(fresh_or_rotten, axis=1)

In [47]:
# method 2
# using lambda 

df['freshness'] = df['Rotten Tomatoes'].apply(lambda x: 'Fresh' if x >= 60 else 'Rotten')

In [48]:
df.insert(5, 'Freshness', df.pop('freshness'))
df.head()

Unnamed: 0,Year,Name,Genre,IMDB,Rotten Tomatoes,Freshness,Rotten Audience,Metacritic,Google_user,PR,...,Language,Comic,Animation,Tv_series,Budget,Box_office,income,Run_time,Month,profit
0,2021,The Last Duel,Drama/Historical Drama,7.4,85,Fresh,81,67,73,7.0,...,English,no,no,no,100.0,30.6,-69.4,153,January,-69.4
1,2021,Ghostbusters: Afterlife,Fantasy/Comedy,7.1,63,Fresh,94,45,76,6.5,...,English,no,no,no,75.0,204.4,129.4,124,January,129.4
2,2021,Hawkeye,Action/Superhero,7.5,88,Fresh,89,66,89,7.5,...,English,yes,no,yes,150.0,0.0,-150.0,40-62,January,-150.0
3,2021,Operation Java,Crime/Drama,8.2,0,Rotten,0,0,96,7.5,...,Malayalam,no,no,no,0.0,0.0,0.0,146,January,budget is zero
4,2021,Eternals,Action/Superhero,6.3,47,Rotten,77,52,71,6.5,...,English,yes,no,no,200.0,402.0,202.0,156,January,202.0


## Visualization

In [51]:
# movies seen year

alt.Chart(df, title='Yearly Distribution of Movies').mark_bar().encode(alt.X('Year',bin=alt.Bin(maxbins=20),
                                                                            axis=alt.Axis(titleFontSize=15)),
                                                                      alt.Y('count()', axis=alt.Axis(titleFontSize=15))
                                                                      ).configure_axis(grid=False, domain=True).interactive()