In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Handle the scrap data and organize according our need
df = pd.read_csv('movie_data.csv')

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Title,Year,Genre,Certificate,Rating,Votes,Description,Lead,Director,Gross,Weather
0,0,Indiana Jones and the Dial of Destiny,(2023),"Action, Adventure",UA,6.9,68799,Archaeologist Indiana Jones races against time...,,James Mangold,,Rain
1,1,The Flash,(2023),"Action, Adventure, Fantasy",UA,7.1,87832,Barry Allen uses his super speed to change the...,,Andy Muschietti,,Rain
2,2,John Wick: Chapter 4,(2023),"Action, Crime, Thriller",A,7.9,239926,John Wick uncovers a path to defeating The Hig...,,Chad Stahelski,,Rain
3,3,Transformers: Rise of the Beasts,(2023),"Action, Adventure, Sci-Fi",UA,6.3,43176,"During the '90s, a new faction of Transformers...",,Steven Caple Jr.,,Rain
4,4,The Shawshank Redemption,(1994),Drama,A,9.3,2765336,"Over the course of several years, two convicts...",,Frank Darabont,$28.34M,Rain


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32326 entries, 0 to 32325
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   32326 non-null  int64  
 1   Title        32326 non-null  object 
 2   Year         31644 non-null  object 
 3   Genre        32209 non-null  object 
 4   Certificate  11954 non-null  object 
 5   Rating       29866 non-null  float64
 6   Votes        29867 non-null  object 
 7   Description  32326 non-null  object 
 8   Lead         0 non-null      float64
 9   Director     32062 non-null  object 
 10  Gross        10224 non-null  object 
 11  Weather      32326 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 3.0+ MB


In [5]:
df.shape

(32326, 12)

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

Unnamed: 0         0
Title              0
Year             682
Genre            117
Certificate    20372
Rating          2460
Votes           2459
Description        0
Lead           32326
Director         264
Gross          22102
Weather            0
dtype: int64

In [8]:
df.dtypes.value_counts()

object     9
float64    2
int64      1
dtype: int64

In [9]:
# lead column are null and in certificate column maximum item are missing so we dropped both
df.drop(['Lead', 'Certificate'], axis = 1, inplace = True)

In [10]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [11]:
df.head()

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
0,Indiana Jones and the Dial of Destiny,(2023),"Action, Adventure",6.9,68799,Archaeologist Indiana Jones races against time...,James Mangold,,Rain
1,The Flash,(2023),"Action, Adventure, Fantasy",7.1,87832,Barry Allen uses his super speed to change the...,Andy Muschietti,,Rain
2,John Wick: Chapter 4,(2023),"Action, Crime, Thriller",7.9,239926,John Wick uncovers a path to defeating The Hig...,Chad Stahelski,,Rain
3,Transformers: Rise of the Beasts,(2023),"Action, Adventure, Sci-Fi",6.3,43176,"During the '90s, a new faction of Transformers...",Steven Caple Jr.,,Rain
4,The Shawshank Redemption,(1994),Drama,9.3,2765336,"Over the course of several years, two convicts...",Frank Darabont,$28.34M,Rain


In [12]:
# genre, rating, votes are importent for us so we drop only missing item 
df.dropna(subset=["Genre", "Rating", "Votes"], inplace=True)

In [13]:
df.isnull().sum()

Title              0
Year               2
Genre              0
Rating             0
Votes              0
Description        0
Director           4
Gross          19591
Weather            0
dtype: int64

In [14]:
df.shape

(29814, 9)

In [15]:
# drop missing items
df.dropna(subset=["Director", "Year"], inplace=True)

In [16]:
df["Gross"].fillna('0', inplace=True)
df["Gross"].head()

0          0
1          0
2          0
3          0
4    $28.34M
Name: Gross, dtype: object

In [17]:
df.isnull().sum()

Title          0
Year           0
Genre          0
Rating         0
Votes          0
Description    0
Director       0
Gross          0
Weather        0
dtype: int64

In [18]:
df.shape

(29808, 9)

In [19]:
df['Weather'].shape

(29808,)

In [20]:
df.head()

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
0,Indiana Jones and the Dial of Destiny,(2023),"Action, Adventure",6.9,68799,Archaeologist Indiana Jones races against time...,James Mangold,0,Rain
1,The Flash,(2023),"Action, Adventure, Fantasy",7.1,87832,Barry Allen uses his super speed to change the...,Andy Muschietti,0,Rain
2,John Wick: Chapter 4,(2023),"Action, Crime, Thriller",7.9,239926,John Wick uncovers a path to defeating The Hig...,Chad Stahelski,0,Rain
3,Transformers: Rise of the Beasts,(2023),"Action, Adventure, Sci-Fi",6.3,43176,"During the '90s, a new faction of Transformers...",Steven Caple Jr.,0,Rain
4,The Shawshank Redemption,(1994),Drama,9.3,2765336,"Over the course of several years, two convicts...",Frank Darabont,$28.34M,Rain


In [21]:
# edit the year column, remove parantheses and extract only value 
def extract_year(year_str):
    match = re.search(r"\b(\d{4})\b", year_str)
    if match:
        year = int(match.group(1))
        return pd.to_datetime(str(year), format="%Y").date().year
    else:
        return None

In [22]:
df["Year"] = df["Year"].apply(lambda x: extract_year(str(x).strip("()")) if pd.notnull(x) else None)

In [23]:
df.dtypes

Title           object
Year             int64
Genre           object
Rating         float64
Votes           object
Description     object
Director        object
Gross           object
Weather         object
dtype: object

In [24]:
df.head(3)

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
0,Indiana Jones and the Dial of Destiny,2023,"Action, Adventure",6.9,68799,Archaeologist Indiana Jones races against time...,James Mangold,0,Rain
1,The Flash,2023,"Action, Adventure, Fantasy",7.1,87832,Barry Allen uses his super speed to change the...,Andy Muschietti,0,Rain
2,John Wick: Chapter 4,2023,"Action, Crime, Thriller",7.9,239926,John Wick uncovers a path to defeating The Hig...,Chad Stahelski,0,Rain


In [25]:
# change the data type of votes from object to int
df['Votes'] = df['Votes'].astype(int)

In [26]:
df.dtypes

Title           object
Year             int64
Genre           object
Rating         float64
Votes            int32
Description     object
Director        object
Gross           object
Weather         object
dtype: object

In [27]:
df.sample(5)

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
5850,Happî awâ,2015,Drama,7.6,2679,A slow-burning epic chronicling the emotional ...,Ryûsuke Hamaguchi,0,Clouds
22396,Epic Movie,2007,"Adventure, Comedy, Fantasy",2.4,107909,A spoof on previous years' epic movies (The Da...,Jason Friedberg,$39.74M,Snow
8580,"Il cavaliere, la morte e il diavolo",1983,"Horror, Mystery",6.3,24,A family of the three fall into a dark and dis...,Beppe Cino,0,Clouds
22008,Om Abbes,1970,"Crime, Drama",8.2,9,"Abbes, the best rider of the village, has fall...",Ali Abdelwahab,0,Thunderstorm
30665,Attila,1954,"Biography, Drama, History",5.5,762,"Attila, the leader of the barbarian Huns and c...",Pietro Francisci,0,Mist


In [28]:
# remove the $ and unit value from gross and change type to float
df['Gross'] = df['Gross'].str.replace('$', '')
df['Gross'] = df['Gross'].str.replace('M', '')

  df['Gross'] = df['Gross'].str.replace('$', '')


In [29]:
df.sample(5)

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
31949,Run Sweetheart Run,2022,"Horror, Thriller",5.5,6246,"After what seems to be an innocent date, Cheri...",Shana Feste,0.0,Fog
17627,La ciudad y los perros,1985,Drama,7.4,684,Four angry cadets form an inner circle to beat...,Francisco J. Lombardi,0.0,Thunderstorm
31180,Ask the Dust,2006,"Drama, Romance",5.7,9920,Mexican beauty Camilla Lopez (Salma Hayek) hop...,Robert Towne,0.74,Fog
115,Bohemian Rhapsody,2018,"Biography, Drama, Music",7.9,559824,The story of the legendary British rock band Q...,Bryan Singer,216.43,Rain
6182,Nero,1992,"Comedy, Crime",6.0,152,"Right after she moves in with him, Frederico's...",Giancarlo Soldi,0.0,Clouds


In [30]:
df['Gross'] = df['Gross'].astype(float) * 100000

In [31]:
df.dtypes

Title           object
Year             int64
Genre           object
Rating         float64
Votes            int32
Description     object
Director        object
Gross          float64
Weather         object
dtype: object

In [32]:
df.sample(5)

Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather
13047,Straight Outta Compton,2015,"Biography, Drama, History",7.8,210280,The rap group NWA emerges from the mean street...,F. Gary Gray,16120000.0,Thunderstorm
10136,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,5.6,58794,Freddy Krueger returns once again to terrorize...,Renny Harlin,4937000.0,Thunderstorm
2101,Cosi,1996,"Comedy, Drama, Music",6.6,2170,Lewis is a young Sydney amateur theater direct...,Mark Joffe,0.0,Rain
23302,Eddie and the Cruisers II: Eddie Lives!,1989,"Drama, Music, Romance",6.2,3143,A Montreal construction worker joins a band an...,Jean-Claude Lord,54000.0,Snow
14164,Slaughterhouse-Five,1972,"Comedy, Drama, Sci-Fi",6.8,13536,Billy Pilgrim has mysteriously become unstuck ...,George Roy Hill,57000.0,Thunderstorm


In [33]:
# Based on 8 different weather information in the json file in the incoming Weather API
# keywords of the created seasons

spring_keywords = ["Rain", "Drizzle", "Clouds", "Fog"]
summer_keywords = ["Clear", "Thunderstorm", "Clouds"]
autumn_keywords = ["Rain", "Mist", "Drizzle", "Fog", "Clouds"]
winter_keywords = ["Snow", "Mist", "Fog", "Clouds"]

In [58]:


def get_season(dataframe, weather_col, season_col, season_list, spring_key, summer_key, autumn_key, winter_key):
    
    ''' dataframe : related dataframe,
        weather_col : weather column,
        season_col : assigned season column,
        season_list : desired season,
        spring_key : spring keyword,
        summer_key : summer keyword,
        autumn_key : autumn keyword,
        winter_key : winter keyword
    '''
    
    conditions = [
        dataframe[weather_col].isin(spring_key),
        dataframe[weather_col].isin(summer_key),
        dataframe[weather_col].isin(autumn_key),
        dataframe[weather_col].isin(winter_key)
    ]
    
    choices = [season_list[0:1], season_list[1:2], season_list[2:3], season_list[3:4]]
    
    dataframe[season_col] = np.select(conditions, choices, default='')
    
    return dataframe

get_season(df, "Weather", "Season", ["Spring", "Summer", "Autumn", "Winter"],
            spring_keywords, summer_keywords, autumn_keywords, winter_keywords)


# Make sure to replace df, "Weather", "Season", spring_keywords, summer_keywords, autumn_keywords, and winter_keywords with actual values when calling the function.


Unnamed: 0,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather,Season
0,Indiana Jones and the Dial of Destiny,2023,"Action, Adventure",6.9,68799,Archaeologist Indiana Jones races against time...,James Mangold,0.0,Rain,Spring
1,The Flash,2023,"Action, Adventure, Fantasy",7.1,87832,Barry Allen uses his super speed to change the...,Andy Muschietti,0.0,Rain,Spring
2,John Wick: Chapter 4,2023,"Action, Crime, Thriller",7.9,239926,John Wick uncovers a path to defeating The Hig...,Chad Stahelski,0.0,Rain,Spring
3,Transformers: Rise of the Beasts,2023,"Action, Adventure, Sci-Fi",6.3,43176,"During the '90s, a new faction of Transformers...",Steven Caple Jr.,0.0,Rain,Spring
4,The Shawshank Redemption,1994,Drama,9.3,2765336,"Over the course of several years, two convicts...",Frank Darabont,2834000.0,Rain,Spring
...,...,...,...,...,...,...,...,...,...,...
32301,Akela the Alone,2014,Thriller,6.6,11,"A loner, whose parents died years ago, lives a...",Abrar,0.0,Fog,Spring
32303,Gu dao jing hun 2,2013,"Drama, Fantasy, Horror",2.3,26,"A woman who is having a stay in Bangkok, whose...",Kai-Cheung Chung,0.0,Fog,Spring
32315,"Chelovek, kotoryy molchal",2004,Thriller,5.8,34,Add a Plot,Pavel Ruminov,0.0,Fog,Spring
32322,Return of the Curse,2006,Horror,1.7,20,A dark independent rooted in the paranormal. T...,Chris Penney,0.0,Fog,Spring


In [41]:
df['Weather'].value_counts()

Thunderstorm    11004
Snow             7645
Rain             4301
Clouds           3446
Fog              1231
Clear            1110
Mist              706
Drizzle           365
Name: Weather, dtype: int64

In [60]:
df['Season'].value_counts()

Summer    12114
Spring     9343
Winter     7645
Autumn      706
Name: Season, dtype: int64

In [61]:
# we need to best movie whose rating high
df_movie = df[df['Rating'] >7.5]

In [62]:
df_movie.shape

(3266, 10)

In [63]:
df_movie[["Season", "Weather"]].value_counts()

Season  Weather     
Summer  Thunderstorm    912
Winter  Snow            852
Spring  Rain            570
        Clouds          419
Summer  Clear           208
Spring  Fog             148
Autumn  Mist             79
Spring  Drizzle          78
dtype: int64

In [64]:
unnecessary_categories = ("Documentary", "Short", "Animation", "Reality-TV",
                          "Game-Show", "Game", "Music", "Talk-Show")


movie_df_ = df_movie[~(df_movie["Genre"].str.startswith(unnecessary_categories))]

movie_df_.shape

(2623, 10)

In [65]:
movie_df_.dtypes

Title           object
Year             int64
Genre           object
Rating         float64
Votes            int32
Description     object
Director        object
Gross          float64
Weather         object
Season          object
dtype: object

In [66]:
# we keep only movie which is getting 5000 and above votes
movie_df_votes = movie_df_[movie_df_['Votes'] > 5000]

In [67]:
movie_df_votes.shape

(1873, 10)

In [68]:
movie_df_votes.Weather.value_counts()

Thunderstorm    533
Snow            482
Rain            396
Clouds          183
Fog             110
Clear           101
Mist             50
Drizzle          18
Name: Weather, dtype: int64

In [69]:
movie_df_votes = movie_df_votes.sort_values(by="Rating", ascending=False)

In [70]:
movie_df_votes = movie_df_votes.reset_index()

In [54]:
movie_df_votes.to_csv('imdb_movies.csv')

In [71]:
movie_df_votes.head()

Unnamed: 0,index,Title,Year,Genre,Rating,Votes,Description,Director,Gross,Weather,Season
0,4,The Shawshank Redemption,1994,Drama,9.3,2765336,"Over the course of several years, two convicts...",Frank Darabont,2834000.0,Rain,Spring
1,12560,The Shawshank Redemption,1994,Drama,9.3,2765336,"Over the course of several years, two convicts...",Frank Darabont,2834000.0,Thunderstorm,Summer
2,26829,The Godfather,1972,"Crime, Drama",9.2,1924588,"Don Vito Corleone, head of a mafia family, dec...",Francis Ford Coppola,13497000.0,Snow,Winter
3,4022,The Godfather,1972,"Crime, Drama",9.2,1924556,"Don Vito Corleone, head of a mafia family, dec...",Francis Ford Coppola,13497000.0,Rain,Spring
4,12561,The Godfather,1972,"Crime, Drama",9.2,1924556,"Don Vito Corleone, head of a mafia family, dec...",Francis Ford Coppola,13497000.0,Thunderstorm,Summer


In [72]:
movie_df_votes.shape

(1873, 11)